14465 Old usr/src/uts/common/os/vm

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2021 Oxide Computer Company
  24  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  25  */
  26 
  27 /*
  28  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29  * Use is subject to license terms.
  30  */
  31 
  32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  33 /* All Rights Reserved */
  34 
  35 /*
  36  * University Copyright- Copyright (c) 1982, 1986, 1988
  37  * The Regents of the University of California
  38  * All Rights Reserved
  39  *
  40  * University Acknowledgment- Portions of this document are derived from
  41  * software developed by the University of California, Berkeley, and its
  42  * contributors.
  43  */
  44 
  45 #include <sys/types.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/buf.h>
  49 #include <sys/uio.h>
  50 #include <sys/proc.h>
  51 #include <sys/systm.h>
  52 #include <sys/mman.h>
  53 #include <sys/cred.h>
  54 #include <sys/vnode.h>
  55 #include <sys/vm.h>
  56 #include <sys/vmparam.h>
  57 #include <sys/vtrace.h>
  58 #include <sys/cmn_err.h>
  59 #include <sys/cpuvar.h>
  60 #include <sys/user.h>
  61 #include <sys/kmem.h>
  62 #include <sys/debug.h>
  63 #include <sys/callb.h>
  64 #include <sys/tnf_probe.h>
  65 #include <sys/mem_cage.h>
  66 #include <sys/time.h>
  67 #include <sys/stdbool.h>
  68 
  69 #include <vm/hat.h>
  70 #include <vm/as.h>
  71 #include <vm/seg.h>
  72 #include <vm/page.h>
  73 #include <vm/pvn.h>
  74 #include <vm/seg_kmem.h>
  75 
  76 /*
  77  * FREE MEMORY MANAGEMENT
  78  *
  79  * Management of the pool of free pages is a tricky business.  There are
  80  * several critical threshold values which constrain our allocation of new
  81  * pages and inform the rate of paging out of memory to swap.  These threshold
  82  * values, and the behaviour they induce, are described below in descending
  83  * order of size -- and thus increasing order of severity!
  84  *
  85  *   +---------------------------------------------------- physmem (all memory)
  86  *   |
  87  *   | Ordinarily there are no particular constraints placed on page
  88  *   v allocation.  The page scanner is not running and page_create_va()
  89  *   | will effectively grant all page requests (whether from the kernel
  90  *   | or from user processes) without artificial delay.
  91  *   |
  92  *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
  93  *   |
  94  *   | When we have less than "lotsfree" pages, pageout_scanner() is
  95  *   v signalled by schedpaging() to begin looking for pages that can
  96  *   | be evicted to disk to bring us back above lotsfree.  At this
  97  *   | stage there is still no constraint on allocation of free pages.
  98  *   |
  99  *   | For small systems, we set a lower bound of 16MB for lotsfree;
 100  *   v this is the natural value for a system with 1GB memory.  This is
 101  *   | to ensure that the pageout reserve pool contains at least 4MB
 102  *   | for use by ZFS.
 103  *   |
 104  *   | For systems with a large amount of memory, we constrain lotsfree
 105  *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 106  *   v at some point the required slack relates more closely to the
 107  *   | rate at which paging can occur than to the total amount of memory.
 108  *   |
 109  *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 110  *   |
 111  *   | When we drop below desfree, a number of kernel facilities will
 112  *   v wait before allocating more memory, under the assumption that
 113  *   | pageout or reaping will make progress and free up some memory.
 114  *   | This behaviour is not especially coordinated; look for comparisons
 115  *   | of desfree and freemem.
 116  *   |
 117  *   | In addition to various attempts at advisory caution, clock()
 118  *   | will wake up the thread that is ordinarily parked in sched().
 119  *   | This routine is responsible for the heavy-handed swapping out
 120  *   v of entire processes in an attempt to arrest the slide of free
 121  *   | memory.  See comments in sched.c for more details.
 122  *   |
 123  *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 124  *   |
 125  *   | These two separate tunables have, by default, the same value.
 126  *   v Various parts of the kernel use minfree to signal the need for
 127  *   | more aggressive reclamation of memory, and sched() is more
 128  *   | aggressive at swapping processes out.
 129  *   |
 130  *   | If free memory falls below throttlefree, page_create_va() will
 131  *   | use page_create_throttle() to begin holding most requests for
 132  *   | new pages while pageout and reaping free up memory.  Sleeping
 133  *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 134  *   | more memory.  Non-sleeping allocations are generally allowed to
 135  *   | proceed, unless their priority is explicitly lowered with
 136  *   | KM_NORMALPRI.
 137  *   |
 138  *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 139  *   |
 140  *   | When we hit throttlefree, the situation is already dire.  The
 141  *   v system is generally paging out memory and swapping out entire
 142  *   | processes in order to free up memory for continued operation.
 143  *   |
 144  *   | Unfortunately, evicting memory to disk generally requires short
 145  *   | term use of additional memory; e.g., allocation of buffers for
 146  *   | storage drivers, updating maps of free and used blocks, etc.
 147  *   | As such, pageout_reserve is the number of pages that we keep in
 148  *   | special reserve for use by pageout() and sched() and by any
 149  *   v other parts of the kernel that need to be working for those to
 150  *   | make forward progress such as the ZFS I/O pipeline.
 151  *   |
 152  *   | When we are below pageout_reserve, we fail or hold any allocation
 153  *   | that has not explicitly requested access to the reserve pool.
 154  *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 155  *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 156  *   | can implicitly tap the reserve.  For more details, see the
 157  *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 158  *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 159  *   |
 160  *   +---------------------------------------------------------- no free memory
 161  *   |
 162  *   | If we have arrived here, things are very bad indeed.  It is
 163  *   v surprisingly difficult to tell if this condition is even fatal,
 164  *   | as enough memory may have been granted to pageout() and to the
 165  *   | ZFS I/O pipeline that requests for eviction that have already been
 166  *   | made will complete and free up memory some time soon.
 167  *   |
 168  *   | If free memory does not materialise, the system generally remains
 169  *   | deadlocked.  The pageout_deadman() below is run once per second
 170  *   | from clock(), seeking to limit the amount of time a single request
 171  *   v to page out can be blocked before the system panics to get a crash
 172  *   | dump and return to service.
 173  *   |
 174  *   +-------------------------------------------------------------------------
 175  */
 176 
 177 /*
 178  * The following parameters control operation of the page replacement
 179  * algorithm.  They are initialized to 0, and then computed at boot time based
 180  * on the size of the system; see setupclock().  If they are patched non-zero
 181  * in a loaded vmunix they are left alone and may thus be changed per system
 182  * using "mdb -kw" on the loaded system.
 183  */
  184 pgcnt_t         slowscan = 0;   /* scan rate just below lotsfree */
  185 pgcnt_t         fastscan = 0;   /* scan rate when out of free memory */
  186 
  187 static pgcnt_t  handspreadpages = 0;    /* pages between the two hands */
 188 
 189 /*
 190  * looppages:
 191  *     Cached copy of the total number of pages in the system (total_pages).
 192  *
 193  * loopfraction:
 194  *     Divisor used to relate fastscan to looppages in setupclock().
 195  */
  196 static uint_t   loopfraction = 2;
  197 static pgcnt_t  looppages;      /* refreshed by every setupclock() call */
  198 
  199 static uint_t   min_percent_cpu = 4;    /* basis for min_pageout_nsec */
  200 static uint_t   max_percent_cpu = 80;   /* basis for max_pageout_nsec */
  201 static pgcnt_t  maxfastscan = 0;        /* cap on the default fastscan */
  202 static pgcnt_t  maxslowscan = 100;      /* cap on the default slowscan */
  203 
  204 #define         MEGABYTES               (1024ULL * 1024ULL)     /* bytes */
 205 
  206 /*
  207  * pageout_threshold_style:
  208  *     set to 1 to use the previous default threshold size calculation;
  209  *     i.e., each threshold is half of the next largest value.
       *
       *     setupclock() tests only for the value 1.  With any other value the
       *     default minfree is 3/4 of desfree and the default pageout_reserve
       *     is 3/4 of throttlefree.  In both styles desfree defaults to half of
       *     lotsfree and throttlefree defaults to minfree.
  210  */
  211 uint_t          pageout_threshold_style = 0;
 212 
  213 /*
  214  * The operator may override these tunables to request a different minimum or
  215  * maximum lotsfree value, or to change the divisor we use for automatic
  216  * sizing.
  217  *
  218  * By default, we make lotsfree 1/64th of the total memory in the machine.  A
  219  * zero minimum or maximum means the default values (below) are used; those
  220  * defaults are written in bytes and converted to pages with btop().
       *
       * NOTE(review): setupclock() uses a non-zero lotsfree_min or lotsfree_max
       * directly as a page count, with no btop() conversion, so overrides must be
       * given in pages rather than bytes.  An override is ignored in favour of
       * the default unless lotsfree_max is smaller than the total page count and
       * lotsfree_min is smaller than the resulting lotsfree_max.
  221  */
  222 uint_t          lotsfree_fraction = 64;
  223 pgcnt_t         lotsfree_min = 0;
  224 pgcnt_t         lotsfree_max = 0;
  225 
  226 #define         LOTSFREE_MIN_DEFAULT    (16 * MEGABYTES)
  227 #define         LOTSFREE_MAX_DEFAULT    (2048 * MEGABYTES)
 228 
  229 /*
  230  * If these tunables are set to non-zero values in /etc/system, and provided
  231  * the value is smaller than the ceiling that setupclock() applies to it, the
  232  * specified value will be used directly without any additional calculation
  233  * or adjustment.  The boot time value of these overrides is preserved in the
  234  * "clockinit" struct.  More detail is available at the top of the file.
       *
       * The ceilings used by setupclock() are: the total page count for lotsfree,
       * lotsfree for desfree, desfree for both minfree and throttlefree, and
       * throttlefree for pageout_reserve.  A value equal to or above its ceiling
       * is replaced by the computed default.  A non-zero maxpgio is always used
       * as given.
  235  */
  236 pgcnt_t         maxpgio = 0;
  237 pgcnt_t         minfree = 0;
  238 pgcnt_t         desfree = 0;
  239 pgcnt_t         lotsfree = 0;
  240 pgcnt_t         needfree = 0;   /* not saved in clockinit; see schedpaging() */
  241 pgcnt_t         throttlefree = 0;
  242 pgcnt_t         pageout_reserve = 0;
  243 
  244 pgcnt_t         deficit;        /* subtracted from freemem by schedpaging() */
  245 pgcnt_t         nscan;          /* pages examined in the current pass */
  246 pgcnt_t         desscan;        /* pages to examine in the next pass */
 247 
 248 /*
 249  * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 250  * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 251  * underlying %CPU duty cycle.
 252  *
 253  * min_pageout_nsec:
 254  *     nanoseconds/wakeup equivalent of min_percent_cpu.
 255  *
 256  * max_pageout_nsec:
 257  *     nanoseconds/wakeup equivalent of max_percent_cpu.
 258  *
 259  * pageout_nsec:
 260  *     Number of nanoseconds budgeted for each wakeup cycle.
 261  *     Computed each time around by schedpaging().
 262  *     Varies between min_pageout_nsec and max_pageout_nsec,
 263  *     depending on memory pressure.
 264  */
  265 static hrtime_t min_pageout_nsec;
  266 static hrtime_t max_pageout_nsec;
  267 static hrtime_t pageout_nsec;   /* recomputed by schedpaging() */
  268 
  269 static uint_t   reset_hands;    /* set to 1 when setupclock() recalculates */
  270 
  271 #define PAGES_POLL_MASK 1023    /* NOTE(review): user not visible in this section */
 272 
 273 /*
 274  * pageout_sample_lim:
 275  *     The limit on the number of samples needed to establish a value for new
 276  *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 277  *     handspreadpages.
 278  *
 279  * pageout_sample_cnt:
 280  *     Current sample number.  Once the sample gets large enough, set new
 281  *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 282  *
 283  * pageout_sample_pages:
 284  *     The accumulated number of pages scanned during sampling.
 285  *
 286  * pageout_sample_etime:
 287  *     The accumulated nanoseconds for the sample.
 288  *
 289  * pageout_rate:
 290  *     Rate in pages/nanosecond, computed at the end of sampling.
 291  *
 292  * pageout_new_spread:
 293  *     Initially zero while the system scan rate is measured by
 294  *     pageout_scanner(), which then sets this value once per system boot after
 295  *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 296  *     new value is used for fastscan and handspreadpages.
 297  *
 298  * sample_start, sample_end:
 299  *     The hrtime at which the last pageout_scanner() sample began and ended.
 300  */
  301 typedef hrtime_t hrrate_t;      /* type of pageout_rate; same width as hrtime_t */
  302 
  303 static uint64_t pageout_sample_lim = 4;
  304 static uint64_t pageout_sample_cnt = 0;
  305 static pgcnt_t  pageout_sample_pages = 0;
  306 static hrrate_t pageout_rate = 0;
  307 static pgcnt_t  pageout_new_spread = 0; /* non-zero once sampling is done */
  308 
  309 static hrtime_t pageout_cycle_nsec;     /* NOTE(review): set outside this section */
  310 static hrtime_t sample_start, sample_end;
  311 static hrtime_t pageout_sample_etime = 0;
 312 
 313 /*
 314  * Record number of times a pageout_scanner() wakeup cycle finished because it
 315  * timed out (exceeded its CPU budget), rather than because it visited
 316  * its budgeted number of pages.
 317  */
  318 uint64_t        pageout_timeouts = 0;
  319 
  320 #ifdef VM_STATS
      /* Counters compiled in only when VM_STATS is defined. */
  321 static struct pageoutvmstats_str {
  322         ulong_t checkpage[3];   /* presumably one per checkpage_result_t; confirm */
  323 } pageoutvmstats;
  324 #endif /* VM_STATS */
  325 
  326 /*
  327  * Threads waiting for free memory use this condition variable and lock until
  328  * memory becomes available.
  329  */
  330 kmutex_t        memavail_lock;
  331 kcondvar_t      memavail_cv;
 332 
  333 typedef enum pageout_hand {
      /* Identifies which of the two clock hands is examining a page. */
  334         POH_FRONT = 1,  /* front hand */
  335         POH_BACK,       /* back hand; trails the front by handspreadpages */
  336 } pageout_hand_t;
  337 
  338 typedef enum {
      /*
       * Result of checkpage().  NOTE(review): the meanings below are inferred
       * from the names only; confirm against the checkpage() implementation.
       */
  339         CKP_INELIGIBLE, /* presumably: page was not a candidate */
  340         CKP_NOT_FREED,  /* presumably: page examined but not freed */
  341         CKP_FREED,      /* presumably: page was freed */
  342 } checkpage_result_t;
  343 
  344 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
 345 
  346 static struct clockinit {
      /*
       * Snapshot of the operator-provided tunables, captured by the first call
       * to setupclock() and consulted on every call (including the first) in
       * place of the live globals.  A zero member means "use the computed
       * default" for the corresponding tunable.
       */
  347         bool ci_init;                   /* true once the snapshot is taken */
  348         pgcnt_t ci_lotsfree_min;
  349         pgcnt_t ci_lotsfree_max;
  350         pgcnt_t ci_lotsfree;
  351         pgcnt_t ci_desfree;
  352         pgcnt_t ci_minfree;
  353         pgcnt_t ci_throttlefree;
  354         pgcnt_t ci_pageout_reserve;
  355         pgcnt_t ci_maxpgio;
  356         pgcnt_t ci_maxfastscan;
  357         pgcnt_t ci_fastscan;
  358         pgcnt_t ci_slowscan;
  359         pgcnt_t ci_handspreadpages;
  360 } clockinit = { .ci_init = false };
 361 
 362 static pgcnt_t
 363 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
 364 {
 365         if (value < minimum) {
 366                 return (minimum);
 367         } else if (value > maximum) {
 368                 return (maximum);
 369         } else {
 370                 return (value);
 371         }
 372 }
 373 
 374 static pgcnt_t
 375 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
 376 {
 377         if (initval == 0 || initval >= initval_ceiling) {
 378                 return (defval);
 379         } else {
 380                 return (initval);
 381         }
 382 }
 383 
 384 /*
 385  * Set up the paging constants for the clock algorithm used by
 386  * pageout_scanner(), and by the virtual memory system overall.  See the
 387  * comments at the top of this file for more information about the threshold
 388  * values and system responses to memory pressure.
 389  *
 390  * This routine is called once by main() at startup, after the initial size of
 391  * physical memory is determined.  It may be called again later if memory is
 392  * added to or removed from the system, or if new measurements of the page scan
 393  * rate become available.
 394  */
 395 void
 396 setupclock(void)
 397 {
 398         pgcnt_t defval;
 399         bool half = (pageout_threshold_style == 1);
 400         bool recalc = true;
 401 
 402         looppages = total_pages;
 403 
 404         /*
 405          * The operator may have provided specific values for some of the
 406          * tunables via /etc/system.  On our first call, we preserve those
 407          * values so that they can be used for subsequent recalculations.
 408          *
 409          * A value of zero for any tunable means we will use the default
 410          * sizing.
 411          */
 412         if (!clockinit.ci_init) {
 413                 clockinit.ci_init = true;
 414 
 415                 clockinit.ci_lotsfree_min = lotsfree_min;
 416                 clockinit.ci_lotsfree_max = lotsfree_max;
 417                 clockinit.ci_lotsfree = lotsfree;
 418                 clockinit.ci_desfree = desfree;
 419                 clockinit.ci_minfree = minfree;
 420                 clockinit.ci_throttlefree = throttlefree;
 421                 clockinit.ci_pageout_reserve = pageout_reserve;
 422                 clockinit.ci_maxpgio = maxpgio;
 423                 clockinit.ci_maxfastscan = maxfastscan;
 424                 clockinit.ci_fastscan = fastscan;
 425                 clockinit.ci_slowscan = slowscan;
 426                 clockinit.ci_handspreadpages = handspreadpages;
 427 
 428                 /*
 429                  * The first call does not trigger a recalculation, only
 430                  * subsequent calls.
 431                  */
 432                 recalc = false;
 433         }
 434 
 435         /*
 436          * Configure paging threshold values.  For more details on what each
 437          * threshold signifies, see the comments at the top of this file.
 438          */
 439         lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
 440             btop(LOTSFREE_MAX_DEFAULT));
 441         lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
 442             btop(LOTSFREE_MIN_DEFAULT));
 443 
 444         lotsfree = tune(clockinit.ci_lotsfree, looppages,
 445             clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
 446 
 447         desfree = tune(clockinit.ci_desfree, lotsfree,
 448             lotsfree / 2);
 449 
 450         minfree = tune(clockinit.ci_minfree, desfree,
 451             half ? desfree / 2 : 3 * desfree / 4);
 452 
 453         throttlefree = tune(clockinit.ci_throttlefree, desfree,
 454             minfree);
 455 
 456         pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
 457             half ? throttlefree / 2 : 3 * throttlefree / 4);
 458 
 459         /*
 460          * Maxpgio thresholds how much paging is acceptable.
 461          * This figures that 2/3 busy on an arm is all that is
 462          * tolerable for paging.  We assume one operation per disk rev.
 463          *
 464          * XXX - Does not account for multiple swap devices.
 465          */
 466         if (clockinit.ci_maxpgio == 0) {
 467                 maxpgio = (DISKRPM * 2) / 3;
 468         } else {
 469                 maxpgio = clockinit.ci_maxpgio;
 470         }
 471 
 472         /*
 473          * The clock scan rate varies between fastscan and slowscan
 474          * based on the amount of free memory available.  Fastscan
 475          * rate should be set based on the number pages that can be
 476          * scanned per sec using ~10% of processor time.  Since this
 477          * value depends on the processor, MMU, Mhz etc., it is
 478          * difficult to determine it in a generic manner for all
 479          * architectures.
 480          *
 481          * Instead of trying to determine the number of pages scanned
 482          * per sec for every processor, fastscan is set to be the smaller
 483          * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
 484          * time is limited to ~4% of processor time.
 485          *
 486          * Setting fastscan to be 1/2 of memory allows pageout to scan
 487          * all of memory in ~2 secs.  This implies that user pages not
 488          * accessed within 1 sec (assuming, handspreadpages == fastscan)
 489          * can be reclaimed when free memory is very low.  Stealing pages
 490          * not accessed within 1 sec seems reasonable and ensures that
 491          * active user processes don't thrash.
 492          *
 493          * Smaller values of fastscan result in scanning fewer pages
 494          * every second and consequently pageout may not be able to free
 495          * sufficient memory to maintain the minimum threshold.  Larger
 496          * values of fastscan result in scanning a lot more pages which
 497          * could lead to thrashing and higher CPU usage.
 498          *
 499          * Fastscan needs to be limited to a maximum value and should not
 500          * scale with memory to prevent pageout from consuming too much
 501          * time for scanning on slow CPU's and avoid thrashing, as a
 502          * result of scanning too many pages, on faster CPU's.
 503          * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
 504          * (the upper bound for fastscan) based on the average number
 505          * of pages that can potentially be scanned in ~1 sec (using ~4%
 506          * of the CPU) on some of the following machines that currently
 507          * run Solaris 2.x:
 508          *
 509          *                      average memory scanned in ~1 sec
 510          *
 511          *      25 Mhz SS1+:            23 Meg
 512          *      LX:                     37 Meg
 513          *      50 Mhz SC2000:          68 Meg
 514          *
 515          *      40 Mhz 486:             26 Meg
 516          *      66 Mhz 486:             42 Meg
 517          *
 518          * When free memory falls just below lotsfree, the scan rate
 519          * goes from 0 to slowscan (i.e., pageout starts running).  This
 520          * transition needs to be smooth and is achieved by ensuring that
 521          * pageout scans a small number of pages to satisfy the transient
 522          * memory demand.  This is set to not exceed 100 pages/sec (25 per
 523          * wakeup) since scanning that many pages has no noticible impact
 524          * on system performance.
 525          *
 526          * In addition to setting fastscan and slowscan, pageout is
 527          * limited to using ~4% of the CPU.  This results in increasing
 528          * the time taken to scan all of memory, which in turn means that
 529          * user processes have a better opportunity of preventing their
 530          * pages from being stolen.  This has a positive effect on
 531          * interactive and overall system performance when memory demand
 532          * is high.
 533          *
 534          * Thus, the rate at which pages are scanned for replacement will
 535          * vary linearly between slowscan and the number of pages that
 536          * can be scanned using ~4% of processor time instead of varying
 537          * linearly between slowscan and fastscan.
 538          *
 539          * Also, the processor time used by pageout will vary from ~1%
 540          * at slowscan to ~4% at fastscan instead of varying between
 541          * ~1% at slowscan and ~10% at fastscan.
 542          *
 543          * The values chosen for the various VM parameters (fastscan,
 544          * handspreadpages, etc) are not universally true for all machines,
 545          * but appear to be a good rule of thumb for the machines we've
 546          * tested.  They have the following ranges:
 547          *
 548          *      cpu speed:      20 to 70 Mhz
 549          *      page size:      4K to 8K
 550          *      memory size:    16M to 5G
 551          *      page scan rate: 4000 - 17400 4K pages per sec
 552          *
 553          * The values need to be re-examined for machines which don't
 554          * fall into the various ranges (e.g., slower or faster CPUs,
 555          * smaller or larger pagesizes etc) shown above.
 556          *
 557          * On an MP machine, pageout is often unable to maintain the
 558          * minimum paging thresholds under heavy load.  This is due to
 559          * the fact that user processes running on other CPU's can be
 560          * dirtying memory at a much faster pace than pageout can find
 561          * pages to free.  The memory demands could be met by enabling
 562          * more than one CPU to run the clock algorithm in such a manner
 563          * that the various clock hands don't overlap.  This also makes
 564          * it more difficult to determine the values for fastscan, slowscan
 565          * and handspreadpages.
 566          *
 567          * The swapper is currently used to free up memory when pageout
 568          * is unable to meet memory demands by swapping out processes.
 569          * In addition to freeing up memory, swapping also reduces the
 570          * demand for memory by preventing user processes from running
 571          * and thereby consuming memory.
 572          */
 573         if (clockinit.ci_maxfastscan == 0) {
 574                 if (pageout_new_spread != 0) {
 575                         maxfastscan = pageout_new_spread;
 576                 } else {
 577                         maxfastscan = MAXHANDSPREADPAGES;
 578                 }
 579         } else {
 580                 maxfastscan = clockinit.ci_maxfastscan;
 581         }
 582 
 583         if (clockinit.ci_fastscan == 0) {
 584                 fastscan = MIN(looppages / loopfraction, maxfastscan);
 585         } else {
 586                 fastscan = clockinit.ci_fastscan;
 587         }
 588 
 589         if (fastscan > looppages / loopfraction) {
 590                 fastscan = looppages / loopfraction;
 591         }
 592 
 593         /*
 594          * Set slow scan time to 1/10 the fast scan time, but
 595          * not to exceed maxslowscan.
 596          */
 597         if (clockinit.ci_slowscan == 0) {
 598                 slowscan = MIN(fastscan / 10, maxslowscan);
 599         } else {
 600                 slowscan = clockinit.ci_slowscan;
 601         }
 602 
 603         if (slowscan > fastscan / 2) {
 604                 slowscan = fastscan / 2;
 605         }
 606 
 607         /*
 608          * Handspreadpages is distance (in pages) between front and back
 609          * pageout daemon hands.  The amount of time to reclaim a page
 610          * once pageout examines it increases with this distance and
 611          * decreases as the scan rate rises. It must be < the amount
 612          * of pageable memory.
 613          *
 614          * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 615          * to be "fastscan" results in the front hand being a few secs
 616          * (varies based on the processor speed) ahead of the back hand
 617          * at fastscan rates.  This distance can be further reduced, if
 618          * necessary, by increasing the processor time used by pageout
 619          * to be more than ~4% and preferrably not more than ~10%.
 620          *
 621          * As a result, user processes have a much better chance of
 622          * referencing their pages before the back hand examines them.
 623          * This also significantly lowers the number of reclaims from
 624          * the freelist since pageout does not end up freeing pages which
 625          * may be referenced a sec later.
 626          */
 627         if (clockinit.ci_handspreadpages == 0) {
 628                 handspreadpages = fastscan;
 629         } else {
 630                 handspreadpages = clockinit.ci_handspreadpages;
 631         }
 632 
 633         /*
 634          * Make sure that back hand follows front hand by at least
 635          * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
 636          * back hand to look at a page during the same wakeup of the pageout
 637          * daemon in which the front hand cleared its ref bit.
 638          */
 639         if (handspreadpages >= looppages) {
 640                 handspreadpages = looppages - 1;
 641         }
 642 
 643         /*
 644          * If we have been called to recalculate the parameters, set a flag to
 645          * re-evaluate the clock hand pointers.
 646          */
 647         if (recalc) {
 648                 reset_hands = 1;
 649         }
 650 }
 651 
 652 /*
 653  * Pageout scheduling.
 654  *
 655  * Schedpaging controls the rate at which the page out daemon runs by
 656  * setting the global variables nscan and desscan SCHEDPAGING_HZ
 657  * times a second.  Nscan records the number of pages pageout has examined
 658  * in its current pass; schedpaging() resets this value to zero each time
 659  * it runs.  Desscan records the number of pages pageout should examine
 660  * in its next pass; schedpaging() sets this value based on the amount of
 661  * currently available memory.
 662  */
  663 #define SCHEDPAGING_HZ  4       /* schedpaging() runs per second */
  664 
  665 static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
  666 
  667 /*
  668  * Pool of available async pageout putpage requests.
  669  */
  670 static struct async_reqs *push_req;
  671 static struct async_reqs *req_freelist; /* available req structs */
  672 static struct async_reqs *push_list;    /* pending reqs */
  673 static kmutex_t push_lock;              /* protects req pool */
  674 static kcondvar_t push_cv;
  675 
  676 /*
  677  * If pageout() is stuck on a single push for this many seconds,
  678  * pageout_deadman() will assume the system has hit a memory deadlock.  If set
  679  * to 0, the deadman will have no effect.
  680  *
  681  * Note that we are only looking for stalls in the calls that pageout() makes
  682  * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
  683  * I/O, which should not take long unless the underlying strategy call blocks
  684  * indefinitely for memory.  The actual I/O request happens (or fails) later.
  685  */
  686 uint_t pageout_deadman_seconds = 90;
  687 
      /*
       * NOTE(review): the four variables below are presumably the bookkeeping
       * shared between pageout() and pageout_deadman(); neither function is
       * visible in this section, so confirm before relying on this.
       */
  688 static uint_t pageout_stucktime = 0;
  689 static bool pageout_pushing = false;
  690 static uint64_t pageout_pushcount = 0;
  691 static uint64_t pageout_pushcount_seen = 0;
  692 
  693 static int async_list_size = 256;       /* number of async request structs */
  694 
  695 static void pageout_scanner(void);
  696 
  697 /*
  698  * If a page is being shared more than "po_share" times
  699  * then leave it alone- don't page it out.
       *
       * po_share starts at MIN_PO_SHARE; MAX_PO_SHARE is MIN_PO_SHARE shifted
       * left by 24 bits.
  700  */
  701 #define MIN_PO_SHARE    (8)
  702 #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
  703 ulong_t po_share = MIN_PO_SHARE;
 704 
 705 /*
 706  * Schedule rate for paging.
 707  * Rate is linear interpolation between
 708  * slowscan with lotsfree and fastscan when out of memory.
 709  */
 710 static void
 711 schedpaging(void *arg)
 712 {
 713         spgcnt_t vavail;
 714 
 715         if (freemem < lotsfree + needfree + kmem_reapahead)
 716                 kmem_reap();
 717 
 718         if (freemem < lotsfree + needfree)
 719                 seg_preap();
 720 
 721         if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 722                 kcage_cageout_wakeup();
 723 
 724         if (mutex_tryenter(&pageout_mutex)) {
 725                 /* pageout() not running */
 726                 nscan = 0;
 727                 vavail = freemem - deficit;
 728                 if (pageout_new_spread != 0)
 729                         vavail -= needfree;
 730                 if (vavail < 0)
 731                         vavail = 0;
 732                 if (vavail > lotsfree)
 733                         vavail = lotsfree;
 734 
 735                 /*
 736                  * Fix for 1161438 (CRS SPR# 73922).  All variables
 737                  * in the original calculation for desscan were 32 bit signed
 738                  * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 739                  * more of memory, the calculation can overflow.  When this
 740                  * happens, desscan becomes negative and pageout_scanner()
 741                  * stops paging out.
 742                  */
 743                 if (needfree > 0 && pageout_new_spread == 0) {
 744                         /*
 745                          * If we've not yet collected enough samples to
 746                          * calculate a spread, use the old logic of kicking
 747                          * into high gear anytime needfree is non-zero.
 748                          */
 749                         desscan = fastscan / SCHEDPAGING_HZ;
 750                 } else {
 751                         /*
 752                          * Once we've calculated a spread based on system
 753                          * memory and usage, just treat needfree as another
 754                          * form of deficit.
 755                          */
 756                         spgcnt_t faststmp, slowstmp, result;
 757 
 758                         slowstmp = slowscan * vavail;
 759                         faststmp = fastscan * (lotsfree - vavail);
 760                         result = (slowstmp + faststmp) /
 761                             nz(lotsfree) / SCHEDPAGING_HZ;
 762                         desscan = (pgcnt_t)result;
 763                 }
 764 
 765                 pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
 766                     (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
 767 
 768                 if (freemem < lotsfree + needfree ||
 769                     pageout_sample_cnt < pageout_sample_lim) {
 770                         /*
 771                          * Either we need more memory, or we still need to
 772                          * measure the average scan rate.  Wake the scanner.
 773                          */
 774                         DTRACE_PROBE(pageout__cv__signal);
 775                         cv_signal(&proc_pageout->p_cv);
 776                 } else {
 777                         /*
 778                          * There are enough free pages, no need to
 779                          * kick the scanner thread.  And next time
 780                          * around, keep more of the `highly shared'
 781                          * pages.
 782                          */
 783                         cv_signal_pageout();
 784                         if (po_share > MIN_PO_SHARE) {
 785                                 po_share >>= 1;
 786                         }
 787                 }
 788                 mutex_exit(&pageout_mutex);
 789         }
 790 
 791         /*
 792          * Signal threads waiting for available memory.
 793          * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 794          * in this case it is not needed - the waiters will be waken up during
 795          * the next invocation of this function.
 796          */
 797         if (kmem_avail() > 0)
 798                 cv_broadcast(&memavail_cv);
 799 
 800         (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
 801 }
 802 
 803 pgcnt_t         pushes;
 804 ulong_t         push_list_size;         /* # of requests on pageout queue */
 805 
 806 /*
 807  * Paging out should always be enabled.  This tunable exists to hold pageout
 808  * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 809  * sleep each time it is woken by schedpaging().
 810  */
 811 uint_t dopageout = 1;
 812 
 813 /*
 814  * The page out daemon, which runs as process 2.
 815  *
 816  * As long as there are at least lotsfree pages,
 817  * this process is not run.  When the number of free
 818  * pages stays in the range desfree to lotsfree,
 819  * this daemon runs through the pages in the loop
 820  * at a rate determined in schedpaging().  Pageout manages
 821  * two hands on the clock.  The front hand moves through
 822  * memory, clearing the reference bit,
 823  * and stealing pages from procs that are over maxrss.
 824  * The back hand travels a distance behind the front hand,
 825  * freeing the pages that have not been referenced in the time
 826  * since the front hand passed.  If modified, they are pushed to
 827  * swap before being freed.
 828  *
 829  * There are 2 threads that act on behalf of the pageout process.
 830  * One thread scans pages (pageout_scanner) and frees them up if
 831  * they don't require any VOP_PUTPAGE operation. If a page must be
 832  * written back to its backing store, the request is put on a list
 833  * and the other (pageout) thread is signaled. The pageout thread
 834  * grabs VOP_PUTPAGE requests from the list, and processes them.
 835  * Some filesystems may require resources for the VOP_PUTPAGE
 836  * operations (like memory) and hence can block the pageout
 837  * thread, but the scanner thread can still operate. There is still
 838  * no guarantee that memory deadlocks cannot occur.
 839  *
 840  * For now, this thing is in very rough form.
 841  */
 842 void
 843 pageout()
 844 {
 845         struct async_reqs *arg;
 846         pri_t pageout_pri;
 847         int i;
 848         pgcnt_t max_pushes;
 849         callb_cpr_t cprinfo;
 850 
 851         proc_pageout = ttoproc(curthread);
 852         proc_pageout->p_cstime = 0;
 853         proc_pageout->p_stime =  0;
 854         proc_pageout->p_cutime =  0;
 855         proc_pageout->p_utime = 0;
 856         bcopy("pageout", PTOU(curproc)->u_psargs, 8);
 857         bcopy("pageout", PTOU(curproc)->u_comm, 7);
 858 
 859         /*
 860          * Create pageout scanner thread
 861          */
 862         mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
 863         mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
 864 
 865         /*
 866          * Allocate and initialize the async request structures
 867          * for pageout.
 868          */
 869         push_req = (struct async_reqs *)
 870             kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
 871 
 872         req_freelist = push_req;
 873         for (i = 0; i < async_list_size - 1; i++) {
 874                 push_req[i].a_next = &push_req[i + 1];
 875         }
 876 
 877         pageout_pri = curthread->t_pri;
 878 
 879         /* Create the pageout scanner thread. */
 880         (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
 881             pageout_pri - 1);
 882 
 883         /*
 884          * kick off pageout scheduler.
 885          */
 886         schedpaging(NULL);
 887 
 888         /*
 889          * Create kernel cage thread.
 890          * The kernel cage thread is started under the pageout process
 891          * to take advantage of the less restricted page allocation
 892          * in page_create_throttle().
 893          */
 894         kcage_cageout_init();
 895 
 896         /*
 897          * Limit pushes to avoid saturating pageout devices.
 898          */
 899         max_pushes = maxpgio / SCHEDPAGING_HZ;
 900         CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
 901 
 902         for (;;) {
 903                 mutex_enter(&push_lock);
 904 
 905                 while ((arg = push_list) == NULL || pushes > max_pushes) {
 906                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 907                         cv_wait(&push_cv, &push_lock);
 908                         pushes = 0;
 909                         CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
 910                 }
 911                 push_list = arg->a_next;
 912                 arg->a_next = NULL;
 913                 pageout_pushing = true;
 914                 mutex_exit(&push_lock);
 915 
 916                 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
 917                     arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
 918                         pushes++;
 919                 }
 920 
 921                 /* vp held by checkpage() */
 922                 VN_RELE(arg->a_vp);
 923 
 924                 mutex_enter(&push_lock);
 925                 pageout_pushing = false;
 926                 pageout_pushcount++;
 927                 arg->a_next = req_freelist;  /* back on freelist */
 928                 req_freelist = arg;
 929                 push_list_size--;
 930                 mutex_exit(&push_lock);
 931         }
 932 }
 933 
 934 /*
 935  * Kernel thread that scans pages looking for ones to free
 936  */
 937 static void
 938 pageout_scanner(void)
 939 {
 940         struct page *fronthand, *backhand;
 941         uint_t laps;
 942         callb_cpr_t cprinfo;
 943         pgcnt_t nscan_limit;
 944         pgcnt_t pcount;
 945         bool sampling;
 946 
 947         CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
 948         mutex_enter(&pageout_mutex);
 949 
 950         /*
 951          * The restart case does not attempt to point the hands at roughly
 952          * the right point on the assumption that after one circuit things
 953          * will have settled down, and restarts shouldn't be that often.
 954          */
 955 
 956         /*
 957          * Set the two clock hands to be separated by a reasonable amount,
 958          * but no more than 360 degrees apart.
 959          */
 960         backhand = page_first();
 961         if (handspreadpages >= total_pages) {
 962                 fronthand = page_nextn(backhand, total_pages - 1);
 963         } else {
 964                 fronthand = page_nextn(backhand, handspreadpages);
 965         }
 966 
 967         /*
 968          * Establish the minimum and maximum length of time to be spent
 969          * scanning pages per wakeup, limiting the scanner duty cycle.  The
 970          * input percentage values (0-100) must be converted to a fraction of
 971          * the number of nanoseconds in a second of wall time, then further
 972          * scaled down by the number of scanner wakeups in a second:
 973          */
 974         min_pageout_nsec = MAX(1,
 975             NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
 976         max_pageout_nsec = MAX(min_pageout_nsec,
 977             NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
 978 
 979 loop:
 980         cv_signal_pageout();
 981 
 982         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 983         cv_wait(&proc_pageout->p_cv, &pageout_mutex);
 984         CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
 985 
 986         /*
 987          * Check if pageout has been disabled for debugging purposes:
 988          */
 989         if (!dopageout) {
 990                 goto loop;
 991         }
 992 
 993         /*
 994          * One may reset the clock hands for debugging purposes.  Hands will
 995          * also be reset if memory is added to or removed from the system.
 996          */
 997         if (reset_hands) {
 998                 reset_hands = 0;
 999 
1000                 backhand = page_first();
1001                 if (handspreadpages >= total_pages) {
1002                         fronthand = page_nextn(backhand, total_pages - 1);
1003                 } else {
1004                         fronthand = page_nextn(backhand, handspreadpages);
1005                 }
1006         }
1007 
1008         CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1009 
1010         /*
1011          * Keep track of the number of times we have scanned all the way around
1012          * the loop:
1013          */
1014         laps = 0;
1015 
1016         DTRACE_PROBE(pageout__start);
1017 
1018         /*
1019          * Track the number of pages visited during this scan so that we can
1020          * periodically measure our duty cycle.
1021          */
1022         pcount = 0;
1023 
1024         if (pageout_sample_cnt < pageout_sample_lim) {
1025                 /*
1026                  * We need to measure the rate at which the system is able to
1027                  * scan pages of memory.  Each of these initial samples is a
1028                  * scan of all system memory, regardless of whether or not we
1029                  * are experiencing memory pressure.
1030                  */
1031                 nscan_limit = total_pages;
1032                 sampling = true;
1033         } else {
1034                 nscan_limit = desscan;
1035                 sampling = false;
1036         }
1037 
1038         sample_start = gethrtime();
1039 
1040         /*
1041          * Scan the appropriate number of pages for a single duty cycle.
1042          */
1043         while (nscan < nscan_limit) {
1044                 checkpage_result_t rvfront, rvback;
1045 
1046                 if (!sampling && freemem >= lotsfree + needfree) {
1047                         /*
1048                          * We are not sampling and enough memory has become
1049                          * available that scanning is no longer required.
1050                          */
1051                         break;
1052                 }
1053 
1054                 /*
1055                  * Periodically check to see if we have exceeded the CPU duty
1056                  * cycle for a single wakeup.
1057                  */
1058                 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1059                         pageout_cycle_nsec = gethrtime() - sample_start;
1060                         if (pageout_cycle_nsec >= pageout_nsec) {
1061                                 ++pageout_timeouts;
1062                                 break;
1063                         }
1064                 }
1065 
1066                 /*
1067                  * If checkpage manages to add a page to the free list,
1068                  * we give ourselves another couple of trips around the loop.
1069                  */
1070                 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1071                         laps = 0;
1072                 }
1073                 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1074                         laps = 0;
1075                 }
1076 
1077                 ++pcount;
1078 
1079                 /*
1080                  * Protected by pageout_mutex instead of cpu_stat_lock:
1081                  */
1082                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1083 
1084                 /*
1085                  * Don't include ineligible pages in the number scanned.
1086                  */
1087                 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1088                         nscan++;
1089                 }
1090 
1091                 backhand = page_next(backhand);
1092                 fronthand = page_next(fronthand);
1093 
1094                 /*
1095                  * The front hand has wrapped around to the first page in the
1096                  * loop.
1097                  */
1098                 if (fronthand == page_first()) {
1099                         laps++;
1100                         DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps);
1101 
1102                         /*
1103                          * Protected by pageout_mutex instead of cpu_stat_lock:
1104                          */
1105                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
1106 
1107                         if (laps > 1) {
1108                                 /*
1109                                  * Extremely unlikely, but it happens.
1110                                  * We went around the loop at least once
1111                                  * and didn't get far enough.
1112                                  * If we are still skipping `highly shared'
1113                                  * pages, skip fewer of them.  Otherwise,
1114                                  * give up till the next clock tick.
1115                                  */
1116                                 if (po_share < MAX_PO_SHARE) {
1117                                         po_share <<= 1;
1118                                 } else {
1119                                         break;
1120                                 }
1121                         }
1122                 }
1123         }
1124 
1125         sample_end = gethrtime();
1126 
1127         DTRACE_PROBE1(pageout__end, uint_t, laps);
1128 
1129         if (pageout_new_spread == 0) {
1130                 if (pageout_sample_cnt < pageout_sample_lim) {
1131                         /*
1132                          * Continue accumulating samples until we have enough
1133                          * to get a reasonable value for average scan rate:
1134                          */
1135                         pageout_sample_pages += pcount;
1136                         pageout_sample_etime += sample_end - sample_start;
1137                         ++pageout_sample_cnt;
1138                 }
1139 
1140                 if (pageout_sample_cnt >= pageout_sample_lim) {
1141                         /*
1142                          * We have enough samples, set the spread.
1143                          */
1144                         pageout_rate = (hrrate_t)pageout_sample_pages *
1145                             (hrrate_t)(NANOSEC) / pageout_sample_etime;
1146                         pageout_new_spread = pageout_rate / 10;
1147                         setupclock();
1148                 }
1149         }
1150 
1151         goto loop;
1152 }
1153 
1154 /*
1155  * The pageout deadman is run once per second by clock().
1156  */
1157 void
1158 pageout_deadman(void)
1159 {
1160         if (panicstr != NULL) {
1161                 /*
1162                  * There is no pageout after panic.
1163                  */
1164                 return;
1165         }
1166 
1167         if (pageout_deadman_seconds == 0) {
1168                 /*
1169                  * The deadman is not enabled.
1170                  */
1171                 return;
1172         }
1173 
1174         if (!pageout_pushing) {
1175                 goto reset;
1176         }
1177 
1178         /*
1179          * We are pushing a page.  Check to see if it is the same call we saw
1180          * last time we looked:
1181          */
1182         if (pageout_pushcount != pageout_pushcount_seen) {
1183                 /*
1184                  * It is a different call from the last check, so we are not
1185                  * stuck.
1186                  */
1187                 goto reset;
1188         }
1189 
1190         if (++pageout_stucktime >= pageout_deadman_seconds) {
1191                 panic("pageout_deadman: stuck pushing the same page for %d "
1192                     "seconds (freemem is %lu)", pageout_deadman_seconds,
1193                     freemem);
1194         }
1195 
1196         return;
1197 
1198 reset:
1199         /*
1200          * Reset our tracking state to reflect that we are not stuck:
1201          */
1202         pageout_stucktime = 0;
1203         pageout_pushcount_seen = pageout_pushcount;
1204 }
1205 
1206 /*
1207  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
1208  * system (u., page table) or free, then leave it alone.  Otherwise,
1209  * if we are running the front hand, turn off the page's reference bit.
1210  * If the proc is over maxrss, we take it.  If running the back hand,
1211  * check whether the page has been reclaimed.  If not, free the page,
1212  * pushing it to disk first if necessary.
1213  *
1214  * Return values:
1215  *      CKP_INELIGIBLE if the page is not a candidate at all,
1216  *      CKP_NOT_FREED  if the page was not freed, or
1217  *      CKP_FREED      if we freed it.
1218  */
1219 static checkpage_result_t
1220 checkpage(struct page *pp, pageout_hand_t whichhand)
1221 {
1222         int ppattr;
1223         int isfs = 0;
1224         int isexec = 0;
1225         int pagesync_flag;
1226 
1227         /*
1228          * Skip pages:
1229          *      - associated with the kernel vnode since
1230          *          they are always "exclusively" locked.
1231          *      - that are free
1232          *      - that are shared more than po_share'd times
1233          *      - its already locked
1234          *
1235          * NOTE:  These optimizations assume that reads are atomic.
1236          */
1237 
1238         if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1239             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1240             hat_page_checkshare(pp, po_share)) {
1241                 return (CKP_INELIGIBLE);
1242         }
1243 
1244         if (!page_trylock(pp, SE_EXCL)) {
1245                 /*
1246                  * Skip the page if we can't acquire the "exclusive" lock.
1247                  */
1248                 return (CKP_INELIGIBLE);
1249         } else if (PP_ISFREE(pp)) {
1250                 /*
1251                  * It became free between the above check and our actually
1252                  * locking the page.  Oh well, there will be other pages.
1253                  */
1254                 page_unlock(pp);
1255                 return (CKP_INELIGIBLE);
1256         }
1257 
1258         /*
1259          * Reject pages that cannot be freed. The page_struct_lock
1260          * need not be acquired to examine these
1261          * fields since the page has an "exclusive" lock.
1262          */
1263         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1264                 page_unlock(pp);
1265                 return (CKP_INELIGIBLE);
1266         }
1267 
1268         /*
1269          * Maintain statistics for what we are freeing
1270          */
1271         if (pp->p_vnode != NULL) {
1272                 if (pp->p_vnode->v_flag & VVMEXEC)
1273                         isexec = 1;
1274 
1275                 if (!IS_SWAPFSVP(pp->p_vnode))
1276                         isfs = 1;
1277         }
1278 
1279         /*
1280          * Turn off REF and MOD bits with the front hand.
1281          * The back hand examines the REF bit and always considers
1282          * SHARED pages as referenced.
1283          */
1284         if (whichhand == POH_FRONT) {
1285                 pagesync_flag = HAT_SYNC_ZERORM;
1286         } else {
1287                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1288                     HAT_SYNC_STOPON_SHARED;
1289         }
1290 
1291         ppattr = hat_pagesync(pp, pagesync_flag);
1292 
1293 recheck:
1294         /*
1295          * If page is referenced; make unreferenced but reclaimable.
1296          * If this page is not referenced, then it must be reclaimable
1297          * and we can add it to the free list.
1298          */
1299         if (ppattr & P_REF) {
1300                 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1301                     pageout_hand_t, whichhand);
1302 
1303                 if (whichhand == POH_FRONT) {
1304                         /*
1305                          * Checking of rss or madvise flags needed here...
1306                          *
1307                          * If not "well-behaved", fall through into the code
1308                          * for not referenced.
1309                          */
1310                         hat_clrref(pp);
1311                 }
1312 
1313                 /*
1314                  * Somebody referenced the page since the front
1315                  * hand went by, so it's not a candidate for
1316                  * freeing up.
1317                  */
1318                 page_unlock(pp);
1319                 return (CKP_NOT_FREED);
1320         }
1321 
1322         VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1323 
1324         /*
1325          * If large page, attempt to demote it. If successfully demoted,
1326          * retry the checkpage.
1327          */
1328         if (pp->p_szc != 0) {
1329                 if (!page_try_demote_pages(pp)) {
1330                         VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1331                         page_unlock(pp);
1332                         return (CKP_INELIGIBLE);
1333                 }
1334 
1335                 ASSERT(pp->p_szc == 0);
1336                 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1337 
1338                 /*
1339                  * Since page_try_demote_pages() could have unloaded some
1340                  * mappings it makes sense to reload ppattr.
1341                  */
1342                 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1343         }
1344 
1345         /*
1346          * If the page is currently dirty, we have to arrange to have it
1347          * cleaned before it can be freed.
1348          *
1349          * XXX - ASSERT(pp->p_vnode != NULL);
1350          */
1351         if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1352                 struct vnode *vp = pp->p_vnode;
1353                 u_offset_t offset = pp->p_offset;
1354 
1355                 /*
1356                  * XXX - Test for process being swapped out or about to exit?
1357                  * [Can't get back to process(es) using the page.]
1358                  */
1359 
1360                 /*
1361                  * Hold the vnode before releasing the page lock to
1362                  * prevent it from being freed and re-used by some
1363                  * other thread.
1364                  */
1365                 VN_HOLD(vp);
1366                 page_unlock(pp);
1367 
1368                 /*
1369                  * Queue I/O request for the pageout thread.
1370                  */
1371                 if (!queue_io_request(vp, offset)) {
1372                         VN_RELE(vp);
1373                         return (CKP_NOT_FREED);
1374                 }
1375                 return (CKP_FREED);
1376         }
1377 
1378         /*
1379          * Now we unload all the translations and put the page back on to the
1380          * free list.  If the page was used (referenced or modified) after the
1381          * pagesync but before it was unloaded we catch it and handle the page
1382          * properly.
1383          */
1384         DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1385         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1386         ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1387         if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1388                 goto recheck;
1389         }
1390 
1391         VN_DISPOSE(pp, B_FREE, 0, kcred);
1392 
1393         CPU_STATS_ADD_K(vm, dfree, 1);
1394 
1395         if (isfs) {
1396                 if (isexec) {
1397                         CPU_STATS_ADD_K(vm, execfree, 1);
1398                 } else {
1399                         CPU_STATS_ADD_K(vm, fsfree, 1);
1400                 }
1401         } else {
1402                 CPU_STATS_ADD_K(vm, anonfree, 1);
1403         }
1404 
1405         return (CKP_FREED);
1406 }
1407 
1408 /*
1409  * Queue async i/o request from pageout_scanner and segment swapout
1410  * routines on one common list.  This ensures that pageout devices (swap)
1411  * are not saturated by pageout_scanner or swapout requests.
1412  * The pageout thread empties this list by initiating i/o operations.
1413  */
1414 int
1415 queue_io_request(vnode_t *vp, u_offset_t off)
1416 {
1417         struct async_reqs *arg;
1418 
1419         /*
1420          * If we cannot allocate an async request struct,
1421          * skip this page.
1422          */
1423         mutex_enter(&push_lock);
1424         if ((arg = req_freelist) == NULL) {
1425                 mutex_exit(&push_lock);
1426                 return (0);
1427         }
1428         req_freelist = arg->a_next;          /* adjust freelist */
1429         push_list_size++;
1430 
1431         arg->a_vp = vp;
1432         arg->a_off = off;
1433         arg->a_len = PAGESIZE;
1434         arg->a_flags = B_ASYNC | B_FREE;
1435         arg->a_cred = kcred;         /* always held */
1436 
1437         /*
1438          * Add to list of pending write requests.
1439          */
1440         arg->a_next = push_list;
1441         push_list = arg;
1442 
1443         if (req_freelist == NULL) {
1444                 /*
1445                  * No free async requests left. The lock is held so we
1446                  * might as well signal the pusher thread now.
1447                  */
1448                 cv_signal(&push_cv);
1449         }
1450         mutex_exit(&push_lock);
1451         return (1);
1452 }
1453 
1454 /*
1455  * Wakeup pageout to initiate i/o if push_list is not empty.
1456  */
1457 void
1458 cv_signal_pageout()
1459 {
1460         if (push_list != NULL) {
1461                 mutex_enter(&push_lock);
1462                 cv_signal(&push_cv);
1463                 mutex_exit(&push_lock);
1464         }
1465 }