13097 improve VM tunables for modern systems (fix mismerge)


 225 pgcnt_t         lotsfree_min = 0;
 226 pgcnt_t         lotsfree_max = 0;
 227 
 228 #define         LOTSFREE_MIN_DEFAULT    (16 * MEGABYTES)
 229 #define         LOTSFREE_MAX_DEFAULT    (2048 * MEGABYTES)
 230 
 231 /*
 232  * If these tunables are set to non-zero values in /etc/system, and provided
 233  * the value is not larger than the threshold above, the specified value will
 234  * be used directly without any additional calculation or adjustment.  The boot
 235  * time value of these overrides is preserved in the "clockinit" struct.  More
 236  * detail is available in the comment at the top of the file.
 237  */
 238 pgcnt_t         maxpgio = 0;
 239 pgcnt_t         minfree = 0;
 240 pgcnt_t         desfree = 0;
 241 pgcnt_t         lotsfree = 0;
 242 pgcnt_t         needfree = 0;
 243 pgcnt_t         throttlefree = 0;
 244 pgcnt_t         pageout_reserve = 0;

 245 
 246 pgcnt_t         deficit;
 247 pgcnt_t         nscan;
 248 pgcnt_t         desscan;
 249 
 250 /* kstats */
 251 uint64_t low_mem_scan;
 252 uint64_t zone_cap_scan;
 253 uint64_t n_throttle;
 254 


 255 /*
 256  * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
 257  * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
 258  * that gives the equivalent of some underlying %CPU duty cycle.
 259  *
 260  * min_pageout_nsec:
 261  *     nanoseconds/wakeup equivalent of min_percent_cpu.
 262  *
 263  * max_pageout_nsec:
 264  *     nanoseconds/wakeup equivalent of max_percent_cpu.
 265  *
 266  * pageout_nsec:
 267  *     Number of nanoseconds budgeted for each wakeup cycle.
 268  *     Computed each time around by schedpaging().
 269  *     Varies between min_pageout_nsec and max_pageout_nsec,
 270  *     depending on memory pressure or zones over their cap.
 271  *
 272  * zone_pageout_nsec:
 273  *     Number of nanoseconds budgeted for each cycle when a zone
 274  *     is over its memory cap. If this is zero, then the value
 275  *     of max_pageout_nsec is used instead.
 276  */
 277 
 278 static hrtime_t min_pageout_nsec;
 279 static hrtime_t max_pageout_nsec;
 280 static hrtime_t pageout_nsec;
 281 static hrtime_t zone_pageout_nsec;
 282 
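For illustration, a minimal standalone sketch of the percent-CPU to per-wakeup
budget conversion that these variables hold; it mirrors the computation
pageout_scanner() performs at startup, and the percentage values here are
assumed examples rather than the kernel's actual tunables:

#include <stdio.h>

#define NANOSEC         1000000000LL    /* nanoseconds per second */
#define SCHEDPAGING_HZ  4               /* scanner wakeups per second */

int
main(void)
{
        long long min_percent_cpu = 4;  /* assumed example value */
        long long max_percent_cpu = 80; /* assumed example value */

        /* A percentage of one second of CPU time, split across wakeups. */
        long long min_nsec = NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ;
        long long max_nsec = NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ;

        /* 4% of a CPU at 4 wakeups/sec comes to 10ms per wakeup. */
        printf("min %lld ns, max %lld ns\n", min_nsec, max_nsec);
        return (0);
}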
 283 #define MAX_PSCAN_THREADS       16
 284 static boolean_t reset_hands[MAX_PSCAN_THREADS];
 285 
 286 /*
 287  * These can be tuned in /etc/system or set with mdb.
 288  * 'des_page_scanners' is the desired number of page scanner threads. The
 289  * system will bring the actual number of threads into line with the desired
 290  * number. If des_page_scanners is set to an invalid value, the system will
 291  * correct the setting.
 292  */
 293 uint_t des_page_scanners;
 294 uint_t pageout_reset_cnt = 64;  /* num. cycles for pageout_scanner hand reset */
 295 
 296 uint_t n_page_scanners;
 297 static pgcnt_t  pscan_region_sz; /* informational only */
 298 
 299 #define PAGES_POLL_MASK 1023
 300 
 301 /*
 302  * pageout_sample_lim:
 303  *     The limit on the number of samples needed to establish a value for new
 304  *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 305  *     handspreadpages.
 306  *
 307  * pageout_sample_cnt:
 308  *     Current sample number.  Once the sample gets large enough, set new
 309  *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 310  *
 311  * pageout_sample_pages:
 312  *     The accumulated number of pages scanned during sampling.
 313  *
 314  * pageout_sample_etime:
 315  *     The accumulated nanoseconds for the sample.
 316  *
 317  * pageout_rate:
 318  *     Rate in pages/nanosecond, computed at the end of sampling.
 319  *
 320  * pageout_new_spread:
 321  *     Initially zero while the system scan rate is measured by
 322  *     pageout_scanner(), which then sets this value once per system boot after
 323  *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 324  *     new value is used for fastscan and handspreadpages.
 325  */
 326 
 327 typedef hrtime_t hrrate_t;
 328 
 329 static uint64_t pageout_sample_lim = 4;
 330 static uint64_t pageout_sample_cnt = 0;
 331 static pgcnt_t  pageout_sample_pages = 0;

 332 static hrrate_t pageout_rate = 0;
 333 static pgcnt_t  pageout_new_spread = 0;
 334 
 335 static hrtime_t pageout_sample_etime = 0;
 336 
 337 /* True if the page scanner is first starting up */
 338 #define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
 339 
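A worked sketch of what the sampling phase computes once pageout_sample_cnt
reaches pageout_sample_lim, following the arithmetic at the bottom of
pageout_scanner(); the sample totals are assumed example values:

#include <stdio.h>
#include <inttypes.h>

#define NANOSEC 1000000000LL

int
main(void)
{
        /* Assumed totals after the initial full-memory sample scans. */
        uint64_t pageout_sample_pages = 16000000;       /* pages visited */
        int64_t pageout_sample_etime = 2 * NANOSEC;     /* 2 seconds */

        int64_t pageout_rate = (int64_t)pageout_sample_pages * NANOSEC /
            pageout_sample_etime;

        /* pageout_new_spread then seeds fastscan and handspreadpages. */
        int64_t pageout_new_spread = pageout_rate / 10;

        printf("rate=%" PRId64 " spread=%" PRId64 "\n",
            pageout_rate, pageout_new_spread);
        return (0);
}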
 340 /*
 341  * Record number of times a pageout_scanner() wakeup cycle finished because it
 342  * timed out (exceeded its CPU budget), rather than because it visited
 343  * its budgeted number of pages. This is only done when scanning under low
 344  * free memory conditions, not when scanning for zones over their cap.
 345  */
 346 uint64_t        pageout_timeouts = 0;
 347 
 348 #ifdef VM_STATS
 349 static struct pageoutvmstats_str {
 350         ulong_t checkpage[3];
 351 } pageoutvmstats;
 352 #endif /* VM_STATS */
 353 
 354 /*
 355  * Threads waiting for free memory use this condition variable and lock until
 356  * memory becomes available.
 357  */
 358 kmutex_t        memavail_lock;
 359 kcondvar_t      memavail_cv;


 368         CKP_NOT_FREED,
 369         CKP_FREED,
 370 } checkpage_result_t;
 371 
 372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
 373 
 374 static struct clockinit {
 375         bool ci_init;
 376         pgcnt_t ci_lotsfree_min;
 377         pgcnt_t ci_lotsfree_max;
 378         pgcnt_t ci_lotsfree;
 379         pgcnt_t ci_desfree;
 380         pgcnt_t ci_minfree;
 381         pgcnt_t ci_throttlefree;
 382         pgcnt_t ci_pageout_reserve;
 383         pgcnt_t ci_maxpgio;
 384         pgcnt_t ci_maxfastscan;
 385         pgcnt_t ci_fastscan;
 386         pgcnt_t ci_slowscan;
 387         pgcnt_t ci_handspreadpages;

 388 } clockinit = { .ci_init = false };
 389 
 390 static pgcnt_t
 391 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
 392 {
 393         if (value < minimum) {
 394                 return (minimum);
 395         } else if (value > maximum) {
 396                 return (maximum);
 397         } else {
 398                 return (value);
 399         }
 400 }
 401 
 402 static pgcnt_t
 403 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
 404 {
 405         if (initval == 0 || initval >= initval_ceiling) {
 406                 return (defval);
 407         } else {
 408                 return (initval);
 409         }
 410 }
 411 
 412 /*
 413  * Local boolean to control scanning when zones are over their cap. It
 414  * confines our reads of the zone_num_over_cap variable to schedpaging(),
 415  * which only runs periodically; zone_num_over_cap is already accessed a lot
 416  * during paging, and the page scanner checks the zones_over variable on each
 417  * page during a scan. There is no lock needed for zone_num_over_cap since
 418  * schedpaging() doesn't modify the variable; it only cares whether the
 419  * variable is 0 or non-0.
 420  */
 421 static boolean_t zones_over = B_FALSE;
 422 
 423 /*
 424  * Set up the paging constants for the clock algorithm used by
 425  * pageout_scanner(), and by the virtual memory system overall.  See the
 426  * comments at the top of this file for more information about the threshold
 427  * values and system responses to memory pressure.
 428  *
 429  * This routine is called once by main() at startup, after the initial size of
 430  * physical memory is determined.  It may be called again later if memory is
 431  * added to or removed from the system, or if new measurements of the page scan
 432  * rate become available.
 433  */
 434 void
 435 setupclock(void)
 436 {
 437         uint_t i;
 438         pgcnt_t sz, tmp;
 439         pgcnt_t defval;
 440         bool half = (pageout_threshold_style == 1);
 441         bool recalc = true;
 442 
 443         looppages = total_pages;
 444 
 445         /*
 446          * The operator may have provided specific values for some of the
 447          * tunables via /etc/system.  On our first call, we preserve those
 448          * values so that they can be used for subsequent recalculations.
 449          *
 450          * A value of zero for any tunable means we will use the default
 451          * sizing.
 452          */
 453 
 454         if (!clockinit.ci_init) {
 455                 clockinit.ci_init = true;
 456 
 457                 clockinit.ci_lotsfree_min = lotsfree_min;
 458                 clockinit.ci_lotsfree_max = lotsfree_max;
 459                 clockinit.ci_lotsfree = lotsfree;
 460                 clockinit.ci_desfree = desfree;
 461                 clockinit.ci_minfree = minfree;
 462                 clockinit.ci_throttlefree = throttlefree;
 463                 clockinit.ci_pageout_reserve = pageout_reserve;
 464                 clockinit.ci_maxpgio = maxpgio;
 465                 clockinit.ci_maxfastscan = maxfastscan;
 466                 clockinit.ci_fastscan = fastscan;
 467                 clockinit.ci_slowscan = slowscan;
 468                 clockinit.ci_handspreadpages = handspreadpages;

 469 
 470                 /*
 471                  * The first call does not trigger a recalculation, only
 472                  * subsequent calls.
 473                  */
 474                 recalc = false;
 475         }
 476 
 477         /*
 478          * Configure paging threshold values.  For more details on what each
 479          * threshold signifies, see the comments at the top of this file.
 480          */
 481         lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
 482             btop(LOTSFREE_MAX_DEFAULT));
 483         lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
 484             btop(LOTSFREE_MIN_DEFAULT));
 485 
 486         lotsfree = tune(clockinit.ci_lotsfree, looppages,
 487             clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
 488 


 630 
 631         if (fastscan > looppages / loopfraction) {
 632                 fastscan = looppages / loopfraction;
 633         }
 634 
 635         /*
 636          * Set slow scan time to 1/10 the fast scan time, but
 637          * not to exceed maxslowscan.
 638          */
 639         if (clockinit.ci_slowscan == 0) {
 640                 slowscan = MIN(fastscan / 10, maxslowscan);
 641         } else {
 642                 slowscan = clockinit.ci_slowscan;
 643         }
 644 
 645         if (slowscan > fastscan / 2) {
 646                 slowscan = fastscan / 2;
 647         }
 648 
 649         /*
 650  * Handspreadpages is the distance (in pages) between front and back
 651          * pageout daemon hands.  The amount of time to reclaim a page
 652          * once pageout examines it increases with this distance and
 653          * decreases as the scan rate rises. It must be < the amount
 654          * of pageable memory.
 655          *
 656          * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 657          * to be "fastscan" results in the front hand being a few secs
 658          * (varies based on the processor speed) ahead of the back hand
 659          * at fastscan rates.  This distance can be further reduced, if
 660          * necessary, by increasing the processor time used by pageout
 661  * to be more than ~4% and preferably not more than ~10%.
 662          *
 663          * As a result, user processes have a much better chance of
 664          * referencing their pages before the back hand examines them.
 665          * This also significantly lowers the number of reclaims from
 666          * the freelist since pageout does not end up freeing pages which
 667          * may be referenced a sec later.
 668          */
 669         if (clockinit.ci_handspreadpages == 0) {
 670                 handspreadpages = fastscan;
 671         } else {
 672                 handspreadpages = clockinit.ci_handspreadpages;
 673         }
 674 
 675         /*
 676          * Make sure that back hand follows front hand by at least
 677          * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
 678          * back hand to look at a page during the same wakeup of the pageout
 679          * daemon in which the front hand cleared its ref bit.
 680          */
 681         if (handspreadpages >= looppages) {
 682                 handspreadpages = looppages - 1;
 683         }
 684 
 685         if (!recalc) {
 686                 /*
 687                  * Set up basic values at initialization.
 688                  */
 689                 pscan_region_sz = total_pages;
 690                 des_page_scanners = n_page_scanners = 1;
 691                 reset_hands[0] = B_TRUE;
 692                 return;
 693         }
 694 
 695         /*
 696          * Recalculating
 697          *
 698          * We originally set the number of page scanners to 1. Now that we
 699          * know what the handspreadpages is for a scanner, figure out how many
 700          * scanners we should run. We want to ensure that the regions don't
 701          * overlap and that they are not touching.
 702          *
 703          * A default 64GB region size is used as the initial value to calculate
 704          * how many scanner threads we should create on lower memory systems.
 705          * The idea is to limit the number of threads to a practical value
 706          * (e.g. a 64GB machine really only needs one scanner thread). For very
 707          * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
 708          * threads.
 709          *
 710          * The scanner threads themselves are evenly spread out around the
 711          * memory "clock" in pageout_scanner when we reset the hands, and each
 712          * thread will scan all of memory.
 713          */
 714         sz = (btop(64ULL * 0x40000000ULL));
 715         if (sz < handspreadpages) {

 716                 /*
 717                  * 64GB is smaller than the separation between the front
 718                  * and back hands; use double handspreadpages.
 719                  */
 720                 sz = handspreadpages << 1;
 721         }
 722         if (sz > total_pages) {
 723                 sz = total_pages;
 724         }
 725         /* Record region size for inspection with mdb, otherwise unused */
 726         pscan_region_sz = sz;
 727 
 728         tmp = sz;
 729         for (i = 1; tmp < total_pages; i++) {
 730                 tmp += sz;
 731         }
 732 
 733         if (i > MAX_PSCAN_THREADS)
 734                 i = MAX_PSCAN_THREADS;
 735 
 736         des_page_scanners = i;
 737 }
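As a worked check of the region arithmetic in setupclock() above, a standalone
sketch; the 4 KiB page size and 256 GiB memory size are assumed example values:

#include <stdio.h>

#define MAX_PSCAN_THREADS 16

/* Assumed 4 KiB pages for the example. */
static unsigned long long
btop(unsigned long long bytes)
{
        return (bytes >> 12);
}

int
main(void)
{
        unsigned long long total_pages = btop(256ULL << 30); /* 256 GiB */
        unsigned long long sz = btop(64ULL << 30);           /* 64 GiB region */
        unsigned long long tmp = sz;
        unsigned int i;

        for (i = 1; tmp < total_pages; i++)
                tmp += sz;
        if (i > MAX_PSCAN_THREADS)
                i = MAX_PSCAN_THREADS;

        /* Prints 4: one scanner thread per 64 GiB region. */
        printf("des_page_scanners = %u\n", i);
        return (0);
}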
 738 
 739 /*
 740  * Pageout scheduling.
 741  *
 742  * Schedpaging controls the rate at which the page out daemon runs by
 743  * setting the global variables nscan and desscan SCHEDPAGING_HZ
 744  * times a second.  Nscan records the number of pages pageout has examined
 745  * in its current pass; schedpaging() resets this value to zero each time
 746  * it runs.  Desscan records the number of pages pageout should examine
 747  * in its next pass; schedpaging() sets this value based on the amount of
 748  * currently available memory.
 749  */
 750 #define SCHEDPAGING_HZ  4
 751 
 752 static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
 753 
 754 /*
 755  * Pool of available async pageout putpage requests.
 756  */
 757 static struct async_reqs *push_req;
 758 static struct async_reqs *req_freelist; /* available req structs */
 759 static struct async_reqs *push_list;    /* pending reqs */
 760 static kmutex_t push_lock;              /* protects req pool */
 761 static kcondvar_t push_cv;
 762 
 763 /*
 764  * If pageout() is stuck on a single push for this many seconds,
 765  * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 766  * to 0, the deadman will have no effect.
 767  *
 768  * Note that we are only looking for stalls in the calls that pageout() makes
 769  * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 770  * I/O, which should not take long unless the underlying strategy call blocks
 771  * indefinitely for memory.  The actual I/O request happens (or fails) later.
 772  */
 773 uint_t pageout_deadman_seconds = 90;
 774 
 775 static uint_t pageout_stucktime = 0;
 776 static bool pageout_pushing = false;
 777 static uint64_t pageout_pushcount = 0;
 778 static uint64_t pageout_pushcount_seen = 0;
 779 
 780 static int async_list_size = 256;       /* number of async request structs */
 781 
 782 static void pageout_scanner(void *);
 783 
 784 /*
 785  * If a page is being shared more than "po_share" times
 786  * then leave it alone- don't page it out.
 787  */
 788 #define MIN_PO_SHARE    (8)
 789 #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
 790 ulong_t po_share = MIN_PO_SHARE;
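Note the dynamics around this tunable, visible at the two call sites below:
schedpaging() halves po_share back toward MIN_PO_SHARE whenever memory is
plentiful, while a low-memory pageout_scanner() pass that laps the clock
without freeing enough pages doubles it, up to MAX_PO_SHARE.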
 791 
 792 /*
 793  * Schedule rate for paging.
 794  * Rate is linear interpolation between
 795  * slowscan with lotsfree and fastscan when out of memory.
 796  */
 797 static void
 798 schedpaging(void *arg)
 799 {
 800         spgcnt_t vavail;
 801 
 802         if (freemem < lotsfree + needfree + kmem_reapahead)
 803                 kmem_reap();
 804 
 805         if (freemem < lotsfree + needfree)
 806                 seg_preap();
 807 
 808         if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 809                 kcage_cageout_wakeup();
 810 
 811         (void) atomic_swap_ulong(&nscan, 0);
 812         vavail = freemem - deficit;
 813         if (pageout_new_spread != 0)
 814                 vavail -= needfree;
 815         if (vavail < 0)
 816                 vavail = 0;
 817         if (vavail > lotsfree)
 818                 vavail = lotsfree;
 819 
 820         /*
 821          * Fix for 1161438 (CRS SPR# 73922).  All variables
 822          * in the original calculation for desscan were 32 bit signed
 823          * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 824          * more of memory, the calculation can overflow.  When this
 825          * happens, desscan becomes negative and pageout_scanner()
 826          * stops paging out.
 827          */
 828         if (needfree > 0 && pageout_new_spread == 0) {
 829                 /*
 830                  * If we've not yet collected enough samples to
 831                  * calculate a spread, kick into high gear anytime
 832                  * needfree is non-zero. Note that desscan will not be
 833                  * the limiting factor for systems with larger memory;
 834                  * the %CPU will limit the scan. That will also be
 835                  * maxed out below.
 836                  */
 837                 desscan = fastscan / SCHEDPAGING_HZ;
 838         } else {
 839                 /*
 840                  * Once we've calculated a spread based on system
 841                  * memory and usage, just treat needfree as another
 842                  * form of deficit.
 843                  */
 844                 spgcnt_t faststmp, slowstmp, result;
 845 
 846                 slowstmp = slowscan * vavail;
 847                 faststmp = fastscan * (lotsfree - vavail);
 848                 result = (slowstmp + faststmp) /
 849                     nz(lotsfree) / SCHEDPAGING_HZ;
 850                 desscan = (pgcnt_t)result;
 851         }
 852 
 853         /*
 854          * If we've not yet collected enough samples to calculate a
 855          * spread, also kick %CPU to the max.
 856          */
 857         if (pageout_new_spread == 0) {
 858                 pageout_nsec = max_pageout_nsec;
 859         } else {
 860                 pageout_nsec = min_pageout_nsec +
 861                     (lotsfree - vavail) *
 862                     (max_pageout_nsec - min_pageout_nsec) /
 863                     nz(lotsfree);
 864         }
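The desscan and pageout_nsec computations above are the same linear
interpolation between a floor and a ceiling as available memory falls; a
minimal standalone sketch of the desscan form, with assumed example values
(nz() is a local stand-in for the kernel's divide-by-zero guard):

#include <stdio.h>

#define SCHEDPAGING_HZ  4
#define nz(x)           ((x) != 0 ? (x) : 1)    /* avoid division by zero */

int
main(void)
{
        /* Assumed example values, in pages. */
        long long lotsfree = 100000, slowscan = 5000, fastscan = 50000;

        for (long long vavail = 0; vavail <= lotsfree; vavail += 50000) {
                /* vavail == lotsfree: slowscan/HZ; vavail == 0: fastscan/HZ. */
                long long desscan = (slowscan * vavail +
                    fastscan * (lotsfree - vavail)) / nz(lotsfree) /
                    SCHEDPAGING_HZ;
                printf("vavail=%6lld desscan=%6lld\n", vavail, desscan);
        }
        return (0);
}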
 865 
 866         if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {




 867                 /*
 868                  * We have finished the pagescan initialization and the desired
 869                  * number of page scanners has changed, either because
 870                  * initialization just finished, because of a memory DR, or
 871                  * because des_page_scanners has been modified on the fly (i.e.
 872                  * by mdb). If we need more scanners, start them now, otherwise
 873                  * the excess scanners will terminate on their own when they
 874                  * reset their hands.
 875                  */
 876                 uint_t i;
 877                 uint_t curr_nscan = n_page_scanners;
 878                 pgcnt_t max = total_pages / handspreadpages;
 879 
 880                 if (des_page_scanners > max)
 881                         des_page_scanners = max;
 882 
 883                 if (des_page_scanners > MAX_PSCAN_THREADS) {
 884                         des_page_scanners = MAX_PSCAN_THREADS;
 885                 } else if (des_page_scanners == 0) {
 886                         des_page_scanners = 1;
 887                 }
 888 
 889                 /*
 890                  * Each thread has its own entry in the reset_hands array, so
 891                  * we don't need any locking in pageout_scanner to check the
 892                  * thread's reset_hands entry. Thus, we use a pre-allocated
 893                  * fixed size reset_hands array and upper limit on the number
 894                  * of pagescan threads.
 895                  *
 896                  * The reset_hands entries need to be true before we start new
 897                  * scanners, but if we're reducing, we don't want a race on the
 898                  * recalculation for the existing threads, so we set
 899                  * n_page_scanners first.
 900                  */
 901                 n_page_scanners = des_page_scanners;
 902                 for (i = 0; i < MAX_PSCAN_THREADS; i++) {
 903                         reset_hands[i] = B_TRUE;
 904                 }
 905 
 906                 if (des_page_scanners > curr_nscan) {
 907                         /* Create additional pageout scanner threads. */
 908                         for (i = curr_nscan; i < des_page_scanners; i++) {
 909                                 (void) lwp_kernel_create(proc_pageout,
 910                                     pageout_scanner, (void *)(uintptr_t)i,
 911                                     TS_RUN, curthread->t_pri);
 912                         }
 913                 }
 914         }
 915 
 916         zones_over = B_FALSE;
 917 
 918         if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
 919                 if (!PAGE_SCAN_STARTUP)
 920                         low_mem_scan++;
 921                 /*
 922                  * Either we need more memory, or we still need to
 923                  * measure the average scan rate.  Wake the scanner.
 924                  */
 925                 DTRACE_PROBE(schedpage__wake__low);
 926                 WAKE_PAGEOUT_SCANNER();
 927 
 928         } else if (zone_num_over_cap > 0) {
 929                 /* One or more zones are over their cap. */
 930 
 931                 /* No page limit */
 932                 desscan = total_pages;
 933 
 934                 /*
 935                  * Increase the scanning CPU% to the max. This implies
 936                  * 80% of one CPU/sec if the scanner can run each
 937                  * opportunity. Can also be tuned via setting
 938                  * zone_pageout_nsec in /etc/system or with mdb.
 939                  */
 940                 pageout_nsec = (zone_pageout_nsec != 0) ?
 941                     zone_pageout_nsec : max_pageout_nsec;
 942 
 943                 zones_over = B_TRUE;
 944                 zone_cap_scan++;
 945 
 946                 DTRACE_PROBE(schedpage__wake__zone);
 947                 WAKE_PAGEOUT_SCANNER();
 948 
 949         } else {
 950                 /*
 951                  * There are enough free pages, no need to
 952                  * kick the scanner thread.  And next time
 953                  * around, keep more of the `highly shared'
 954                  * pages.
 955                  */
 956                 cv_signal_pageout();
 957 
 958                 mutex_enter(&pageout_mutex);
 959                 if (po_share > MIN_PO_SHARE) {
 960                         po_share >>= 1;
 961                 }
 962                 mutex_exit(&pageout_mutex);
 963         }
 964 
 965         /*
 966          * Signal threads waiting for available memory.
 967          * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 968  * in this case it is not needed; the waiters will be woken up during
 969          * the next invocation of this function.
 970          */
 971         if (kmem_avail() > 0)
 972                 cv_broadcast(&memavail_cv);
 973 
 974         (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
 975 }
 976 
 977 pgcnt_t         pushes;
 978 ulong_t         push_list_size;         /* # of requests on pageout queue */
 979 
 980 /*
 981  * Paging out should always be enabled.  This tunable exists to hold pageout
 982  * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 983  * sleep each time it is woken by schedpaging().
 984  */
 985 uint_t dopageout = 1;
 986 
 987 /*
 988  * The page out daemon, which runs as process 2.
 989  *
 990  * Page out occurs when either:
 991  * a) there are fewer than lotsfree pages, or
 992  * b) one or more zones are over their physical memory cap.
 993  *
 994  * The daemon treats physical memory as a circular array of pages and scans the
 995  * pages using a 'two-handed clock' algorithm. The front hand moves through
 996  * the pages, clearing the reference bit. The back hand travels a distance
 997  * (handspreadpages) behind the front hand, freeing the pages that have not
 998  * been referenced in the time since the front hand passed. If modified, they
 999  * are first written to their backing store before being freed.
1000  *
1001  * In order to make page invalidation more responsive on machines with larger
1002  * memory, multiple pageout_scanner threads may be created. In this case, the
 1003  * threads are evenly distributed around the memory "clock face" so that
 1004  * memory can be reclaimed more quickly (with a single thread, large regions
 1005  * can go unexamined for some time, leading to lag which causes undesirable
 1006  * behavior such as htable stealing).
1007  *
 1008  * As long as there are at least lotsfree pages and no zones are over
 1009  * their cap, pageout_scanner threads are not run. When the threads are
1010  * running for case (a), all pages are considered for pageout. For case (b),
1011  * only pages belonging to a zone over its cap will be considered for pageout.
1012  *
1013  * There are multiple threads that act on behalf of the pageout process.
 1014  * A set of threads (pageout_scanner) scans pages and frees them up if
 1015  * they don't require any VOP_PUTPAGE operation. If a page must be
1016  * written back to its backing store, the request is put on a list
1017  * and the other (pageout) thread is signaled. The pageout thread
1018  * grabs VOP_PUTPAGE requests from the list, and processes them.
1019  * Some filesystems may require resources for the VOP_PUTPAGE
1020  * operations (like memory) and hence can block the pageout
1021  * thread, but the pageout_scanner threads can still operate. There is still
1022  * no guarantee that memory deadlocks cannot occur.
1023  *
1024  * The pageout_scanner parameters are determined in schedpaging().
1025  */
1026 void
1027 pageout()
1028 {
1029         struct async_reqs *arg;
1030         pri_t pageout_pri;
1031         int i;
1032         pgcnt_t max_pushes;
1033         callb_cpr_t cprinfo;
1034 
1035         proc_pageout = ttoproc(curthread);
1036         proc_pageout->p_cstime = 0;
1037         proc_pageout->p_stime =  0;
1038         proc_pageout->p_cutime =  0;
1039         proc_pageout->p_utime = 0;
1040         bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1041         bcopy("pageout", PTOU(curproc)->u_comm, 7);
1042 
1043         /*
1044          * Create pageout scanner thread
1045          */
1046         mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1047         mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1048 
1049         /*
1050          * Allocate and initialize the async request structures
1051          * for pageout.
1052          */
1053         push_req = (struct async_reqs *)
1054             kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1055 
1056         req_freelist = push_req;
1057         for (i = 0; i < async_list_size - 1; i++) {
1058                 push_req[i].a_next = &push_req[i + 1];
1059         }
1060 
1061         pageout_pri = curthread->t_pri;
1062 
1063         /* Create the (first) pageout scanner thread. */
1064         (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
1065             pageout_pri - 1);

1066 
1067         /*
1068          * kick off pageout scheduler.
1069          */
1070         schedpaging(NULL);
1071 
1072         /*
1073          * Create kernel cage thread.
1074          * The kernel cage thread is started under the pageout process
1075          * to take advantage of the less restricted page allocation
1076          * in page_create_throttle().
1077          */
1078         kcage_cageout_init();
1079 
1080         /*
1081          * Limit pushes to avoid saturating pageout devices.
1082          */
1083         max_pushes = maxpgio / SCHEDPAGING_HZ;
1084         CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1085 
1086         for (;;) {
1087                 mutex_enter(&push_lock);
1088 
1089                 while ((arg = push_list) == NULL || pushes > max_pushes) {
1090                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1091                         cv_wait(&push_cv, &push_lock);
1092                         pushes = 0;
1093                         CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1094                 }
1095                 push_list = arg->a_next;
1096                 arg->a_next = NULL;
1097                 pageout_pushing = true;
1098                 mutex_exit(&push_lock);
1099 
1100                 DTRACE_PROBE(pageout__push);

1101                 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1102                     arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1103                         pushes++;
1104                 }
1105 
1106                 /* vp held by checkpage() */
1107                 VN_RELE(arg->a_vp);
1108 
1109                 mutex_enter(&push_lock);
1110                 pageout_pushing = false;
1111                 pageout_pushcount++;
1112                 arg->a_next = req_freelist;  /* back on freelist */
1113                 req_freelist = arg;
1114                 push_list_size--;
1115                 mutex_exit(&push_lock);
1116         }
1117 }
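Note on the throttle in the loop above: max_pushes is maxpgio / SCHEDPAGING_HZ,
so VOP_PUTPAGE pushes are bounded per scheduling interval; pushes is zeroed
each time the thread sleeps on push_cv, and requests beyond the limit simply
wait on push_list until a later wakeup.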
1118 
1119 /*
1120  * Kernel thread that scans pages looking for ones to free
1121  */
1122 static void
1123 pageout_scanner(void *a)
1124 {
1125         struct page *fronthand, *backhand;
1126         uint_t laps, iter = 0;

1127         callb_cpr_t cprinfo;
1128         pgcnt_t nscan_cnt, nscan_limit;
1129         pgcnt_t pcount;
1130         uint_t inst = (uint_t)(uintptr_t)a;
1131         hrtime_t sample_start, sample_end;
1132         kmutex_t pscan_mutex;
1133         bool sampling;
1134 
1135         VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1136 
1137         mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);

1138 
1139         CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
1140         mutex_enter(&pscan_mutex);
1141 
1142         /*
1143          * Establish the minimum and maximum length of time to be spent
1144          * scanning pages per wakeup, limiting the scanner duty cycle.  The
1145          * input percentage values (0-100) must be converted to a fraction of
1146          * the number of nanoseconds in a second of wall time, then further
1147          * scaled down by the number of scanner wakeups in a second:
1148          */
1149         min_pageout_nsec = MAX(1,
1150             NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
1151         max_pageout_nsec = MAX(min_pageout_nsec,
1152             NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
1153 
1154 loop:
1155         cv_signal_pageout();
1156 


1157         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1158         cv_wait(&proc_pageout->p_cv, &pscan_mutex);
1159         CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);


1160 
1161         /*
1162          * Check if pageout has been disabled for debugging purposes:
1163          */
1164         if (!dopageout) {
1165                 goto loop;
1166         }
1167 
1168         /*
1169          * One may reset the clock hands for debugging purposes.  Hands will
1170          * also be reset if memory is added to or removed from the system.
1171          */
1172         if (reset_hands[inst]) {
1173                 struct page *first;
1174                 pgcnt_t offset = total_pages / n_page_scanners;
1175 
1176                 reset_hands[inst] = B_FALSE;

1177                 if (inst >= n_page_scanners) {
1178                         /*
1179                          * The desired number of page scanners has been
1180                          * reduced and this instance is no longer wanted.
1181                          * Exit the lwp.
1182                          */
1183                         VERIFY3U(inst, !=, 0);
1184                         mutex_exit(&pscan_mutex);
1185                         mutex_enter(&curproc->p_lock);
1186                         lwp_exit();

1187                 }
1188 


1189                 /*
 1190                  * The reset case repositions the hands at the proper place
 1191                  * on the memory clock face, preventing creep into another
 1192                  * thread's active region and handling changes in the number
 1193                  * of threads.
1194                  *
1195                  * Set the two clock hands to be separated by a reasonable
1196                  * amount, but no more than 360 degrees apart.
1197                  *
1198                  * If inst == 0, backhand starts at first page, otherwise
1199                  * it is (inst * offset) around the memory "clock face" so that
1200                  * we spread out each scanner instance evenly.
1201                  */
1202                 first = page_first();
1203                 backhand = page_nextn(first, offset * inst);
1204                 if (handspreadpages >= total_pages) {
1205                         fronthand = page_nextn(backhand, total_pages - 1);
 1206                 } else {
 1207                         fronthand = page_nextn(backhand, handspreadpages);
1208                 }
1209         }
1210 
1211         CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1212 
1213         /*
1214          * Keep track of the number of times we have scanned all the way around
1215          * the loop:
1216          */
1217         laps = 0;
1218 
1219         /*
1220          * Track the number of pages visited during this scan so that we can
1221          * periodically measure our duty cycle.
1222          */
1223         pcount = 0;
1224         nscan_cnt = 0;

1225 
1226         if (PAGE_SCAN_STARTUP) {


1227                 /*
1228                  * We need to measure the rate at which the system is able to
1229                  * scan pages of memory.  Each of these initial samples is a
1230                  * scan of all system memory, regardless of whether or not we
1231                  * are experiencing memory pressure.
1232                  */
1233                 nscan_limit = total_pages;
1234                 sampling = true;
1235         } else {
1236                 nscan_limit = desscan;
1237                 sampling = false;
1238         }
1239 
1240         DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
1241             page_t *, backhand, page_t *, fronthand);
1242 
1243         sample_start = gethrtime();
1244 
1245         /*
1246          * Scan the appropriate number of pages for a single duty cycle.
 1247          * Only scan while at least one of these is true:
 1248          * 1) one or more zones are over their cap
 1249          * 2) there is not enough free memory
 1250          * 3) we are sampling during page scan startup
1251          */
1252         while (nscan_cnt < nscan_limit) {
1253                 checkpage_result_t rvfront, rvback;
1254 
1255                 if (!sampling && !zones_over &&
1256                     freemem >= lotsfree + needfree) {
1257                         /*
1258                          * We are not sampling and enough memory has become
1259                          * available that scanning is no longer required.
1260                          */

1261                         break;
1262                 }
1263 
1264                 DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
1265 
1266                 /*
1267                  * Periodically check to see if we have exceeded the CPU duty
1268                  * cycle for a single wakeup.
1269                  */
1270                 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1271                         hrtime_t pageout_cycle_nsec;
1272 
1273                         pageout_cycle_nsec = gethrtime() - sample_start;
1274                         if (pageout_cycle_nsec >= pageout_nsec) {
1275                                 /*
1276                                  * This is where we normally break out of the
1277                                  * loop when scanning zones or sampling.
1278                                  */
1279                                 if (!zones_over) {
1280                                         atomic_inc_64(&pageout_timeouts);
1281                                 }
1282                                 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1283                                 break;
1284                         }
1285                 }
1286 
1287                 /*
1288                  * If checkpage manages to add a page to the free list,
1289                  * we give ourselves another couple of trips around the loop.
1290                  */
1291                 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1292                         laps = 0;
1293                 }
1294                 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1295                         laps = 0;
1296                 }
1297 
1298                 ++pcount;
1299 
1300                 /*
1301                  * This CPU kstat is only incremented here and we're obviously
1302                  * on this CPU, so no lock.
1303                  */
1304                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305 
1306                 /*
1307                  * Don't include ineligible pages in the number scanned.
1308                  */
1309                 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310                         nscan_cnt++;
1311                 }
1312 
1313                 backhand = page_next(backhand);
1314                 fronthand = page_next(fronthand);
1315 
1316                 /*
1317                  * The front hand has wrapped around to the first page in the
1318                  * loop.
1319                  */
1320                 if (fronthand == page_first())  {
1321                         DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);


1322 
1323                         /*
1324                          * Every 64 wraps we reposition our hands within our
1325                          * region to prevent creep into another thread.
1326                          */
1327                         if ((++iter % pageout_reset_cnt) == 0)
1328                                 reset_hands[inst] = B_TRUE;


1329 
1330                         /*
1331                          * This CPU kstat is only incremented here and we're
1332                          * obviously on this CPU, so no lock.
1333                          */
1334                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335 
1336                         /*
1337                          * If scanning because the system is low on memory,
1338                          * then when we wraparound memory we want to try to
1339                          * reclaim more pages.
1340                          * If scanning only because zones are over their cap,
1341                          * then wrapping is common and we simply keep going.
1342                          */
1343                         if (freemem < lotsfree + needfree && ++laps > 1) {
1344                                 /*
1345                                  * The system is low on memory.
1346                                  * Extremely unlikely, but it happens.
1347                                  * We went around the loop at least once
1348                                  * and didn't get far enough.
1349                                  * If we are still skipping `highly shared'
1350                                  * pages, skip fewer of them.  Otherwise,
1351                                  * give up till the next clock tick.
1352                                  */
1353                                 mutex_enter(&pageout_mutex);
1354                                 if (po_share < MAX_PO_SHARE) {
1355                                         po_share <<= 1;
1356                                         mutex_exit(&pageout_mutex);
1357                                 } else {
1358                                         mutex_exit(&pageout_mutex);
1359                                         break;
1360                                 }
1361                         }
1362                 }
1363         }
1364 

1365         atomic_add_long(&nscan, nscan_cnt);
1366 
1367         sample_end = gethrtime();

1368 
1369         DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370             uint_t, inst);
1371 
1372         /*
1373          * The following two blocks are only relevant when the scanner is
1374          * first started up. After the scanner runs for a while, neither of
1375          * the conditions will ever be true again.
1376          *
1377          * The global variables used below are only modified by this thread and
1378          * only during initial scanning when there is a single page scanner
1379          * thread running. Thus, we don't use any locking.
1380          */
1381         if (pageout_new_spread == 0) {
1382                 VERIFY3U(inst, ==, 0);

1383                 if (PAGE_SCAN_STARTUP) {
1384                         /*
1385                          * Continue accumulating samples until we have enough
1386                          * to get a reasonable value for average scan rate:
1387                          */
1388                         pageout_sample_pages += pcount;
1389                         pageout_sample_etime += sample_end - sample_start;
1390                         ++pageout_sample_cnt;
1391                 }
1392 
1393                 if (!PAGE_SCAN_STARTUP) {
1394                         /*
1395                          * We have enough samples, set the spread.
1396                          */
1397                         pageout_rate = (hrrate_t)pageout_sample_pages *
1398                             (hrrate_t)(NANOSEC) / pageout_sample_etime;
1399                         pageout_new_spread = pageout_rate / 10;
1400                         setupclock();
1401                 }
1402         }
1403 
1404         goto loop;
1405 }
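To summarize the startup sequence above: instance 0 alone performs the
pageout_sample_lim full-memory scans; once enough samples have accumulated it
computes pageout_rate and pageout_new_spread and calls setupclock() again,
after which schedpaging() can observe that des_page_scanners differs from
n_page_scanners and start (or retire) scanner threads.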
1406 


1520         }
1521 
1522         if (zones_over) {
1523                 ASSERT(pp->p_zoneid == ALL_ZONES ||
1524                     pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1525                 if (pp->p_zoneid == ALL_ZONES ||
1526                     zone_pdata[pp->p_zoneid].zpers_over == 0) {
1527                         /*
 1528                          * Cross-zone shared page, or zone not over its cap.
1529                          * Leave the page alone.
1530                          */
1531                         page_unlock(pp);
1532                         return (CKP_INELIGIBLE);
1533                 }
1534                 zid = pp->p_zoneid;
1535         }
1536 
1537         /*
1538          * Maintain statistics for what we are freeing
1539          */
1540 
1541         if (pp->p_vnode != NULL) {
1542                 if (pp->p_vnode->v_flag & VVMEXEC)
1543                         isexec = 1;
1544 
1545                 if (!IS_SWAPFSVP(pp->p_vnode))
1546                         isfs = 1;
1547         }
1548 
1549         /*
1550          * Turn off REF and MOD bits with the front hand.
1551          * The back hand examines the REF bit and always considers
1552          * SHARED pages as referenced.
1553          */
1554         if (whichhand == POH_FRONT) {
1555                 pagesync_flag = HAT_SYNC_ZERORM;
1556         } else {
1557                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1558                     HAT_SYNC_STOPON_SHARED;
1559         }
1560 




 225 pgcnt_t         lotsfree_min = 0;
 226 pgcnt_t         lotsfree_max = 0;
 227 
 228 #define         LOTSFREE_MIN_DEFAULT    (16 * MEGABYTES)
 229 #define         LOTSFREE_MAX_DEFAULT    (2048 * MEGABYTES)
 230 
 231 /*
 232  * If these tunables are set to non-zero values in /etc/system, and provided
 233  * the value is not larger than the threshold above, the specified value will
 234  * be used directly without any additional calculation or adjustment.  The boot
 235  * time value of these overrides is preserved in the "clockinit" struct.  More
 236  * detail is available in the comment at the top of the file.
 237  */
 238 pgcnt_t         maxpgio = 0;
 239 pgcnt_t         minfree = 0;
 240 pgcnt_t         desfree = 0;
 241 pgcnt_t         lotsfree = 0;
 242 pgcnt_t         needfree = 0;
 243 pgcnt_t         throttlefree = 0;
 244 pgcnt_t         pageout_reserve = 0;
 245 pri_t           pageout_pri;
 246 
 247 pgcnt_t         deficit;
 248 pgcnt_t         nscan;
 249 pgcnt_t         desscan;
 250 
 251 /* kstats */
 252 uint64_t low_mem_scan;
 253 uint64_t zone_cap_scan;

 254 
 255 #define MAX_PSCAN_THREADS       16
 256 
 257 /*
 258  * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
 259  * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
 260  * that gives the equivalent of some underlying %CPU duty cycle.
 261  *
 262  * min_pageout_nsec:
 263  *     nanoseconds/wakeup equivalent of min_percent_cpu.
 264  *
 265  * max_pageout_nsec:
 266  *     nanoseconds/wakeup equivalent of max_percent_cpu.
 267  *
 268  * pageout_nsec:
 269  *     Number of nanoseconds budgeted for each wakeup cycle.
 270  *     Computed each time around by schedpaging().
 271  *     Varies between min_pageout_nsec and max_pageout_nsec,
 272  *     depending on memory pressure or zones over their cap.
 273  *
 274  * zone_pageout_nsec:
 275  *      Number of nanoseconds budgeted for each cycle when a zone
 276  *      is over its memory cap. If this is zero, then the value
 277  *      of max_pageout_nsec is used instead.
 278  */

 279 static hrtime_t min_pageout_nsec;
 280 static hrtime_t max_pageout_nsec;
 281 static hrtime_t pageout_nsec;
 282 static hrtime_t zone_pageout_nsec;
 283 

 284 static boolean_t        reset_hands[MAX_PSCAN_THREADS];
 285 
 286 #define PAGES_POLL_MASK 1023
 287 #define SCHEDPAGING_HZ  4
 288 
 289 /*
 290  * despagescanners:
 291  *      The desired number of page scanner threads. The value can be set in
 292  *      /etc/system or tuned directly with 'mdb -kw'.  The system will bring
 293  *      the actual number of threads into line with the desired number. If set
 294  *      to an invalid value, the system will correct the setting.
 295  */
 296 uint_t despagescanners = 0;

 297 
 298 /*
 299  * pageout_sample_lim:
 300  *     The limit on the number of samples needed to establish a value for new
 301  *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 302  *     handspreadpages.
 303  *
 304  * pageout_sample_cnt:
 305  *     Current sample number.  Once the sample gets large enough, set new
 306  *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 307  *
 308  * pageout_sample_pages:
 309  *     The accumulated number of pages scanned during sampling.
 310  *
 311  * pageout_sample_etime:
 312  *     The accumulated nanoseconds for the sample.
 313  *
 314  * pageout_rate:
 315  *     Rate in pages/nanosecond, computed at the end of sampling.
 316  *
 317  * pageout_new_spread:
 318  *     Initially zero while the system scan rate is measured by
 319  *     pageout_scanner(), which then sets this value once per system boot after
 320  *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 321  *     new value is used for fastscan and handspreadpages.
 322  */

 323 typedef hrtime_t hrrate_t;
 324 
 325 static uint64_t pageout_sample_lim = 4;
 326 static uint64_t pageout_sample_cnt = 0;
 327 static pgcnt_t  pageout_sample_pages = 0;
 328 static hrtime_t pageout_sample_etime = 0;
 329 static hrrate_t pageout_rate = 0;
 330 static pgcnt_t  pageout_new_spread = 0;
 331 
 332 /* True if the page scanner is first starting up */
 333 #define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
 334 
 335 /* The current number of page scanner threads */
 336 static uint_t n_page_scanners = 1;
 337 /* The number of page scanner threads that are actively scanning. */
 338 static uint_t pageouts_running;
 339 
 340 /*
 341  * Record number of times a pageout_scanner() wakeup cycle finished because it
 342  * timed out (exceeded its CPU budget), rather than because it visited
 343  * its budgeted number of pages. This is only done when scanning under low
 344  * free memory conditions, not when scanning for zones over their cap.
 345  */
 346 uint64_t        pageout_timeouts = 0;
 347 
 348 #ifdef VM_STATS
 349 static struct pageoutvmstats_str {
 350         ulong_t checkpage[3];
 351 } pageoutvmstats;
 352 #endif /* VM_STATS */
 353 
 354 /*
 355  * Threads waiting for free memory use this condition variable and lock until
 356  * memory becomes available.
 357  */
 358 kmutex_t        memavail_lock;
 359 kcondvar_t      memavail_cv;


 368         CKP_NOT_FREED,
 369         CKP_FREED,
 370 } checkpage_result_t;
 371 
 372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
 373 
 374 static struct clockinit {
 375         bool ci_init;
 376         pgcnt_t ci_lotsfree_min;
 377         pgcnt_t ci_lotsfree_max;
 378         pgcnt_t ci_lotsfree;
 379         pgcnt_t ci_desfree;
 380         pgcnt_t ci_minfree;
 381         pgcnt_t ci_throttlefree;
 382         pgcnt_t ci_pageout_reserve;
 383         pgcnt_t ci_maxpgio;
 384         pgcnt_t ci_maxfastscan;
 385         pgcnt_t ci_fastscan;
 386         pgcnt_t ci_slowscan;
 387         pgcnt_t ci_handspreadpages;
 388         uint_t  ci_despagescanners;
 389 } clockinit = { .ci_init = false };
 390 
 391 static inline pgcnt_t
 392 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
 393 {
 394         if (value < minimum) {
 395                 return (minimum);
 396         } else if (value > maximum) {
 397                 return (maximum);
 398         } else {
 399                 return (value);
 400         }
 401 }
 402 
 403 static pgcnt_t
 404 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
 405 {
 406         if (initval == 0 || initval >= initval_ceiling) {
 407                 return (defval);
 408         } else {
 409                 return (initval);
 410         }
 411 }
 412 
 413 /*
 414  * Local boolean to control scanning when zones are over their cap. It
 415  * confines our reads of the zone_num_over_cap variable to schedpaging(),
 416  * which only runs periodically; zone_num_over_cap is already accessed a lot
 417  * during paging, and the page scanner checks the zones_over variable on each
 418  * page during a scan. There is no lock needed for zone_num_over_cap since
 419  * schedpaging() doesn't modify the variable; it only cares whether the
 420  * variable is 0 or non-0.
 421  */
 422 static boolean_t zones_over = B_FALSE;
 423 
 424 /*
 425  * On large memory systems, multiple instances of the page scanner are run,
 426  * each responsible for a separate region of memory. This speeds up page
 427  * invalidation under low memory conditions.
 428  *
 429  * despagescanners can be set in /etc/system or via mdb and it will
 430  * be used as a guide for how many page scanners to create; the value
 431  * will be adjusted if it is not sensible. Otherwise, the number of
 432  * page scanners is determined dynamically based on handspreadpages.
 433  */
 434 static void
 435 recalc_pagescanners(void)
 436 {
 437         pgcnt_t sz;
 438         uint_t des;
 439 
 440         /* If the initial calibration has not been done, take no action. */
 441         if (pageout_new_spread == 0)
 442                 return;
 443 
 444         /*
 445          * If the desired number of scanners is set in /etc/system
 446          * then try to use it.
 447          */
 448         if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
 449                 despagescanners = clockinit.ci_despagescanners;
 450 
 451         if (despagescanners != 0) {
 452                 /*
 453                  * We have a desired number of page scanners, either from
 454                  * /etc/system or set via mdb. Try and use it (it will be
 455                  * clamped below).
 456                  */
 457                 des = despagescanners;
 458         } else {
 459                 /*
 460                  * Calculate the number of desired scanners based on the
 461                  * system's memory size.
 462                  *
 463                  * A 64GiB region size is used as the basis for calculating how
 464                  * many scanner threads should be created. For systems with up
 465                  * to 64GiB of RAM, a single thread is used; for very large
 466                  * memory systems the threads are limited to MAX_PSCAN_THREADS.
 467                  */
 468                 sz = btop(64ULL << 30);
 469 
 470                 if (sz > looppages) {
 471                         des = 1;
 472                 } else {
 473                         pgcnt_t tmp = sz;
 474 
 475                         for (des = 1; tmp < looppages; des++)
 476                                 tmp += sz;
 477                 }
 478         }
 479 
 480         /*
 481          * Clamp the number of scanners so that we do not exceed
 482          * MAX_PSCAN_THREADS and so that each scanner's region is at least
 483          * 10% larger than handspreadpages.
 484          */
 485         des = clamp(des, 1,
 486             looppages / (handspreadpages + handspreadpages / 10));
 487         despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
 488 }
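As a worked check of the sizing loop above, assuming 4 KiB pages: on a 256 GiB
system, looppages is btop(256 GiB) and sz is btop(64 GiB), so tmp takes the
values 64, 128, 192 and 256 GiB worth of pages and des ends at 4; the clamps
that follow then bound the result by the per-scanner region constraint and by
MAX_PSCAN_THREADS.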
 489 
 490 /*
 491  * Set up the paging constants for the clock algorithm used by
 492  * pageout_scanner(), and by the virtual memory system overall.  See the
 493  * comments at the top of this file for more information about the threshold
 494  * values and system responses to memory pressure.
 495  *
 496  * This routine is called once by main() at startup, after the initial size of
 497  * physical memory is determined.  It may be called again later if memory is
 498  * added to or removed from the system, or if new measurements of the page scan
 499  * rate become available.
 500  */
 501 void
 502 setupclock(void)
 503 {
 504         bool half = (pageout_threshold_style == 1);
 505         bool recalc = true;
 506 
 507         looppages = total_pages;
 508 
 509         /*
 510          * The operator may have provided specific values for some of the
 511          * tunables via /etc/system.  On our first call, we preserve those
 512          * values so that they can be used for subsequent recalculations.
 513          *
 514          * A value of zero for any tunable means we will use the default
 515          * sizing.
 516          */
 517         if (!clockinit.ci_init) {
 518                 clockinit.ci_init = true;
 519 
 520                 clockinit.ci_lotsfree_min = lotsfree_min;
 521                 clockinit.ci_lotsfree_max = lotsfree_max;
 522                 clockinit.ci_lotsfree = lotsfree;
 523                 clockinit.ci_desfree = desfree;
 524                 clockinit.ci_minfree = minfree;
 525                 clockinit.ci_throttlefree = throttlefree;
 526                 clockinit.ci_pageout_reserve = pageout_reserve;
 527                 clockinit.ci_maxpgio = maxpgio;
 528                 clockinit.ci_maxfastscan = maxfastscan;
 529                 clockinit.ci_fastscan = fastscan;
 530                 clockinit.ci_slowscan = slowscan;
 531                 clockinit.ci_handspreadpages = handspreadpages;
 532                 clockinit.ci_despagescanners = despagescanners;
 533 
 534                 /*
 535                  * The first call does not trigger a recalculation, only
 536                  * subsequent calls.
 537                  */
 538                 recalc = false;
 539         }
 540 
 541         /*
 542          * Configure paging threshold values.  For more details on what each
 543          * threshold signifies, see the comments at the top of this file.
 544          */
 545         lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
 546             btop(LOTSFREE_MAX_DEFAULT));
 547         lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
 548             btop(LOTSFREE_MIN_DEFAULT));
 549 
 550         lotsfree = tune(clockinit.ci_lotsfree, looppages,
 551             clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
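             /*
              * Worked example (illustrative; assuming 4KiB pages and a
              * lotsfree_fraction of 64): a 256GiB system has looppages =
              * 67108864, and looppages / lotsfree_fraction = 1048576 pages
              * (4GiB).  That exceeds lotsfree_max (btop(2048 * MEGABYTES) =
              * 524288 pages), so lotsfree is clamped to 2GiB rather than
              * growing without bound at 1/64 of memory.
              */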
 552 
 694 
 695         if (fastscan > looppages / loopfraction) {
 696                 fastscan = looppages / loopfraction;
 697         }
 698 
 699         /*
 700          * Set slow scan time to 1/10 the fast scan time, but
 701          * not to exceed maxslowscan.
 702          */
 703         if (clockinit.ci_slowscan == 0) {
 704                 slowscan = MIN(fastscan / 10, maxslowscan);
 705         } else {
 706                 slowscan = clockinit.ci_slowscan;
 707         }
 708 
 709         if (slowscan > fastscan / 2) {
 710                 slowscan = fastscan / 2;
 711         }
 712 
 713         /*
 714          * Handspreadpages is the distance (in pages) between front and back
 715          * pageout daemon hands.  The amount of time to reclaim a page
 716          * once pageout examines it increases with this distance and
 717          * decreases as the scan rate rises. It must be < the amount
 718          * of pageable memory.
 719          *
 720          * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 721          * to be "fastscan" results in the front hand being a few secs
 722          * (varies based on the processor speed) ahead of the back hand
 723          * at fastscan rates.  This distance can be further reduced, if
 724          * necessary, by increasing the processor time used by pageout
 725          * to be more than ~4% and preferably not more than ~10%.
 726          *
 727          * As a result, user processes have a much better chance of
 728          * referencing their pages before the back hand examines them.
 729          * This also significantly lowers the number of reclaims from
 730          * the freelist since pageout does not end up freeing pages which
 731          * may be referenced a sec later.
 732          */
 733         if (clockinit.ci_handspreadpages == 0) {
 734                 handspreadpages = fastscan;
 735         } else {
 736                 handspreadpages = clockinit.ci_handspreadpages;
 737         }
 738 
 739         /*
 740          * Make sure that back hand follows front hand by at least
 741          * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
 742          * back hand to look at a page during the same wakeup of the pageout
 743          * daemon in which the front hand cleared its ref bit.
 744          */
 745         if (handspreadpages >= looppages) {
 746                 handspreadpages = looppages - 1;
 747         }
 748 
 749         /*
 750          * Establish the minimum and maximum length of time to be spent
 751          * scanning pages per wakeup, limiting the scanner duty cycle.  The
 752          * input percentage values (0-100) must be converted to a fraction of
 753          * the number of nanoseconds in a second of wall time, then further
 754          * scaled down by the number of scanner wakeups in a second.
 755          */
 756         min_pageout_nsec = MAX(1,
 757             NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
 758         max_pageout_nsec = MAX(min_pageout_nsec,
 759             NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
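             /*
              * Worked example (illustrative; assuming min_percent_cpu = 4,
              * max_percent_cpu = 80 and SCHEDPAGING_HZ = 4, i.e. a 250ms
              * wakeup interval):
              *
              *     min_pageout_nsec = 10^9 * 4 / 100 / 4  = 10,000,000ns
              *     max_pageout_nsec = 10^9 * 80 / 100 / 4 = 200,000,000ns
              *
              * i.e. between 10ms (4%) and 200ms (80%) of scanning per
              * 250ms cycle.
              */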
 760 
 761         /*
 762          * If not called for recalculation, return and skip the remaining
 763          * steps.
 764          */
 765         if (!recalc)
 766                 return;
 767 
 768         /*
 769          * Set a flag to re-evaluate the clock hand positions.
 770          */
 771         for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
 772                 reset_hands[i] = B_TRUE;
 773 
 774         recalc_pagescanners();
 775 }
 776 
 777 /*
 778  * Pageout scheduling.
 779  *
 780  * Schedpaging controls the rate at which the page out daemon runs by
 781  * setting the global variables nscan and desscan SCHEDPAGING_HZ
 782  * times a second.  Nscan records the number of pages pageout has examined
 783  * in its current pass; schedpaging() resets this value to zero each time
 784  * it runs.  Desscan records the number of pages pageout should examine
 785  * in its next pass; schedpaging() sets this value based on the amount of
 786  * currently available memory.
 787  */
 788 
 789 static kmutex_t pageout_mutex;
 790 
 791 /*
 792  * Pool of available async pageout putpage requests.
 793  */
 794 static struct async_reqs *push_req;
 795 static struct async_reqs *req_freelist; /* available req structs */
 796 static struct async_reqs *push_list;    /* pending reqs */
 797 static kmutex_t push_lock;              /* protects req pool */
 798 static kcondvar_t push_cv;
 799 
 800 /*
 801  * If pageout() is stuck on a single push for this many seconds,
 802  * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 803  * to 0, the deadman will have no effect.
 804  *
 805  * Note that we are only looking for stalls in the calls that pageout() makes
 806  * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 807  * I/O, which should not take long unless the underlying strategy call blocks
 808  * indefinitely for memory.  The actual I/O request happens (or fails) later.
 809  */
 810 uint_t pageout_deadman_seconds = 90;
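     /*
      * For example, an operator who expects long but legitimate VOP_PUTPAGE
      * stalls could disable the deadman with the following illustrative line
      * in /etc/system:
      *
      *     set pageout_deadman_seconds = 0
      */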
 811 
 812 static uint_t pageout_stucktime = 0;
 813 static bool pageout_pushing = false;
 814 static uint64_t pageout_pushcount = 0;
 815 static uint64_t pageout_pushcount_seen = 0;
 816 
 817 static int async_list_size = 8192;      /* number of async request structs */
 818 
 819 static void pageout_scanner(void *);
 820 
 821 /*
 822  * If a page is being shared more than "po_share" times
 823  * then leave it alone- don't page it out.
 824  */
 825 #define MIN_PO_SHARE    (8)
 826 #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
 827 ulong_t po_share = MIN_PO_SHARE;
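     /*
      * The scanner doubles po_share (up to MAX_PO_SHARE) when a full lap
      * around its region fails to reclaim enough memory, and schedpaging()
      * halves it again once free pages are plentiful; see the adjustments
      * further below.
      */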
 828 
 829 /*
 830  * Schedule rate for paging.
 831  * The rate is a linear interpolation: slowscan when lotsfree pages are
 832  * free, rising to fastscan when memory is exhausted.
 833  */
 834 static void
 835 schedpaging(void *arg)
 836 {
 837         spgcnt_t vavail;
 838 
 839         if (freemem < lotsfree + needfree + kmem_reapahead)
 840                 kmem_reap();
 841 
 842         if (freemem < lotsfree + needfree)
 843                 seg_preap();
 844 
 845         if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 846                 kcage_cageout_wakeup();
 847 
 848         if (mutex_tryenter(&pageout_mutex)) {
 849 
 850                 if (pageouts_running != 0)
 851                         goto out;
 852 
 853                 /* No pageout scanner threads running. */
 854                 nscan = 0;
 855                 vavail = freemem - deficit;
 856                 if (pageout_new_spread != 0)
 857                         vavail -= needfree;
 858                 vavail = clamp(vavail, 0, lotsfree);
 859 
 860                 if (needfree > 0 && pageout_new_spread == 0) {
 861                         /*
 862                          * If we've not yet collected enough samples to
 863                          * calculate a spread, use the old logic of kicking
 864                          * into high gear anytime needfree is non-zero.
 865                          */
 866                         desscan = fastscan / SCHEDPAGING_HZ;
 867                 } else {
 868                         /*
 869                          * Once we've calculated a spread based on system
 870                          * memory and usage, just treat needfree as another
 871                          * form of deficit.
 872                          */
 873                         spgcnt_t faststmp, slowstmp, result;
 874 
 875                         slowstmp = slowscan * vavail;
 876                         faststmp = fastscan * (lotsfree - vavail);
 877                         result = (slowstmp + faststmp) /
 878                             nz(lotsfree) / SCHEDPAGING_HZ;
 879                         desscan = (pgcnt_t)result;
 880                 }
 881 
 882                 pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
 883                     (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
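                     /*
                      * Illustrative numbers, continuing the worked examples
                      * above: with lotsfree = 524288 pages, min/max duty
                      * cycles of 10ms/200ms, and vavail at exactly half of
                      * lotsfree, desscan lands midway between slowscan and
                      * fastscan (scaled by 1/SCHEDPAGING_HZ), and
                      * pageout_nsec = 10ms + 262144 * 190ms / 524288 = 105ms.
                      */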
 884 
 885                 DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
 886                     pageout_nsec);
 887 
 888                 if (pageout_new_spread != 0 && despagescanners != 0 &&
 889                     despagescanners != n_page_scanners) {
 890                         /*
 891                          * We have finished the pagescan initialisation and the
 892                          * desired number of page scanners has changed, either
 893                          * because initialisation just finished, because of a
 894                          * memory DR, or because despagescanners has been
 895                          * modified on the fly (e.g. with mdb).
 896                          */
 897                         uint_t i, curr_nscan = n_page_scanners;
 898 
 899                         /* Re-validate despagescanners */
 900                         recalc_pagescanners();
 901 
 902                         n_page_scanners = despagescanners;
 903 
 904                         for (i = 0; i < MAX_PSCAN_THREADS; i++)
 905                                 reset_hands[i] = B_TRUE;
 906 
 907                         /* If we need more scanners, start them now. */
 908                         if (n_page_scanners > curr_nscan) {
 909                                 for (i = curr_nscan; i < n_page_scanners; i++) {
 910                                         (void) lwp_kernel_create(proc_pageout,
 911                                             pageout_scanner,
 912                                             (void *)(uintptr_t)i, TS_RUN,
 913                                             pageout_pri);
 914                                 }
 915                         }
 916 
 917                         /*
 918                          * If the number of scanners has decreased, trigger a
 919                          * wakeup so that the excess threads will terminate.
 920                          */
 921                         if (n_page_scanners < curr_nscan) {
 922                                 WAKE_PAGEOUT_SCANNER();
 923                         }
 924                 }
 925 
 926                 zones_over = B_FALSE;
 927 
 928                 if (PAGE_SCAN_STARTUP) {
 929                         /*
 930                          * We still need to measure the rate at which the
 931                          * system is able to scan pages of memory. Each of
 932                          * these initial samples is a scan of as much system
 933                          * memory as practical, regardless of whether or not we
 934                          * are experiencing memory pressure.
 935                          */
 936                         desscan = total_pages;
 937                         pageout_nsec = max_pageout_nsec;
 938 
 939                         DTRACE_PROBE(schedpage__wake__sample);
 940                         WAKE_PAGEOUT_SCANNER();
 941                 } else if (freemem < lotsfree + needfree) {
 942                         /*
 943                          * We need more memory.
 944                          */
 945                         low_mem_scan++;
 946 
 947                         DTRACE_PROBE(schedpage__wake__low);
 948                         WAKE_PAGEOUT_SCANNER();
 949                 } else if (zone_num_over_cap > 0) {
 950                         /*
 951                          * One or more zones is over its cap.
 952                          */
 953 
 954                         /* No page limit */
 955                         desscan = total_pages;
 956 
 957                         /*
 958                          * Increase the scanning CPU% to the max. This implies
 959                          * 80% of one CPU/sec if the scanner can run each
 960                          * opportunity. Can also be tuned via setting
 961                          * zone_pageout_nsec in /etc/system or with mdb.
 962                          */
 963                         pageout_nsec = (zone_pageout_nsec != 0) ?
 964                            zone_pageout_nsec : max_pageout_nsec;
 965 
 966                         zones_over = B_TRUE;
 967                         zone_cap_scan++;
 968 
 969                         DTRACE_PROBE(schedpage__wake__zone);
 970                         WAKE_PAGEOUT_SCANNER();
 971                 } else {
 972                         /*
 973                          * There are enough free pages, no need to
 974                          * kick the scanner thread.  And next time
 975                          * around, keep more of the `highly shared'
 976                          * pages.
 977                          */
 978                         cv_signal_pageout();
 979                         if (po_share > MIN_PO_SHARE) {
 980                                 po_share >>= 1;
 981                         }
 982                 }
 983 out:
 984                 mutex_exit(&pageout_mutex);
 985         }
 986 
 987         /*
 988          * Signal threads waiting for available memory.
 989          * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 990          * in this case it is not needed - the waiters will be woken up during
 991          * the next invocation of this function.
 992          */
 993         if (kmem_avail() > 0)
 994                 cv_broadcast(&memavail_cv);
 995 
 996         (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
 997 }
 998 
 999 pgcnt_t         pushes;
1000 ulong_t         push_list_size;         /* # of requests on pageout queue */
1001 
1002 /*
1003  * Paging out should always be enabled.  This tunable exists to hold pageout
1004  * for debugging purposes.  If set to 0, pageout_scanner() will go back to
1005  * sleep each time it is woken by schedpaging().
1006  */
1007 uint_t dopageout = 1;
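     /*
      * For example, one could park the scanners on a live system for
      * debugging with the following illustrative mdb invocation (use with
      * care):
      *
      *     echo 'dopageout/W 0' | mdb -kw
      */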
1008 
1009 /*
1010  * The page out daemon, which runs as process 2.
1011  *
1012  * The daemon treats physical memory as a circular array of pages and scans
1013  * the pages using a 'two-handed clock' algorithm. The front hand moves
1014  * through the pages, clearing the reference bit. The back hand travels a
1015  * distance (handspreadpages) behind the front hand, freeing the pages that
1016  * have not been referenced in the time since the front hand passed. If
1017  * modified, they are first written to their backing store before being
1018  * freed.
1019  *
1020  * In order to make page invalidation more responsive on machines with
1021  * larger memory, multiple pageout_scanner threads may be created. In this
1022  * case, each thread is given a segment of the memory "clock face" so that
1023  * memory can be reclaimed more quickly.
1024  *
 1025  * As long as there are at least lotsfree pages and no zones over their
 1026  * cap, the pageout_scanner threads do not run. When the threads run
 1027  * because memory is low (case (a)), all pages are considered for
 1028  * pageout. When they run only because one or more zones is over its cap
 1029  * (case (b)), only pages belonging to such a zone are considered.
1030  *
1031  * There are multiple threads that act on behalf of the pageout process. A
1032  * set of threads scan pages (pageout_scanner) and frees them up if they
1033  * don't require any VOP_PUTPAGE operation. If a page must be written back
1034  * to its backing store, the request is put on a list and the other
1035  * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
1036  * requests from the list, and processes them. Some filesystems may require
1037  * resources for the VOP_PUTPAGE operations (like memory) and hence can
1038  * block the pageout thread, but the scanner thread can still operate.
1039  * There is still no guarantee that memory deadlocks cannot occur.
1040  */
1041 void
1042 pageout()
1043 {
1044         struct async_reqs *arg;
1045         int i;
1046         pgcnt_t max_pushes;
1047         callb_cpr_t cprinfo;
1048 
1049         proc_pageout = ttoproc(curthread);
1050         proc_pageout->p_cstime = 0;
1051         proc_pageout->p_stime =  0;
1052         proc_pageout->p_cutime =  0;
1053         proc_pageout->p_utime = 0;
1054         bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1055         bcopy("pageout", PTOU(curproc)->u_comm, 7);
1056 
1057         /*
1058          * Create pageout scanner thread
1059          */
1060         mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1061         mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1062 
1063         /*
1064          * Allocate and initialize the async request structures
1065          * for pageout.
1066          */
1067         push_req = (struct async_reqs *)
1068             kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1069 
1070         req_freelist = push_req;
1071         for (i = 0; i < async_list_size - 1; i++) {
1072                 push_req[i].a_next = &push_req[i + 1];
1073         }
1074 
1075         pageout_pri = curthread->t_pri - 1;
1076 
1077         /* Create the first pageout scanner thread. */
1078         (void) lwp_kernel_create(proc_pageout, pageout_scanner,
1079             (void *)0,  /* this is instance 0, not NULL */
1080             TS_RUN, pageout_pri);
1081 
1082         /*
1083          * kick off pageout scheduler.
1084          */
1085         schedpaging(NULL);
1086 
1087         /*
1088          * Create kernel cage thread.
1089          * The kernel cage thread is started under the pageout process
1090          * to take advantage of the less restricted page allocation
1091          * in page_create_throttle().
1092          */
1093         kcage_cageout_init();
1094 
1095         /*
1096          * Limit pushes to avoid saturating pageout devices.
1097          */
1098         max_pushes = maxpgio / SCHEDPAGING_HZ;
1099         CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1100 
1101         for (;;) {
1102                 mutex_enter(&push_lock);
1103 
1104                 while ((arg = push_list) == NULL || pushes > max_pushes) {
1105                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1106                         cv_wait(&push_cv, &push_lock);
1107                         pushes = 0;
1108                         CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1109                 }
1110                 push_list = arg->a_next;
1111                 arg->a_next = NULL;
1112                 pageout_pushing = true;
1113                 mutex_exit(&push_lock);
1114 
1115                 DTRACE_PROBE(pageout__push);
1116 
1117                 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1118                     arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1119                         pushes++;
1120                 }
1121 
1122                 /* vp held by checkpage() */
1123                 VN_RELE(arg->a_vp);
1124 
1125                 mutex_enter(&push_lock);
1126                 pageout_pushing = false;
1127                 pageout_pushcount++;
1128                 arg->a_next = req_freelist;  /* back on freelist */
1129                 req_freelist = arg;
1130                 push_list_size--;
1131                 mutex_exit(&push_lock);
1132         }
1133 }
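
The loop above is one half of a multi-producer, single-consumer work queue:
the scanner side (via checkpage()) takes request structs off req_freelist,
queues them on push_list and signals push_cv, while pageout() dequeues and
recycles them under push_lock. The sketch below is a minimal userland
analogue of that handoff using pthreads in place of kernel mutexes and
condvars; all names and types are hypothetical, and it illustrates the
pattern rather than the kernel implementation.

#include <pthread.h>
#include <stddef.h>

struct req {
        struct req *r_next;
        /* payload (vnode, offset, length, flags) elided */
};

static struct req *freelist;            /* available request structs */
static struct req *pending;             /* queued requests */
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;

/* Producer: move a struct from the freelist to the pending list. */
static int
req_enqueue(void)
{
        struct req *r;

        pthread_mutex_lock(&qlock);
        if ((r = freelist) == NULL) {
                /* Pool exhausted; the caller must handle the page itself. */
                pthread_mutex_unlock(&qlock);
                return (-1);
        }
        freelist = r->r_next;
        r->r_next = pending;
        pending = r;
        pthread_cond_signal(&qcv);
        pthread_mutex_unlock(&qlock);
        return (0);
}

/* Consumer: block for work, process it, then recycle the struct. */
static void *
req_consumer(void *arg)
{
        struct req *r;

        for (;;) {
                pthread_mutex_lock(&qlock);
                while ((r = pending) == NULL)
                        pthread_cond_wait(&qcv, &qlock);
                pending = r->r_next;
                pthread_mutex_unlock(&qlock);

                /* ... issue the asynchronous pageout I/O here ... */

                pthread_mutex_lock(&qlock);
                r->r_next = freelist;   /* back on the freelist */
                freelist = r;
                pthread_mutex_unlock(&qlock);
        }
        return (arg);
}

A fixed, preallocated pool (async_list_size structs in the kernel code)
bounds the queue's own memory use, which matters in a path that runs
precisely because memory is scarce.
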
1134 
1135 /*
1136  * Kernel thread that scans pages looking for ones to free
1137  */
1138 static void
1139 pageout_scanner(void *a)
1140 {
1141         struct page *fronthand, *backhand, *fronthandstart;
1142         struct page *regionstart, *regionend;
1143         uint_t laps;
1144         callb_cpr_t cprinfo;
1145         pgcnt_t nscan_cnt, tick;
1146         pgcnt_t pcount;
1147         bool bhwrapping, fhwrapping;
1148         hrtime_t sample_start, sample_end;
1149         uint_t inst = (uint_t)(uintptr_t)a;
1150 
1151         VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1152 
1153         CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
1154         mutex_enter(&pageout_mutex);
1155 
1156         /*
1157          * The restart case does not attempt to point the hands at roughly
1158          * the right point on the assumption that after one circuit things
1159          * will have settled down, and restarts shouldn't be that often.
1160          */
1161         reset_hands[inst] = B_TRUE;
1162 
1163         pageouts_running++;
1164         mutex_exit(&pageout_mutex);
1165 
1166 loop:
1167         cv_signal_pageout();
1168 
1169         mutex_enter(&pageout_mutex);
1170         pageouts_running--;
1171         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1172         cv_wait(&proc_pageout->p_cv, &pageout_mutex);
1173         CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
1174         pageouts_running++;
1175         mutex_exit(&pageout_mutex);
1176 
1177         /*
1178          * Check if pageout has been disabled for debugging purposes.
1179          */
1180         if (!dopageout) {
1181                 goto loop;
1182         }
1183 
1184         /*
1185          * One may reset the clock hands and scanned region for debugging
1186          * purposes. Hands will also be reset on first thread startup, if
1187          * the number of scanning threads (n_page_scanners) changes, or if
1188          * memory is added to, or removed from, the system.
1189          */
1190         if (reset_hands[inst]) {
1191                 struct page *first;
1192 
1193                 reset_hands[inst] = B_FALSE;
1194 
1195                 if (inst >= n_page_scanners) {
 1196                         /*
 1197                          * The desired number of page scanners has been
 1198                          * reduced and this instance is no longer wanted.
 1199                          * Exit the lwp.
 1200                          */
1201                         VERIFY3U(inst, !=, 0);
1202                         DTRACE_PROBE1(pageout__exit, uint_t, inst);
1203                         mutex_enter(&pageout_mutex);
1204                         pageouts_running--;
1205                         mutex_exit(&pageout_mutex);
1206                         mutex_enter(&curproc->p_lock);
1207                         lwp_exit();
1208                         /* NOTREACHED */
1209                 }
1210 
1211                 first = page_first();
1212 
1213                 /*
1214                  * Each scanner thread gets its own sector of the memory
1215                  * clock face.
1216                  */
1217                 pgcnt_t span, offset;
1218 
1219                 span = looppages / n_page_scanners;
1220                 VERIFY3U(span, >, handspreadpages);
1221 
1222                 offset = inst * span;
1223                 regionstart = page_nextn(first, offset);
1224                 if (inst == n_page_scanners - 1) {
1225                         /* The last instance goes up to the last page */
1226                         regionend = page_nextn(first, looppages - 1);
1227                 } else {
1228                         regionend = page_nextn(regionstart, span - 1);
1229                 }
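                     /*
                      * Illustrative example: with looppages = 33554432
                      * (128GiB of 4KiB pages) and n_page_scanners = 2,
                      * span = 16777216; instance 0 covers offsets
                      * [0, 16777215] from the first page and instance 1
                      * covers [16777216, 33554431].
                      */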
1230 
1231                 backhand = regionstart;
1232                 fronthand = page_nextn(backhand, handspreadpages);
1233                 tick = 1;
1234 
1235                 bhwrapping = fhwrapping = B_FALSE;
1236 
1237                 DTRACE_PROBE4(pageout__reset, uint_t, inst,
1238                     pgcnt_t, regionstart, pgcnt_t, regionend,
1239                     pgcnt_t, fronthand);
1240         }
1241 
1242         /*
1243          * This CPU kstat is only incremented here and we're obviously
1244          * on this CPU, so no lock.
1245          */
1246         CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1247 
1248         /*
1249          * Keep track of the number of times we have scanned all the way around
1250          * the loop on this wakeup.
1251          */
1252         laps = 0;
1253 
1254         /*
1255          * Track the number of pages visited during this scan so that we can
1256          * periodically measure our duty cycle.
1257          */
1258         nscan_cnt = 0;
1259         pcount = 0;
1260 
1261         DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
1262             hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
1263 
1264         /*
1265          * Record the initial position of the front hand for this cycle so
1266          * that we can detect when the hand wraps around.
1267          */
1268         fronthandstart = fronthand;
1269 
1270         sample_start = gethrtime();
1271 
1272         /*
1273          * Scan the appropriate number of pages for a single duty cycle.
1274          */
1275         while (nscan_cnt < desscan) {
1276                 checkpage_result_t rvfront, rvback;
1277 
1278                 /*
1279                  * Only scan while at least one of these is true:
1280                  *  1) one or more zones is over its cap
1281                  *  2) there is not enough free memory
1282                  *  3) during page scan startup when determining sample data
1283                  */
1284                 if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
1285                     !zones_over) {
1286                         /*
1287                          * We are not sampling and enough memory has become
1288                          * available that scanning is no longer required.
1289                          */
1290                         DTRACE_PROBE1(pageout__memfree, uint_t, inst);
1291                         break;
1292                 }
1293 
1294                 DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
1295 
1296                 /*
1297                  * Periodically check to see if we have exceeded the CPU duty
1298                  * cycle for a single wakeup.
1299                  */
1300                 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1301                         hrtime_t pageout_cycle_nsec;
1302 
1303                         pageout_cycle_nsec = gethrtime() - sample_start;
1304                         if (pageout_cycle_nsec >= pageout_nsec) {
1305                                 if (!zones_over)
1306                                         atomic_inc_64(&pageout_timeouts);
1307                                 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1308                                 break;
1309                         }
1310                 }
1311 
1312                 /*
1313                  * If checkpage manages to add a page to the free list,
1314                  * we give ourselves another couple of trips around the loop.
1315                  */
1316                 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1317                         laps = 0;
1318                 }
1319                 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1320                         laps = 0;
1321                 }
1322 
1323                 ++pcount;
1324 
1325                 /*
1326                  * This CPU kstat is only incremented here and we're obviously
1327                  * on this CPU, so no lock.
1328                  */
1329                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1330 
1331                 /*
1332                  * Don't include ineligible pages in the number scanned.
1333                  */
1334                 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1335                         nscan_cnt++;
1336                 }
1337 
1338                 if (bhwrapping) {
1339                         backhand = regionstart;
1340                         bhwrapping = B_FALSE;
1341                 } else {
1342                         backhand = page_nextn(backhand, tick);
1343                         if (backhand == regionend)
1344                                 bhwrapping = B_TRUE;
1345                 }
1346 
1347                 if (fhwrapping) {
1348                         fronthand = regionstart;
1349                         fhwrapping = B_FALSE;
1350                 } else {
1351                         fronthand = page_nextn(fronthand, tick);
1352                         if (fronthand == regionend)
1353                                 fhwrapping = B_TRUE;
1354                 }
1355 
1356                 /*
1357                  * The front hand has wrapped around during this wakeup.
1358                  */
1359                 if (fronthand == fronthandstart) {
1360                         laps++;
1361                         DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
1362                             uint_t, laps);
1363 
1364                         /*
1365                          * This CPU kstat is only incremented here and we're
1366                          * obviously on this CPU, so no lock.
1367                          */
1368                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
1369 
1370                         /*
                               * If scanning because the system is low on memory,
 1371                          * then when we wraparound memory we want to try to
 1372                          * reclaim more pages.
 1373                          * If scanning only because zones are over their cap,
 1374                          * then wrapping is common and we simply keep going.
 1375                          */
1376                         if (laps > 1 && freemem < lotsfree + needfree) {
1377                                 /*
1378                                  * Extremely unlikely, but it happens.
1379                                  * We went around the loop at least once
1380                                  * and didn't get far enough.
1381                                  * If we are still skipping `highly shared'
1382                                  * pages, skip fewer of them.  Otherwise,
1383                                  * give up till the next clock tick.
1384                                  */
1385                                 if (po_share < MAX_PO_SHARE) {
1386                                         po_share <<= 1;
1387                                 } else {
1388                                         break;
1389                                 }
1390                         }
1391                 }
1392         }
1393 
1394         sample_end = gethrtime();
1395         atomic_add_long(&nscan, nscan_cnt);
1396 
1397         DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
 1398             pgcnt_t, nscan_cnt, pgcnt_t, pcount);
1399 
1400         /*
1401          * The global variables used below are only modified by this thread and
1402          * only during initial scanning when there is a single page scanner
1403          * thread running.
1404          */
1405         if (pageout_new_spread == 0) {
1406                 VERIFY3U(inst, ==, 0);
1407 
1408                 if (PAGE_SCAN_STARTUP) {
1409                         /*
1410                          * Continue accumulating samples until we have enough
1411                          * to get a reasonable value for average scan rate.
1412                          */
1413                         pageout_sample_pages += pcount;
1414                         pageout_sample_etime += sample_end - sample_start;
1415                         ++pageout_sample_cnt;
1416                 }
1417 
1418                 if (!PAGE_SCAN_STARTUP) {
1419                         /*
1420                          * We have enough samples, set the spread.
1421                          */
1422                         pageout_rate = (hrrate_t)pageout_sample_pages *
1423                             (hrrate_t)(NANOSEC) / pageout_sample_etime;
1424                         pageout_new_spread = pageout_rate / 10;
1425                         setupclock();
1426                 }
1427         }
1428 
1429         goto loop;
1430 }
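
To make the sampling arithmetic above concrete, a worked example with
invented inputs (the real values come from the scans accumulated above):

        /* Illustrative inputs only. */
        uint64_t sample_pages = 4000000;        /* pages scanned */
        uint64_t sample_etime = 800000000;      /* 0.8s of scan time, in ns */
        uint64_t rate = sample_pages * 1000000000ULL / sample_etime;
        /* rate == 5000000 pages/sec, so pageout_new_spread == 500000 */
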
1431 
1545         }
1546 
1547         if (zones_over) {
1548                 ASSERT(pp->p_zoneid == ALL_ZONES ||
 1549                     (pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID));
1550                 if (pp->p_zoneid == ALL_ZONES ||
1551                     zone_pdata[pp->p_zoneid].zpers_over == 0) {
 1552                         /*
 1553                          * Cross-zone shared page, or zone not over its
 1554                          * cap.  Leave the page alone.
 1555                          */
1556                         page_unlock(pp);
1557                         return (CKP_INELIGIBLE);
1558                 }
1559                 zid = pp->p_zoneid;
1560         }
1561 
1562         /*
1563          * Maintain statistics for what we are freeing
1564          */
1565         if (pp->p_vnode != NULL) {
1566                 if (pp->p_vnode->v_flag & VVMEXEC)
1567                         isexec = 1;
1568 
1569                 if (!IS_SWAPFSVP(pp->p_vnode))
1570                         isfs = 1;
1571         }
1572 
1573         /*
1574          * Turn off REF and MOD bits with the front hand.
1575          * The back hand examines the REF bit and always considers
1576          * SHARED pages as referenced.
1577          */
1578         if (whichhand == POH_FRONT) {
1579                 pagesync_flag = HAT_SYNC_ZERORM;
1580         } else {
1581                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1582                     HAT_SYNC_STOPON_SHARED;
1583         }
1584