Print this page
13097 improve VM tunables for modern systems (fix mismerge)

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/os/vm_pageout.c
          +++ new/usr/src/uts/common/os/vm_pageout.c
↓ open down ↓ 234 lines elided ↑ open up ↑
 235  235   * time value of these overrides is preserved in the "clockinit" struct.  More
 236  236   * detail is available in the comment at the top of the file.
 237  237   */
 238  238  pgcnt_t         maxpgio = 0;
 239  239  pgcnt_t         minfree = 0;
 240  240  pgcnt_t         desfree = 0;
 241  241  pgcnt_t         lotsfree = 0;
 242  242  pgcnt_t         needfree = 0;
 243  243  pgcnt_t         throttlefree = 0;
 244  244  pgcnt_t         pageout_reserve = 0;
      245 +pri_t           pageout_pri;
 245  246  
 246  247  pgcnt_t         deficit;
 247  248  pgcnt_t         nscan;
 248  249  pgcnt_t         desscan;
 249  250  
 250  251  /* kstats */
 251  252  uint64_t low_mem_scan;
 252  253  uint64_t zone_cap_scan;
 253      -uint64_t n_throttle;
 254  254  
      255 +#define MAX_PSCAN_THREADS       16
      256 +
 255  257  /*
 256  258   * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
 257  259   * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
 258  260   * that gives the equivalent of some underlying %CPU duty cycle.
 259  261   *
 260  262   * min_pageout_nsec:
 261  263   *     nanoseconds/wakeup equivalent of min_percent_cpu.
 262  264   *
 263  265   * max_pageout_nsec:
 264  266   *     nanoseconds/wakeup equivalent of max_percent_cpu.
 265  267   *
 266  268   * pageout_nsec:
 267  269   *     Number of nanoseconds budgeted for each wakeup cycle.
 268  270   *     Computed each time around by schedpaging().
 269  271   *     Varies between min_pageout_nsec and max_pageout_nsec,
 270  272   *     depending on memory pressure or zones over their cap.
 271  273   *
 272  274   * zone_pageout_nsec:
 273      - *     Number of nanoseconds budget for each cycle when a zone
 274      - *     is over its memory cap. If this is zero, then the value
 275      - *     of max_pageout_nsec is used instead.
       275 + *     Number of nanoseconds budgeted for each cycle when a zone
       276 + *     is over its memory cap. If this is zero, then the value
       277 + *     of max_pageout_nsec is used instead.
 276  278   */
 277      -
 278  279  static hrtime_t min_pageout_nsec;
 279  280  static hrtime_t max_pageout_nsec;
 280  281  static hrtime_t pageout_nsec;
 281  282  static hrtime_t zone_pageout_nsec;
 282  283  
 283      -#define MAX_PSCAN_THREADS       16
 284      -static boolean_t reset_hands[MAX_PSCAN_THREADS];
      284 +static boolean_t        reset_hands[MAX_PSCAN_THREADS];
 285  285  
      286 +#define PAGES_POLL_MASK 1023
      287 +#define SCHEDPAGING_HZ  4
      288 +
 286  289  /*
 287      - * These can be tuned in /etc/system or set with mdb.
 288      - * 'des_page_scanners' is the desired number of page scanner threads. The
 289      - * system will bring the actual number of threads into line with the desired
 290      - * number. If des_page_scanners is set to an invalid value, the system will
 291      - * correct the setting.
      290 + * despagescanners:
       291 + *     The desired number of page scanner threads. The value can be set in
       292 + *     /etc/system or tuned directly with 'mdb -kw'.  The system will bring
       293 + *     the actual number of threads into line with the desired number. If set
       294 + *     to an invalid value, the system will correct the setting.
 292  295   */
 293      -uint_t des_page_scanners;
 294      -uint_t pageout_reset_cnt = 64;  /* num. cycles for pageout_scanner hand reset */
      296 +uint_t despagescanners = 0;
 295  297  
 296      -uint_t n_page_scanners;
 297      -static pgcnt_t  pscan_region_sz; /* informational only */
 298      -
 299      -#define PAGES_POLL_MASK 1023
 300      -
 301  298  /*
 302  299   * pageout_sample_lim:
 303  300   *     The limit on the number of samples needed to establish a value for new
 304  301   *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 305  302   *     handspreadpages.
 306  303   *
 307  304   * pageout_sample_cnt:
 308  305   *     Current sample number.  Once the sample gets large enough, set new
 309  306   *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 310  307   *
↓ open down ↓ 5 lines elided ↑ open up ↑
 316  313   *
 317  314   * pageout_rate:
 318  315   *     Rate in pages/nanosecond, computed at the end of sampling.
 319  316   *
 320  317   * pageout_new_spread:
 321  318   *     Initially zero while the system scan rate is measured by
 322  319   *     pageout_scanner(), which then sets this value once per system boot after
 323  320   *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 324  321   *     new value is used for fastscan and handspreadpages.
 325  322   */
 326      -
 327  323  typedef hrtime_t hrrate_t;
 328  324  
 329  325  static uint64_t pageout_sample_lim = 4;
 330  326  static uint64_t pageout_sample_cnt = 0;
 331  327  static pgcnt_t  pageout_sample_pages = 0;
      328 +static hrtime_t pageout_sample_etime = 0;
 332  329  static hrrate_t pageout_rate = 0;
 333  330  static pgcnt_t  pageout_new_spread = 0;
 334  331  
 335      -static hrtime_t pageout_sample_etime = 0;
 336      -
 337      -/* True if page scanner is first starting up */
      332 +/* True if the page scanner is first starting up */
 338  333  #define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
 339  334  
      335 +/* The current number of page scanner threads */
      336 +static uint_t n_page_scanners = 1;
      337 +/* The number of page scanner threads that are actively scanning. */
      338 +static uint_t pageouts_running;
      339 +
 340  340  /*
 341  341   * Record number of times a pageout_scanner() wakeup cycle finished because it
 342  342   * timed out (exceeded its CPU budget), rather than because it visited
 343  343   * its budgeted number of pages. This is only done when scanning under low
 344  344   * free memory conditions, not when scanning for zones over their cap.
 345  345   */
 346  346  uint64_t        pageout_timeouts = 0;
 347  347  
 348  348  #ifdef VM_STATS
 349  349  static struct pageoutvmstats_str {
↓ open down ↓ 28 lines elided ↑ open up ↑
 378  378          pgcnt_t ci_lotsfree;
 379  379          pgcnt_t ci_desfree;
 380  380          pgcnt_t ci_minfree;
 381  381          pgcnt_t ci_throttlefree;
 382  382          pgcnt_t ci_pageout_reserve;
 383  383          pgcnt_t ci_maxpgio;
 384  384          pgcnt_t ci_maxfastscan;
 385  385          pgcnt_t ci_fastscan;
 386  386          pgcnt_t ci_slowscan;
 387  387          pgcnt_t ci_handspreadpages;
      388 +        uint_t  ci_despagescanners;
 388  389  } clockinit = { .ci_init = false };
 389  390  
 390      -static pgcnt_t
      391 +static inline pgcnt_t
 391  392  clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
 392  393  {
 393  394          if (value < minimum) {
 394  395                  return (minimum);
 395  396          } else if (value > maximum) {
 396  397                  return (maximum);
 397  398          } else {
 398  399                  return (value);
 399  400          }
 400  401  }
↓ open down ↓ 13 lines elided ↑ open up ↑
 414  415   * accessing the zone_num_over_cap variable except within schedpaging(), which
 415  416   * only runs periodically. This is here only to reduce our access to
 416  417   * zone_num_over_cap, since it is already accessed a lot during paging, and
 417  418   * the page scanner accesses the zones_over variable on each page during a
 418  419   * scan. There is no lock needed for zone_num_over_cap since schedpaging()
 419  420   * doesn't modify the variable, it only cares if the variable is 0 or non-0.
 420  421   */
 421  422  static boolean_t zones_over = B_FALSE;
 422  423  
 423  424  /*
      425 + * On large memory systems, multiple instances of the page scanner are run,
      426 + * each responsible for a separate region of memory. This speeds up page
      427 + * invalidation under low memory conditions.
      428 + *
      429 + * despagescanners can be set in /etc/system or via mdb and it will
      430 + * be used as a guide for how many page scanners to create; the value
      431 + * will be adjusted if it is not sensible. Otherwise, the number of
      432 + * page scanners is determined dynamically based on handspreadpages.
      433 + */
      434 +static void
      435 +recalc_pagescanners(void)
      436 +{
      437 +        pgcnt_t sz;
      438 +        uint_t des;
      439 +
      440 +        /* If the initial calibration has not been done, take no action. */
      441 +        if (pageout_new_spread == 0)
      442 +                return;
      443 +
      444 +        /*
      445 +         * If the desired number of scanners is set in /etc/system
      446 +         * then try to use it.
      447 +         */
      448 +        if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
      449 +                despagescanners = clockinit.ci_despagescanners;
      450 +
      451 +        if (despagescanners != 0) {
      452 +                /*
      453 +                 * We have a desired number of page scanners, either from
      454 +                 * /etc/system or set via mdb. Try and use it (it will be
      455 +                 * clamped below).
      456 +                 */
      457 +                des = despagescanners;
      458 +        } else {
      459 +                /*
      460 +                 * Calculate the number of desired scanners based on the
      461 +                 * system's memory size.
      462 +                 *
      463 +                 * A 64GiB region size is used as the basis for calculating how
      464 +                 * many scanner threads should be created. For systems with up
      465 +                 * to 64GiB of RAM, a single thread is used; for very large
      466 +                 * memory systems the threads are limited to MAX_PSCAN_THREADS.
      467 +                 */
      468 +                sz = btop(64ULL << 30);
      469 +
      470 +                if (sz > looppages) {
      471 +                        des = 1;
      472 +                } else {
      473 +                        pgcnt_t tmp = sz;
      474 +
      475 +                        for (des = 1; tmp < looppages; des++)
      476 +                                tmp += sz;
      477 +                }
      478 +        }
      479 +
      480 +        /*
       481 +         * Clamp the number of scanners so that we are under MAX_PSCAN_THREADS
      482 +         * and so that each scanner covers at least 10% more than
      483 +         * handspreadpages.
      484 +         */
      485 +        des = clamp(des, 1,
      486 +            looppages / (handspreadpages + handspreadpages / 10));
      487 +        despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
      488 +}
      489 +
      490 +/*
 424  491   * Set up the paging constants for the clock algorithm used by
 425  492   * pageout_scanner(), and by the virtual memory system overall.  See the
 426  493   * comments at the top of this file for more information about the threshold
 427  494   * values and system responses to memory pressure.
 428  495   *
 429  496   * This routine is called once by main() at startup, after the initial size of
 430  497   * physical memory is determined.  It may be called again later if memory is
 431  498   * added to or removed from the system, or if new measurements of the page scan
 432  499   * rate become available.
 433  500   */
 434  501  void
 435  502  setupclock(void)
 436  503  {
 437      -        uint_t i;
 438      -        pgcnt_t sz, tmp;
 439      -        pgcnt_t defval;
 440  504          bool half = (pageout_threshold_style == 1);
 441  505          bool recalc = true;
 442  506  
 443  507          looppages = total_pages;
 444  508  
 445  509          /*
 446  510           * The operator may have provided specific values for some of the
 447  511           * tunables via /etc/system.  On our first call, we preserve those
 448  512           * values so that they can be used for subsequent recalculations.
 449  513           *
 450  514           * A value of zero for any tunable means we will use the default
 451  515           * sizing.
 452  516           */
 453      -
 454  517          if (!clockinit.ci_init) {
 455  518                  clockinit.ci_init = true;
 456  519  
 457  520                  clockinit.ci_lotsfree_min = lotsfree_min;
 458  521                  clockinit.ci_lotsfree_max = lotsfree_max;
 459  522                  clockinit.ci_lotsfree = lotsfree;
 460  523                  clockinit.ci_desfree = desfree;
 461  524                  clockinit.ci_minfree = minfree;
 462  525                  clockinit.ci_throttlefree = throttlefree;
 463  526                  clockinit.ci_pageout_reserve = pageout_reserve;
 464  527                  clockinit.ci_maxpgio = maxpgio;
 465  528                  clockinit.ci_maxfastscan = maxfastscan;
 466  529                  clockinit.ci_fastscan = fastscan;
 467  530                  clockinit.ci_slowscan = slowscan;
 468  531                  clockinit.ci_handspreadpages = handspreadpages;
      532 +                clockinit.ci_despagescanners = despagescanners;
 469  533  
 470  534                  /*
 471  535                   * The first call does not trigger a recalculation, only
 472  536                   * subsequent calls.
 473  537                   */
 474  538                  recalc = false;
 475  539          }
 476  540  
 477  541          /*
 478  542           * Configure paging threshold values.  For more details on what each
↓ open down ↓ 161 lines elided ↑ open up ↑
 640  704                  slowscan = MIN(fastscan / 10, maxslowscan);
 641  705          } else {
 642  706                  slowscan = clockinit.ci_slowscan;
 643  707          }
 644  708  
 645  709          if (slowscan > fastscan / 2) {
 646  710                  slowscan = fastscan / 2;
 647  711          }
 648  712  
 649  713          /*
 650      -         * Handspreadpages is distance (in pages) between front and back
      714 +         * Handspreadpages is the distance (in pages) between front and back
 651  715           * pageout daemon hands.  The amount of time to reclaim a page
 652  716           * once pageout examines it increases with this distance and
 653  717           * decreases as the scan rate rises. It must be < the amount
 654  718           * of pageable memory.
 655  719           *
 656  720           * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 657  721           * to be "fastscan" results in the front hand being a few secs
 658  722           * (varies based on the processor speed) ahead of the back hand
 659  723           * at fastscan rates.  This distance can be further reduced, if
 660  724           * necessary, by increasing the processor time used by pageout
↓ open down ↓ 14 lines elided ↑ open up ↑
 675  739          /*
 676  740           * Make sure that back hand follows front hand by at least
 677  741           * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
 678  742           * back hand to look at a page during the same wakeup of the pageout
 679  743           * daemon in which the front hand cleared its ref bit.
 680  744           */
 681  745          if (handspreadpages >= looppages) {
 682  746                  handspreadpages = looppages - 1;
 683  747          }
 684  748  
 685      -        if (!recalc) {
 686      -                /*
 687      -                 * Setup basic values at initialization.
 688      -                 */
 689      -                pscan_region_sz = total_pages;
 690      -                des_page_scanners = n_page_scanners = 1;
 691      -                reset_hands[0] = B_TRUE;
      749 +        /*
      750 +         * Establish the minimum and maximum length of time to be spent
      751 +         * scanning pages per wakeup, limiting the scanner duty cycle.  The
      752 +         * input percentage values (0-100) must be converted to a fraction of
      753 +         * the number of nanoseconds in a second of wall time, then further
      754 +         * scaled down by the number of scanner wakeups in a second.
      755 +         */
      756 +        min_pageout_nsec = MAX(1,
      757 +            NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
      758 +        max_pageout_nsec = MAX(min_pageout_nsec,
      759 +            NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
      760 +
      761 +        /*
      762 +         * If not called for recalculation, return and skip the remaining
      763 +         * steps.
      764 +         */
      765 +        if (!recalc)
 692  766                  return;
 693      -        }
 694  767  
 695  768          /*
 696      -         * Recalculating
 697      -         *
 698      -         * We originally set the number of page scanners to 1. Now that we
 699      -         * know what the handspreadpages is for a scanner, figure out how many
 700      -         * scanners we should run. We want to ensure that the regions don't
 701      -         * overlap and that they are not touching.
 702      -         *
 703      -         * A default 64GB region size is used as the initial value to calculate
 704      -         * how many scanner threads we should create on lower memory systems.
 705      -         * The idea is to limit the number of threads to a practical value
 706      -         * (e.g. a 64GB machine really only needs one scanner thread). For very
 707      -         * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
 708      -         * threads.
 709      -         *
 710      -         * The scanner threads themselves are evenly spread out around the
 711      -         * memory "clock" in pageout_scanner when we reset the hands, and each
 712      -         * thread will scan all of memory.
      769 +         * Set a flag to re-evaluate the clock hand positions.
 713  770           */
 714      -        sz = (btop(64ULL * 0x40000000ULL));
 715      -        if (sz < handspreadpages) {
 716      -                /*
 717      -                 * 64GB is smaller than the separation between the front
 718      -                 * and back hands; use double handspreadpages.
 719      -                 */
 720      -                sz = handspreadpages << 1;
 721      -        }
 722      -        if (sz > total_pages) {
 723      -                sz = total_pages;
 724      -        }
 725      -        /* Record region size for inspection with mdb, otherwise unused */
 726      -        pscan_region_sz = sz;
      771 +        for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
      772 +                reset_hands[i] = B_TRUE;
 727  773  
 728      -        tmp = sz;
 729      -        for (i = 1; tmp < total_pages; i++) {
 730      -                tmp += sz;
 731      -        }
 732      -
 733      -        if (i > MAX_PSCAN_THREADS)
 734      -                i = MAX_PSCAN_THREADS;
 735      -
 736      -        des_page_scanners = i;
      774 +        recalc_pagescanners();
 737  775  }
 738  776  
 739  777  /*
 740  778   * Pageout scheduling.
 741  779   *
 742  780   * Schedpaging controls the rate at which the page out daemon runs by
 743  781   * setting the global variables nscan and desscan SCHEDPAGING_HZ
 744  782   * times a second.  Nscan records the number of pages pageout has examined
 745  783   * in its current pass; schedpaging() resets this value to zero each time
 746  784   * it runs.  Desscan records the number of pages pageout should examine
 747  785   * in its next pass; schedpaging() sets this value based on the amount of
 748  786   * currently available memory.
 749  787   */
 750      -#define SCHEDPAGING_HZ  4
 751  788  
 752      -static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
      789 +static kmutex_t pageout_mutex;
 753  790  
 754  791  /*
 755  792   * Pool of available async pageout putpage requests.
 756  793   */
 757  794  static struct async_reqs *push_req;
 758  795  static struct async_reqs *req_freelist; /* available req structs */
 759  796  static struct async_reqs *push_list;    /* pending reqs */
 760  797  static kmutex_t push_lock;              /* protects req pool */
 761  798  static kcondvar_t push_cv;
 762  799  
↓ open down ↓ 7 lines elided ↑ open up ↑
 770  807   * I/O, which should not take long unless the underlying strategy call blocks
 771  808   * indefinitely for memory.  The actual I/O request happens (or fails) later.
 772  809   */
 773  810  uint_t pageout_deadman_seconds = 90;
 774  811  
 775  812  static uint_t pageout_stucktime = 0;
 776  813  static bool pageout_pushing = false;
 777  814  static uint64_t pageout_pushcount = 0;
 778  815  static uint64_t pageout_pushcount_seen = 0;
 779  816  
 780      -static int async_list_size = 256;       /* number of async request structs */
      817 +static int async_list_size = 8192;      /* number of async request structs */
 781  818  
 782  819  static void pageout_scanner(void *);
 783  820  
 784  821  /*
 785  822   * If a page is being shared more than "po_share" times
 786  823   * then leave it alone- don't page it out.
 787  824   */
 788  825  #define MIN_PO_SHARE    (8)
 789  826  #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
 790  827  ulong_t po_share = MIN_PO_SHARE;
↓ open down ↓ 10 lines elided ↑ open up ↑
 801  838  
 802  839          if (freemem < lotsfree + needfree + kmem_reapahead)
 803  840                  kmem_reap();
 804  841  
 805  842          if (freemem < lotsfree + needfree)
 806  843                  seg_preap();
 807  844  
 808  845          if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 809  846                  kcage_cageout_wakeup();
 810  847  
 811      -        (void) atomic_swap_ulong(&nscan, 0);
 812      -        vavail = freemem - deficit;
 813      -        if (pageout_new_spread != 0)
 814      -                vavail -= needfree;
 815      -        if (vavail < 0)
 816      -                vavail = 0;
 817      -        if (vavail > lotsfree)
 818      -                vavail = lotsfree;
      848 +        if (mutex_tryenter(&pageout_mutex)) {
 819  849  
 820      -        /*
 821      -         * Fix for 1161438 (CRS SPR# 73922).  All variables
 822      -         * in the original calculation for desscan were 32 bit signed
 823      -         * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 824      -         * more of memory, the calculation can overflow.  When this
 825      -         * happens, desscan becomes negative and pageout_scanner()
 826      -         * stops paging out.
 827      -         */
 828      -        if (needfree > 0 && pageout_new_spread == 0) {
 829      -                /*
 830      -                 * If we've not yet collected enough samples to
 831      -                 * calculate a spread, kick into high gear anytime
 832      -                 * needfree is non-zero. Note that desscan will not be
 833      -                 * the limiting factor for systems with larger memory;
 834      -                 * the %CPU will limit the scan. That will also be
 835      -                 * maxed out below.
 836      -                 */
 837      -                desscan = fastscan / SCHEDPAGING_HZ;
 838      -        } else {
 839      -                /*
 840      -                 * Once we've calculated a spread based on system
 841      -                 * memory and usage, just treat needfree as another
 842      -                 * form of deficit.
 843      -                 */
 844      -                spgcnt_t faststmp, slowstmp, result;
      850 +                if (pageouts_running != 0)
      851 +                        goto out;
 845  852  
 846      -                slowstmp = slowscan * vavail;
 847      -                faststmp = fastscan * (lotsfree - vavail);
 848      -                result = (slowstmp + faststmp) /
 849      -                    nz(lotsfree) / SCHEDPAGING_HZ;
 850      -                desscan = (pgcnt_t)result;
 851      -        }
      853 +                /* No pageout scanner threads running. */
      854 +                nscan = 0;
      855 +                vavail = freemem - deficit;
      856 +                if (pageout_new_spread != 0)
      857 +                        vavail -= needfree;
      858 +                vavail = clamp(vavail, 0, lotsfree);
 852  859  
 853      -        /*
 854      -         * If we've not yet collected enough samples to calculate a
 855      -         * spread, also kick %CPU to the max.
 856      -         */
 857      -        if (pageout_new_spread == 0) {
 858      -                pageout_nsec = max_pageout_nsec;
 859      -        } else {
 860      -                pageout_nsec = min_pageout_nsec +
 861      -                    (lotsfree - vavail) *
 862      -                    (max_pageout_nsec - min_pageout_nsec) /
 863      -                    nz(lotsfree);
 864      -        }
      860 +                if (needfree > 0 && pageout_new_spread == 0) {
      861 +                        /*
      862 +                         * If we've not yet collected enough samples to
      863 +                         * calculate a spread, use the old logic of kicking
      864 +                         * into high gear anytime needfree is non-zero.
      865 +                         */
      866 +                        desscan = fastscan / SCHEDPAGING_HZ;
      867 +                } else {
      868 +                        /*
      869 +                         * Once we've calculated a spread based on system
      870 +                         * memory and usage, just treat needfree as another
      871 +                         * form of deficit.
      872 +                         */
      873 +                        spgcnt_t faststmp, slowstmp, result;
 865  874  
 866      -        if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
 867      -                /*
 868      -                 * We have finished the pagescan initialization and the desired
 869      -                 * number of page scanners has changed, either because
 870      -                 * initialization just finished, because of a memory DR, or
 871      -                 * because des_page_scanners has been modified on the fly (i.e.
 872      -                 * by mdb). If we need more scanners, start them now, otherwise
 873      -                 * the excess scanners will terminate on their own when they
 874      -                 * reset their hands.
 875      -                 */
 876      -                uint_t i;
 877      -                uint_t curr_nscan = n_page_scanners;
 878      -                pgcnt_t max = total_pages / handspreadpages;
      875 +                        slowstmp = slowscan * vavail;
      876 +                        faststmp = fastscan * (lotsfree - vavail);
      877 +                        result = (slowstmp + faststmp) /
      878 +                            nz(lotsfree) / SCHEDPAGING_HZ;
      879 +                        desscan = (pgcnt_t)result;
      880 +                }
 879  881  
 880      -                if (des_page_scanners > max)
 881      -                        des_page_scanners = max;
      882 +                pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
      883 +                    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
 882  884  
 883      -                if (des_page_scanners > MAX_PSCAN_THREADS) {
 884      -                        des_page_scanners = MAX_PSCAN_THREADS;
 885      -                } else if (des_page_scanners == 0) {
 886      -                        des_page_scanners = 1;
 887      -                }
      885 +                DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
      886 +                    pageout_nsec);
 888  887  
 889      -                /*
 890      -                 * Each thread has its own entry in the reset_hands array, so
 891      -                 * we don't need any locking in pageout_scanner to check the
 892      -                 * thread's reset_hands entry. Thus, we use a pre-allocated
 893      -                 * fixed size reset_hands array and upper limit on the number
 894      -                 * of pagescan threads.
 895      -                 *
 896      -                 * The reset_hands entries need to be true before we start new
 897      -                 * scanners, but if we're reducing, we don't want a race on the
 898      -                 * recalculation for the existing threads, so we set
 899      -                 * n_page_scanners first.
 900      -                 */
 901      -                n_page_scanners = des_page_scanners;
 902      -                for (i = 0; i < MAX_PSCAN_THREADS; i++) {
 903      -                        reset_hands[i] = B_TRUE;
 904      -                }
      888 +                if (pageout_new_spread != 0 && despagescanners != 0 &&
      889 +                    despagescanners != n_page_scanners) {
      890 +                        /*
       891 +                         * We have finished the pagescan initialization and the
       892 +                         * desired number of page scanners has changed, either
       893 +                         * because initialization just finished, because of a
       894 +                         * memory DR, or because despagescanners has been
       895 +                         * modified on the fly (i.e. by mdb).
      896 +                        */
      897 +                        uint_t i, curr_nscan = n_page_scanners;
 905  898  
 906      -                if (des_page_scanners > curr_nscan) {
 907      -                        /* Create additional pageout scanner threads. */
 908      -                        for (i = curr_nscan; i < des_page_scanners; i++) {
 909      -                                (void) lwp_kernel_create(proc_pageout,
 910      -                                    pageout_scanner, (void *)(uintptr_t)i,
 911      -                                    TS_RUN, curthread->t_pri);
      899 +                        /* Re-validate despagescanners */
      900 +                        recalc_pagescanners();
      901 +
      902 +                        n_page_scanners = despagescanners;
      903 +
      904 +                        for (i = 0; i < MAX_PSCAN_THREADS; i++)
      905 +                                reset_hands[i] = B_TRUE;
      906 +
      907 +                        /* If we need more scanners, start them now. */
      908 +                        if (n_page_scanners > curr_nscan) {
      909 +                                for (i = curr_nscan; i < n_page_scanners; i++) {
      910 +                                        (void) lwp_kernel_create(proc_pageout,
      911 +                                            pageout_scanner,
      912 +                                            (void *)(uintptr_t)i, TS_RUN,
      913 +                                            pageout_pri);
      914 +                                }
 912  915                          }
      916 +
      917 +                        /*
      918 +                         * If the number of scanners has decreased, trigger a
      919 +                         * wakeup so that the excess threads will terminate.
      920 +                         */
      921 +                        if (n_page_scanners < curr_nscan) {
      922 +                                WAKE_PAGEOUT_SCANNER();
      923 +                        }
 913  924                  }
 914      -        }
 915  925  
 916      -        zones_over = B_FALSE;
      926 +                zones_over = B_FALSE;
 917  927  
 918      -        if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
 919      -                if (!PAGE_SCAN_STARTUP)
      928 +                if (PAGE_SCAN_STARTUP) {
      929 +                        /*
      930 +                         * We still need to measure the rate at which the
      931 +                         * system is able to scan pages of memory. Each of
      932 +                         * these initial samples is a scan of as much system
      933 +                         * memory as practical, regardless of whether or not we
      934 +                         * are experiencing memory pressure.
      935 +                         */
      936 +                        desscan = total_pages;
      937 +                        pageout_nsec = max_pageout_nsec;
      938 +
      939 +                        DTRACE_PROBE(schedpage__wake__sample);
      940 +                        WAKE_PAGEOUT_SCANNER();
      941 +                } else if (freemem < lotsfree + needfree) {
      942 +                        /*
      943 +                         * We need more memory.
      944 +                         */
 920  945                          low_mem_scan++;
 921      -                /*
 922      -                 * Either we need more memory, or we still need to
 923      -                 * measure the average scan rate.  Wake the scanner.
 924      -                 */
 925      -                DTRACE_PROBE(schedpage__wake__low);
 926      -                WAKE_PAGEOUT_SCANNER();
 927  946  
 928      -        } else if (zone_num_over_cap > 0) {
 929      -                /* One or more zones are over their cap. */
      947 +                        DTRACE_PROBE(schedpage__wake__low);
      948 +                        WAKE_PAGEOUT_SCANNER();
      949 +                } else if (zone_num_over_cap > 0) {
      950 +                        /*
      951 +                         * One or more zones are over their cap.
      952 +                         */
 930  953  
 931      -                /* No page limit */
 932      -                desscan = total_pages;
      954 +                        /* No page limit */
      955 +                        desscan = total_pages;
 933  956  
 934      -                /*
 935      -                 * Increase the scanning CPU% to the max. This implies
 936      -                 * 80% of one CPU/sec if the scanner can run each
 937      -                 * opportunity. Can also be tuned via setting
 938      -                 * zone_pageout_nsec in /etc/system or with mdb.
 939      -                 */
 940      -                pageout_nsec = (zone_pageout_nsec != 0) ?
 941      -                    zone_pageout_nsec : max_pageout_nsec;
      957 +                        /*
      958 +                         * Increase the scanning CPU% to the max. This implies
      959 +                         * 80% of one CPU/sec if the scanner can run each
      960 +                         * opportunity. Can also be tuned via setting
      961 +                         * zone_pageout_nsec in /etc/system or with mdb.
      962 +                         */
      963 +                        pageout_nsec = (zone_pageout_nsec != 0) ?
      964 +                           zone_pageout_nsec : max_pageout_nsec;
 942  965  
 943      -                zones_over = B_TRUE;
 944      -                zone_cap_scan++;
      966 +                        zones_over = B_TRUE;
      967 +                        zone_cap_scan++;
 945  968  
 946      -                DTRACE_PROBE(schedpage__wake__zone);
 947      -                WAKE_PAGEOUT_SCANNER();
 948      -
 949      -        } else {
 950      -                /*
 951      -                 * There are enough free pages, no need to
 952      -                 * kick the scanner thread.  And next time
 953      -                 * around, keep more of the `highly shared'
 954      -                 * pages.
 955      -                 */
 956      -                cv_signal_pageout();
 957      -
 958      -                mutex_enter(&pageout_mutex);
 959      -                if (po_share > MIN_PO_SHARE) {
 960      -                        po_share >>= 1;
      969 +                        DTRACE_PROBE(schedpage__wake__zone);
      970 +                        WAKE_PAGEOUT_SCANNER();
      971 +                } else {
      972 +                        /*
      973 +                         * There are enough free pages, no need to
      974 +                         * kick the scanner thread.  And next time
      975 +                         * around, keep more of the `highly shared'
      976 +                         * pages.
      977 +                         */
      978 +                        cv_signal_pageout();
      979 +                        if (po_share > MIN_PO_SHARE) {
      980 +                                po_share >>= 1;
      981 +                        }
 961  982                  }
      983 +out:
 962  984                  mutex_exit(&pageout_mutex);
 963  985          }
 964  986  
 965  987          /*
 966  988           * Signal threads waiting for available memory.
 967  989           * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 968  990           * in this case it is not needed - the waiters will be waken up during
 969  991           * the next invocation of this function.
 970  992           */
 971  993          if (kmem_avail() > 0)
↓ open down ↓ 8 lines elided ↑ open up ↑
 980 1002  /*
 981 1003   * Paging out should always be enabled.  This tunable exists to hold pageout
 982 1004   * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 983 1005   * sleep each time it is woken by schedpaging().
 984 1006   */
 985 1007  uint_t dopageout = 1;
 986 1008  
 987 1009  /*
 988 1010   * The page out daemon, which runs as process 2.
 989 1011   *
 990      - * Page out occurs when either:
 991      - * a) there is less than lotsfree pages,
 992      - * b) there are one or more zones over their physical memory cap.
     1012 + * The daemon treats physical memory as a circular array of pages and scans
     1013 + * the pages using a 'two-handed clock' algorithm. The front hand moves
     1014 + * through the pages, clearing the reference bit. The back hand travels a
     1015 + * distance (handspreadpages) behind the front hand, freeing the pages that
     1016 + * have not been referenced in the time since the front hand passed. If
     1017 + * modified, they are first written to their backing store before being
     1018 + * freed.
 993 1019   *
 994      - * The daemon treats physical memory as a circular array of pages and scans the
 995      - * pages using a 'two-handed clock' algorithm. The front hand moves through
 996      - * the pages, clearing the reference bit. The back hand travels a distance
 997      - * (handspreadpages) behind the front hand, freeing the pages that have not
 998      - * been referenced in the time since the front hand passed. If modified, they
 999      - * are first written to their backing store before being freed.
     1020 + * In order to make page invalidation more responsive on machines with
     1021 + * larger memory, multiple pageout_scanner threads may be created. In this
     1022 + * case, each thread is given a segment of the memory "clock face" so that
     1023 + * memory can be reclaimed more quickly.
1000 1024   *
1001      - * In order to make page invalidation more responsive on machines with larger
1002      - * memory, multiple pageout_scanner threads may be created. In this case, the
1003      - * threads are evenly distributed around the the memory "clock face" so that
1004      - * memory can be reclaimed more quickly (that is, there can be large regions in
1005      - * which no pages can be reclaimed by a single thread, leading to lag which
1006      - * causes undesirable behavior such as htable stealing).
     1025 + * As long as there are at least lotsfree pages and no zones are over
     1026 + * their cap, the pageout_scanner threads are not run. When the threads
     1027 + * run because memory is low (case (a)), all pages are considered for
     1028 + * pageout. When they run only because one or more zones are over their
     1029 + * cap (case (b)), only pages belonging to such a zone are considered.
1007 1030   *
1008      - * As long as there are at least lotsfree pages, or no zones over their cap,
1009      - * then pageout_scanner threads are not run. When pageout_scanner threads are
1010      - * running for case (a), all pages are considered for pageout. For case (b),
1011      - * only pages belonging to a zone over its cap will be considered for pageout.
1012      - *
1013      - * There are multiple threads that act on behalf of the pageout process.
1014      - * A set of threads scan pages (pageout_scanner) and frees them up if
1015      - * they don't require any VOP_PUTPAGE operation. If a page must be
1016      - * written back to its backing store, the request is put on a list
1017      - * and the other (pageout) thread is signaled. The pageout thread
1018      - * grabs VOP_PUTPAGE requests from the list, and processes them.
1019      - * Some filesystems may require resources for the VOP_PUTPAGE
1020      - * operations (like memory) and hence can block the pageout
1021      - * thread, but the pageout_scanner threads can still operate. There is still
1022      - * no guarantee that memory deadlocks cannot occur.
1023      - *
1024      - * The pageout_scanner parameters are determined in schedpaging().
     1031 + * There are multiple threads that act on behalf of the pageout process. A
     1032 + * set of threads scan pages (pageout_scanner) and frees them up if they
     1033 + * don't require any VOP_PUTPAGE operation. If a page must be written back
     1034 + * to its backing store, the request is put on a list and the other
     1035 + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
     1036 + * requests from the list, and processes them. Some filesystems may require
     1037 + * resources for the VOP_PUTPAGE operations (like memory) and hence can
     1038 + * block the pageout thread, but the scanner thread can still operate.
     1039 + * There is still no guarantee that memory deadlocks cannot occur.
1025 1040   */
1026 1041  void
1027 1042  pageout()
1028 1043  {
1029 1044          struct async_reqs *arg;
1030      -        pri_t pageout_pri;
1031 1045          int i;
1032 1046          pgcnt_t max_pushes;
1033 1047          callb_cpr_t cprinfo;
1034 1048  
1035 1049          proc_pageout = ttoproc(curthread);
1036 1050          proc_pageout->p_cstime = 0;
1037 1051          proc_pageout->p_stime =  0;
1038 1052          proc_pageout->p_cutime =  0;
1039 1053          proc_pageout->p_utime = 0;
1040 1054          bcopy("pageout", PTOU(curproc)->u_psargs, 8);
↓ open down ↓ 10 lines elided ↑ open up ↑
1051 1065           * for pageout.
1052 1066           */
1053 1067          push_req = (struct async_reqs *)
1054 1068              kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1055 1069  
1056 1070          req_freelist = push_req;
1057 1071          for (i = 0; i < async_list_size - 1; i++) {
1058 1072                  push_req[i].a_next = &push_req[i + 1];
1059 1073          }
1060 1074  
1061      -        pageout_pri = curthread->t_pri;
     1075 +        pageout_pri = curthread->t_pri - 1;
1062 1076  
1063      -        /* Create the (first) pageout scanner thread. */
1064      -        (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
1065      -            pageout_pri - 1);
     1077 +        /* Create the first pageout scanner thread. */
     1078 +        (void) lwp_kernel_create(proc_pageout, pageout_scanner,
     1079 +            (void *)0,  /* this is instance 0, not NULL */
     1080 +            TS_RUN, pageout_pri);
1066 1081  
1067 1082          /*
1068 1083           * kick off pageout scheduler.
1069 1084           */
1070 1085          schedpaging(NULL);
1071 1086  
1072 1087          /*
1073 1088           * Create kernel cage thread.
1074 1089           * The kernel cage thread is started under the pageout process
1075 1090           * to take advantage of the less restricted page allocation
↓ open down ↓ 15 lines elided ↑ open up ↑
1091 1106                          cv_wait(&push_cv, &push_lock);
1092 1107                          pushes = 0;
1093 1108                          CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1094 1109                  }
1095 1110                  push_list = arg->a_next;
1096 1111                  arg->a_next = NULL;
1097 1112                  pageout_pushing = true;
1098 1113                  mutex_exit(&push_lock);
1099 1114  
1100 1115                  DTRACE_PROBE(pageout__push);
     1116 +
1101 1117                  if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1102 1118                      arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1103 1119                          pushes++;
1104 1120                  }
1105 1121  
1106 1122                  /* vp held by checkpage() */
1107 1123                  VN_RELE(arg->a_vp);
1108 1124  
1109 1125                  mutex_enter(&push_lock);
1110 1126                  pageout_pushing = false;
↓ open down ↓ 4 lines elided ↑ open up ↑
1115 1131                  mutex_exit(&push_lock);
1116 1132          }
1117 1133  }
1118 1134  
1119 1135  /*
1120 1136   * Kernel thread that scans pages looking for ones to free
1121 1137   */
1122 1138  static void
1123 1139  pageout_scanner(void *a)
1124 1140  {
1125      -        struct page *fronthand, *backhand;
1126      -        uint_t laps, iter = 0;
     1141 +        struct page *fronthand, *backhand, *fronthandstart;
     1142 +        struct page *regionstart, *regionend;
     1143 +        uint_t laps;
1127 1144          callb_cpr_t cprinfo;
1128      -        pgcnt_t nscan_cnt, nscan_limit;
     1145 +        pgcnt_t nscan_cnt, tick;
1129 1146          pgcnt_t pcount;
1130      -        uint_t inst = (uint_t)(uintptr_t)a;
     1147 +        bool bhwrapping, fhwrapping;
1131 1148          hrtime_t sample_start, sample_end;
1132      -        kmutex_t pscan_mutex;
1133      -        bool sampling;
     1149 +        uint_t inst = (uint_t)(uintptr_t)a;
1134 1150  
1135 1151          VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1136 1152  
1137      -        mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
     1153 +        CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
     1154 +        mutex_enter(&pageout_mutex);
1138 1155  
1139      -        CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
1140      -        mutex_enter(&pscan_mutex);
1141      -
1142 1156          /*
1143      -         * Establish the minimum and maximum length of time to be spent
1144      -         * scanning pages per wakeup, limiting the scanner duty cycle.  The
1145      -         * input percentage values (0-100) must be converted to a fraction of
1146      -         * the number of nanoseconds in a second of wall time, then further
1147      -         * scaled down by the number of scanner wakeups in a second:
     1157 +         * The restart case does not attempt to point the hands at roughly
     1158 +         * the right point on the assumption that after one circuit things
     1159 +         * will have settled down, and restarts shouldn't be that often.
1148 1160           */
1149      -        min_pageout_nsec = MAX(1,
1150      -            NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
1151      -        max_pageout_nsec = MAX(min_pageout_nsec,
1152      -            NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
     1161 +        reset_hands[inst] = B_TRUE;
1153 1162  
     1163 +        pageouts_running++;
     1164 +        mutex_exit(&pageout_mutex);
     1165 +
1154 1166  loop:
1155 1167          cv_signal_pageout();
1156 1168  
     1169 +        mutex_enter(&pageout_mutex);
     1170 +        pageouts_running--;
1157 1171          CALLB_CPR_SAFE_BEGIN(&cprinfo);
1158      -        cv_wait(&proc_pageout->p_cv, &pscan_mutex);
1159      -        CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
     1172 +        cv_wait(&proc_pageout->p_cv, &pageout_mutex);
     1173 +        CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
     1174 +        pageouts_running++;
     1175 +        mutex_exit(&pageout_mutex);
1160 1176  
1161 1177          /*
1162      -         * Check if pageout has been disabled for debugging purposes:
     1178 +         * Check if pageout has been disabled for debugging purposes.
1163 1179           */
1164 1180          if (!dopageout) {
1165 1181                  goto loop;
1166 1182          }
1167 1183  
1168 1184          /*
1169      -         * One may reset the clock hands for debugging purposes.  Hands will
1170      -         * also be reset if memory is added to or removed from the system.
     1185 +         * One may reset the clock hands and scanned region for debugging
     1186 +         * purposes. Hands will also be reset on first thread startup, if
     1187 +         * the number of scanning threads (n_page_scanners) changes, or if
     1188 +         * memory is added to, or removed from, the system.
1171 1189           */
1172 1190          if (reset_hands[inst]) {
1173 1191                  struct page *first;
1174      -                pgcnt_t offset = total_pages / n_page_scanners;
1175 1192  
1176 1193                  reset_hands[inst] = B_FALSE;
     1194 +
1177 1195                  if (inst >= n_page_scanners) {
1178 1196                          /*
1179      -                         * The desired number of page scanners has been
1180      -                         * reduced and this instance is no longer wanted.
1181      -                         * Exit the lwp.
1182      -                         */
     1197 +                         * The desired number of page scanners has been
     1198 +                         * reduced and this instance is no longer wanted.
     1199 +                         * Exit the lwp.
     1200 +                         */
1183 1201                          VERIFY3U(inst, !=, 0);
1184      -                        mutex_exit(&pscan_mutex);
     1202 +                        DTRACE_PROBE1(pageout__exit, uint_t, inst);
     1203 +                        mutex_enter(&pageout_mutex);
     1204 +                        pageouts_running--;
     1205 +                        mutex_exit(&pageout_mutex);
1185 1206                          mutex_enter(&curproc->p_lock);
1186 1207                          lwp_exit();
     1208 +                        /* NOTREACHED */
1187 1209                  }
1188 1210  
     1211 +                first = page_first();
     1212 +
1189 1213                  /*
1190      -                 * The reset case repositions the hands at the proper place
1191      -                 * on the memory clock face to prevent creep into another
1192      -                 * thread's active region or when the number of threads has
1193      -                 * changed.
1194      -                 *
1195      -                 * Set the two clock hands to be separated by a reasonable
1196      -                 * amount, but no more than 360 degrees apart.
1197      -                 *
1198      -                 * If inst == 0, backhand starts at first page, otherwise
1199      -                 * it is (inst * offset) around the memory "clock face" so that
1200      -                 * we spread out each scanner instance evenly.
     1214 +                 * Each scanner thread gets its own sector of the memory
     1215 +                 * clock face.
1201 1216                   */
1202      -                first = page_first();
1203      -                backhand = page_nextn(first, offset * inst);
1204      -                if (handspreadpages >= total_pages) {
1205      -                        fronthand = page_nextn(backhand, total_pages - 1);
     1217 +                pgcnt_t span, offset;
     1218 +
     1219 +                span = looppages / n_page_scanners;
     1220 +                VERIFY3U(span, >, handspreadpages);
     1221 +
     1222 +                offset = inst * span;
     1223 +                regionstart = page_nextn(first, offset);
     1224 +                if (inst == n_page_scanners - 1) {
     1225 +                        /* The last instance goes up to the last page */
     1226 +                        regionend = page_nextn(first, looppages - 1);
1206 1227                  } else {
1207      -                        fronthand = page_nextn(backhand, handspreadpages);
     1228 +                        regionend = page_nextn(regionstart, span - 1);
1208 1229                  }
     1230 +
     1231 +                backhand = regionstart;
     1232 +                fronthand = page_nextn(backhand, handspreadpages);
     1233 +                tick = 1;
     1234 +
     1235 +                bhwrapping = fhwrapping = B_FALSE;
     1236 +
     1237 +                DTRACE_PROBE4(pageout__reset, uint_t, inst,
     1238 +                    pgcnt_t, regionstart, pgcnt_t, regionend,
     1239 +                    pgcnt_t, fronthand);
1209 1240          }
1210 1241  
     1242 +        /*
     1243 +         * This CPU kstat is only incremented here and we're obviously
     1244 +         * on this CPU, so no lock.
     1245 +         */
1211 1246          CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1212 1247  
1213 1248          /*
1214 1249           * Keep track of the number of times we have scanned all the way around
1215      -         * the loop:
     1250 +         * the loop on this wakeup.
1216 1251           */
1217 1252          laps = 0;
1218 1253  
1219 1254          /*
1220 1255           * Track the number of pages visited during this scan so that we can
1221 1256           * periodically measure our duty cycle.
1222 1257           */
1223      -        pcount = 0;
1224 1258          nscan_cnt = 0;
     1259 +        pcount = 0;
1225 1260  
1226      -        if (PAGE_SCAN_STARTUP) {
1227      -                /*
1228      -                 * We need to measure the rate at which the system is able to
1229      -                 * scan pages of memory.  Each of these initial samples is a
1230      -                 * scan of all system memory, regardless of whether or not we
1231      -                 * are experiencing memory pressure.
1232      -                 */
1233      -                nscan_limit = total_pages;
1234      -                sampling = true;
1235      -        } else {
1236      -                nscan_limit = desscan;
1237      -                sampling = false;
1238      -        }
     1261 +        DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
     1262 +            hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
1239 1263  
1240      -        DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
1241      -            page_t *, backhand, page_t *, fronthand);
     1264 +        /*
     1265 +         * Record the initial position of the front hand for this cycle so
     1266 +         * that we can detect when the hand wraps around.
     1267 +         */
     1268 +        fronthandstart = fronthand;
1242 1269  
1243 1270          sample_start = gethrtime();
1244 1271  
1245 1272          /*
1246 1273           * Scan the appropriate number of pages for a single duty cycle.
1247      -         * Only scan while at least one of these is true:
1248      -         * 1) one or more zones is over its cap
1249      -         * 2) there is not enough free memory
1250      -         * 3) during page scan startup when determining sample data
1251 1274           */
1252      -        while (nscan_cnt < nscan_limit) {
     1275 +        while (nscan_cnt < desscan) {
1253 1276                  checkpage_result_t rvfront, rvback;
1254 1277  
1255      -                if (!sampling && !zones_over &&
1256      -                    freemem >= lotsfree + needfree) {
     1278 +                /*
     1279 +                 * Only scan while at least one of these is true:
     1280 +                 *  1) one or more zones is over its cap
     1281 +                 *  2) there is not enough free memory
     1282 +                 *  3) during page scan startup when determining sample data
     1283 +                 */
     1284 +                if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
     1285 +                    !zones_over) {
1257 1286                          /*
1258 1287                           * We are not sampling and enough memory has become
1259 1288                           * available that scanning is no longer required.
1260 1289                           */
     1290 +                        DTRACE_PROBE1(pageout__memfree, uint_t, inst);
1261 1291                          break;
1262 1292                  }
1263 1293  
1264      -                DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
     1294 +                DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
1265 1295  
1266 1296                  /*
1267 1297                   * Periodically check to see if we have exceeded the CPU duty
1268 1298                   * cycle for a single wakeup.
1269 1299                   */
1270 1300                  if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1271 1301                          hrtime_t pageout_cycle_nsec;
1272 1302  
1273 1303                          pageout_cycle_nsec = gethrtime() - sample_start;
1274 1304                          if (pageout_cycle_nsec >= pageout_nsec) {
1275      -                                /*
1276      -                                 * This is where we normally break out of the
1277      -                                 * loop when scanning zones or sampling.
1278      -                                 */
1279      -                                if (!zones_over) {
     1305 +                                if (!zones_over)
1280 1306                                          atomic_inc_64(&pageout_timeouts);
1281      -                                }
1282 1307                                  DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1283 1308                                  break;
1284 1309                          }
1285 1310                  }
1286 1311  
1287 1312                  /*
1288 1313                   * If checkpage manages to add a page to the free list,
1289 1314                   * we give ourselves another couple of trips around the loop.
1290 1315                   */
1291 1316                  if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
↓ open down ↓ 11 lines elided ↑ open up ↑
1303 1328                   */
1304 1329                  CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305 1330  
1306 1331                  /*
1307 1332                   * Don't include ineligible pages in the number scanned.
1308 1333                   */
1309 1334                  if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310 1335                          nscan_cnt++;
1311 1336                  }
1312 1337  
1313      -                backhand = page_next(backhand);
1314      -                fronthand = page_next(fronthand);
     1338 +                if (bhwrapping) {
     1339 +                        backhand = regionstart;
     1340 +                        bhwrapping = B_FALSE;
     1341 +                } else {
     1342 +                        backhand = page_nextn(backhand, tick);
     1343 +                        if (backhand == regionend)
     1344 +                                bhwrapping = B_TRUE;
     1345 +                }
1315 1346  
     1347 +                if (fhwrapping) {
     1348 +                        fronthand = regionstart;
     1349 +                        fhwrapping = B_FALSE;
     1350 +                } else {
     1351 +                        fronthand = page_nextn(fronthand, tick);
     1352 +                        if (fronthand == regionend)
     1353 +                                fhwrapping = B_TRUE;
     1354 +                }
     1355 +
1316 1356                  /*
1317      -                 * The front hand has wrapped around to the first page in the
1318      -                 * loop.
     1357 +                 * The front hand has wrapped around during this wakeup.
1319 1358                   */
1320      -                if (fronthand == page_first())  {
1321      -                        DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
     1359 +                if (fronthand == fronthandstart) {
     1360 +                        laps++;
     1361 +                        DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
     1362 +                            uint_t, laps);
1322 1363  
1323 1364                          /*
1324      -                         * Every 64 wraps we reposition our hands within our
1325      -                         * region to prevent creep into another thread.
1326      -                         */
1327      -                        if ((++iter % pageout_reset_cnt) == 0)
1328      -                                reset_hands[inst] = B_TRUE;
1329      -
1330      -                        /*
1331 1365                           * This CPU kstat is only incremented here and we're
1332 1366                           * obviously on this CPU, so no lock.
1333 1367                           */
1334 1368                          CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335 1369  
1336 1370                          /*
 1337      -                         * If scanning because the system is low on memory,
      1371 +                         * If scanning because the system is low on memory,
1338 1371                           * then when we wraparound memory we want to try to
1339 1372                           * reclaim more pages.
1340 1373                           * If scanning only because zones are over their cap,
1341 1374                           * then wrapping is common and we simply keep going.
1342      -                         */
1343      -                        if (freemem < lotsfree + needfree && ++laps > 1) {
     1375 +                        */
     1376 +                        if (laps > 1 && freemem < lotsfree + needfree) {
1344 1377                                  /*
1345      -                                 * The system is low on memory.
1346 1378                                   * Extremely unlikely, but it happens.
1347 1379                                   * We went around the loop at least once
1348 1380                                   * and didn't get far enough.
1349 1381                                   * If we are still skipping `highly shared'
1350 1382                                   * pages, skip fewer of them.  Otherwise,
1351 1383                                   * give up till the next clock tick.
1352 1384                                   */
1353      -                                mutex_enter(&pageout_mutex);
1354 1385                                  if (po_share < MAX_PO_SHARE) {
1355 1386                                          po_share <<= 1;
1356      -                                        mutex_exit(&pageout_mutex);
1357 1387                                  } else {
1358      -                                        mutex_exit(&pageout_mutex);
1359 1388                                          break;
1360 1389                                  }
1361 1390                          }
1362 1391                  }
1363 1392          }
1364 1393  
     1394 +        sample_end = gethrtime();
1365 1395          atomic_add_long(&nscan, nscan_cnt);
1366 1396  
1367      -        sample_end = gethrtime();
      1397 +        DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
      1398 +            pgcnt_t, nscan_cnt, pgcnt_t, pcount);
1368 1399  
1369      -        DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370      -            uint_t, inst);
1371      -
1372 1400          /*
1373      -         * The following two blocks are only relevant when the scanner is
1374      -         * first started up. After the scanner runs for a while, neither of
1375      -         * the conditions will ever be true again.
1376      -         *
1377 1401           * The global variables used below are only modified by this thread and
1378 1402           * only during initial scanning when there is a single page scanner
1379      -         * thread running. Thus, we don't use any locking.
     1403 +         * thread running.
1380 1404           */
1381 1405          if (pageout_new_spread == 0) {
1382 1406                  VERIFY3U(inst, ==, 0);
     1407 +
1383 1408                  if (PAGE_SCAN_STARTUP) {
1384 1409                          /*
1385 1410                           * Continue accumulating samples until we have enough
1386      -                         * to get a reasonable value for average scan rate:
     1411 +                         * to get a reasonable value for average scan rate.
1387 1412                           */
1388 1413                          pageout_sample_pages += pcount;
1389 1414                          pageout_sample_etime += sample_end - sample_start;
1390 1415                          ++pageout_sample_cnt;
1391 1416                  }
1392 1417  
1393 1418                  if (!PAGE_SCAN_STARTUP) {
1394 1419                          /*
1395 1420                           * We have enough samples, set the spread.
1396 1421                           */
↓ open down ↓ 121 lines elided ↑ open up ↑
1518 1543                  page_unlock(pp);
1519 1544                  return (CKP_INELIGIBLE);
1520 1545          }
1521 1546  
1522 1547          if (zones_over) {
1523 1548                  ASSERT(pp->p_zoneid == ALL_ZONES ||
1524 1549                      pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1525 1550                  if (pp->p_zoneid == ALL_ZONES ||
1526 1551                      zone_pdata[pp->p_zoneid].zpers_over == 0) {
1527 1552                          /*
1528      -                         * Cross-zone shared page, or zone not over it's cap.
1529      -                         * Leave the page alone.
1530      -                         */
      1553 +                        * Cross-zone shared page, or zone not over its cap.
     1554 +                        * Leave the page alone.
     1555 +                        */
1531 1556                          page_unlock(pp);
1532 1557                          return (CKP_INELIGIBLE);
1533 1558                  }
1534 1559                  zid = pp->p_zoneid;
1535 1560          }
1536 1561  
1537 1562          /*
1538 1563           * Maintain statistics for what we are freeing
1539 1564           */
1540      -
1541 1565          if (pp->p_vnode != NULL) {
1542 1566                  if (pp->p_vnode->v_flag & VVMEXEC)
1543 1567                          isexec = 1;
1544 1568  
1545 1569                  if (!IS_SWAPFSVP(pp->p_vnode))
1546 1570                          isfs = 1;
1547 1571          }
1548 1572  
1549 1573          /*
1550 1574           * Turn off REF and MOD bits with the front hand.
↓ open down ↓ 192 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX