big-one Wdiff usr/src/uts/common/os/vm_pageout.c

Print this page

re #13613 rb4516 Tunables needs volatile keyword

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/vm_pageout.c
          +++ new/usr/src/uts/common/os/vm_pageout.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
       25 +/*
       26 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
       27 + */
  25   28  
  26   29  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27   30  /*        All Rights Reserved   */
  28   31  
  29   32  /*
  30   33   * University Copyright- Copyright (c) 1982, 1986, 1988
  31   34   * The Regents of the University of California
  32   35   * All Rights Reserved
  33   36   *
  34   37   * University Acknowledgment- Portions of this document are derived from

  35   38   * software developed by the University of California, Berkeley, and its
  36   39   * contributors.
  37   40   */
  38   41  
  39   42  #include <sys/types.h>
  40   43  #include <sys/t_lock.h>
  41   44  #include <sys/param.h>
  42   45  #include <sys/buf.h>
  43   46  #include <sys/uio.h>
  44   47  #include <sys/proc.h>
  45   48  #include <sys/systm.h>
  46   49  #include <sys/mman.h>
  47   50  #include <sys/cred.h>
  48   51  #include <sys/vnode.h>
  49   52  #include <sys/vm.h>
  50   53  #include <sys/vmparam.h>
  51   54  #include <sys/vtrace.h>
  52   55  #include <sys/cmn_err.h>
  53   56  #include <sys/cpuvar.h>
  54   57  #include <sys/user.h>
  55   58  #include <sys/kmem.h>
  56   59  #include <sys/debug.h>
  57   60  #include <sys/callb.h>
  58   61  #include <sys/tnf_probe.h>
  59   62  #include <sys/mem_cage.h>
  60   63  #include <sys/time.h>
  61   64  
  62   65  #include <vm/hat.h>
  63   66  #include <vm/as.h>
  64   67  #include <vm/seg.h>
  65   68  #include <vm/page.h>
  66   69  #include <vm/pvn.h>
  67   70  #include <vm/seg_kmem.h>

↓ open down ↓

33 lines elided

↑ open up ↑

  68   71  
  69   72  static int checkpage(page_t *, int);
  70   73  
  71   74  /*
  72   75   * The following parameters control operation of the page replacement
  73   76   * algorithm.  They are initialized to 0, and then computed at boot time
  74   77   * based on the size of the system.  If they are patched non-zero in
  75   78   * a loaded vmunix they are left alone and may thus be changed per system
  76   79   * using adb on the loaded system.
  77   80   */
  78      -pgcnt_t         slowscan = 0;
  79      -pgcnt_t         fastscan = 0;
       81 +volatile pgcnt_t        slowscan = 0;
       82 +volatile pgcnt_t        fastscan = 0;
  80   83  
  81      -static pgcnt_t  handspreadpages = 0;
       84 +volatile pgcnt_t        handspreadpages = 0;
  82   85  static int      loopfraction = 2;
  83   86  static pgcnt_t  looppages;
  84      -static int      min_percent_cpu = 4;
       87 +volatile int    min_percent_cpu = 4;
  85   88  static int      max_percent_cpu = 80;
  86   89  static pgcnt_t  maxfastscan = 0;
  87   90  static pgcnt_t  maxslowscan = 100;
  88   91  
  89      -pgcnt_t maxpgio = 0;
  90      -pgcnt_t minfree = 0;
  91      -pgcnt_t desfree = 0;
  92      -pgcnt_t lotsfree = 0;
       92 +volatile pgcnt_t        maxpgio = 0;
       93 +volatile pgcnt_t        minfree = 0;
       94 +volatile pgcnt_t        desfree = 0;
       95 +volatile pgcnt_t        lotsfree = 0;
  93   96  pgcnt_t needfree = 0;
  94      -pgcnt_t throttlefree = 0;
  95      -pgcnt_t pageout_reserve = 0;
       97 +volatile pgcnt_t        throttlefree = 0;
       98 +volatile pgcnt_t        pageout_reserve = 0;
  96   99  
  97  100  pgcnt_t deficit;
  98  101  pgcnt_t nscan;
  99  102  pgcnt_t desscan;
 100  103  
 101  104  /*
 102  105   * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 103  106   * are the number of ticks in each wakeup cycle that gives the
 104  107   * equivalent of some underlying %CPU duty cycle.
 105  108   * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is

 106  109   * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 107  110   * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 108  111   * So, for example, 4% == 1 tick and 80% == 20 ticks.
 109  112   *
 110  113   * min_pageout_ticks:
 111  114   *     ticks/wakeup equivalent of min_percent_cpu.
 112  115   *
 113  116   * max_pageout_ticks:
 114  117   *     ticks/wakeup equivalent of max_percent_cpu.
 115  118   *
 116  119   * pageout_ticks:
 117  120   *     Number of clock ticks budgeted for each wakeup cycle.
 118  121   *     Computed each time around by schedpaging().
 119  122   *     Varies between min_pageout_ticks .. max_pageout_ticks,
 120  123   *     depending on memory pressure.
 121  124   *
 122  125   * pageout_lbolt:
 123  126   *     Timestamp of the last time pageout_scanner woke up and started
 124  127   *     (or resumed) scanning for not recently referenced pages.
 125  128   */
 126  129  
 127  130  static clock_t  min_pageout_ticks;
 128  131  static clock_t  max_pageout_ticks;
 129  132  static clock_t  pageout_ticks;
 130  133  static clock_t  pageout_lbolt;
 131  134  
 132  135  static uint_t   reset_hands;
 133  136  
 134  137  #define PAGES_POLL_MASK 1023
 135  138  
 136  139  /*
 137  140   * pageout_sample_lim:
 138  141   *     The limit on the number of samples needed to establish a value
 139  142   *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
 140  143   *
 141  144   * pageout_sample_cnt:
 142  145   *     Current sample number.  Once the sample gets large enough,
 143  146   *     set new values for handspreadpages, fastscan and slowscan.
 144  147   *
 145  148   * pageout_sample_pages:
 146  149   *     The accumulated number of pages scanned during sampling.
 147  150   *
 148  151   * pageout_sample_ticks:
 149  152   *     The accumulated clock ticks for the sample.
 150  153   *
 151  154   * pageout_rate:
 152  155   *     Rate in pages/nanosecond, computed at the end of sampling.
 153  156   *
 154  157   * pageout_new_spread:
 155  158   *     The new value to use for fastscan and handspreadpages.
 156  159   *     Calculated after enough samples have been taken.
 157  160   */
 158  161  
 159  162  typedef hrtime_t hrrate_t;
 160  163  
 161  164  static uint64_t pageout_sample_lim = 4;
 162  165  static uint64_t pageout_sample_cnt = 0;
 163  166  static pgcnt_t  pageout_sample_pages = 0;
 164  167  static hrrate_t pageout_rate = 0;
 165  168  static pgcnt_t  pageout_new_spread = 0;
 166  169  
 167  170  static clock_t  pageout_cycle_ticks;
 168  171  static hrtime_t sample_start, sample_end;
 169  172  static hrtime_t pageout_sample_etime = 0;
 170  173  
 171  174  /*
 172  175   * Record number of times a pageout_scanner wakeup cycle finished because it
 173  176   * timed out (exceeded its CPU budget), rather than because it visited
 174  177   * its budgeted number of pages.
 175  178   */
 176  179  uint64_t pageout_timeouts = 0;
 177  180  
 178  181  #ifdef VM_STATS
 179  182  static struct pageoutvmstats_str {
 180  183          ulong_t checkpage[3];
 181  184  } pageoutvmstats;
 182  185  #endif /* VM_STATS */
 183  186  
 184  187  /*
 185  188   * Threads waiting for free memory use this condition variable and lock until
 186  189   * memory becomes available.
 187  190   */
 188  191  kmutex_t        memavail_lock;
 189  192  kcondvar_t      memavail_cv;
 190  193  
 191  194  /*
 192  195   * The size of the clock loop.
 193  196   */
 194  197  #define LOOPPAGES       total_pages
 195  198  
 196  199  /*
 197  200   * Set up the paging constants for the clock algorithm.
 198  201   * Called after the system is initialized and the amount of memory
 199  202   * and number of paging devices is known.
 200  203   *
 201  204   * lotsfree is 1/64 of memory, but at least 512K.
 202  205   * desfree is 1/2 of lotsfree.
 203  206   * minfree is 1/2 of desfree.
 204  207   *
 205  208   * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 206  209   *
 207  210   *      lotsfree = btop(512K)
 208  211   *      desfree = btop(200K)
 209  212   *      minfree = btop(100K)
 210  213   *      throttlefree = INT_MIN
 211  214   *      max_percent_cpu = 4
 212  215   */
 213  216  void
 214  217  setupclock(int recalc)
 215  218  {
 216  219  
 217  220          static spgcnt_t init_lfree, init_dfree, init_mfree;
 218  221          static spgcnt_t init_tfree, init_preserve, init_mpgio;
 219  222          static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
 220  223  
 221  224          looppages = LOOPPAGES;
 222  225  
 223  226          /*
 224  227           * setupclock can now be called to recalculate the paging
 225  228           * parameters in the case of dynamic addition of memory.
 226  229           * So to make sure we make the proper calculations, if such a
 227  230           * situation should arise, we save away the initial values
 228  231           * of each parameter so we can recall them when needed. This
 229  232           * way we don't lose the settings an admin might have made
 230  233           * through the /etc/system file.
 231  234           */
 232  235  
 233  236          if (!recalc) {
 234  237                  init_lfree = lotsfree;
 235  238                  init_dfree = desfree;
 236  239                  init_mfree = minfree;
 237  240                  init_tfree = throttlefree;
 238  241                  init_preserve = pageout_reserve;
 239  242                  init_mpgio = maxpgio;
 240  243                  init_mfscan = maxfastscan;
 241  244                  init_fscan = fastscan;
 242  245                  init_sscan = slowscan;
 243  246                  init_hspages = handspreadpages;
 244  247          }
 245  248  
 246  249          /*
 247  250           * Set up thresholds for paging:
 248  251           */
 249  252  
 250  253          /*
 251  254           * Lotsfree is threshold where paging daemon turns on.
 252  255           */
 253  256          if (init_lfree == 0 || init_lfree >= looppages)
 254  257                  lotsfree = MAX(looppages / 64, btop(512 * 1024));
 255  258          else
 256  259                  lotsfree = init_lfree;
 257  260  
 258  261          /*
 259  262           * Desfree is amount of memory desired free.
 260  263           * If less than this for extended period, start swapping.
 261  264           */
 262  265          if (init_dfree == 0 || init_dfree >= lotsfree)
 263  266                  desfree = lotsfree / 2;
 264  267          else
 265  268                  desfree = init_dfree;
 266  269  
 267  270          /*
 268  271           * Minfree is minimal amount of free memory which is tolerable.
 269  272           */
 270  273          if (init_mfree == 0 || init_mfree >= desfree)
 271  274                  minfree = desfree / 2;
 272  275          else
 273  276                  minfree = init_mfree;
 274  277  
 275  278          /*
 276  279           * Throttlefree is the point at which we start throttling
 277  280           * PG_WAIT requests until enough memory becomes available.
 278  281           */
 279  282          if (init_tfree == 0 || init_tfree >= desfree)
 280  283                  throttlefree = minfree;
 281  284          else
 282  285                  throttlefree = init_tfree;
 283  286  
 284  287          /*
 285  288           * Pageout_reserve is the number of pages that we keep in
 286  289           * stock for pageout's own use.  Having a few such pages
 287  290           * provides insurance against system deadlock due to
 288  291           * pageout needing pages.  When freemem < pageout_reserve,
 289  292           * non-blocking allocations are denied to any threads
 290  293           * other than pageout and sched.  (At some point we might
 291  294           * want to consider a per-thread flag like T_PUSHING_PAGES
 292  295           * to indicate that a thread is part of the page-pushing
 293  296           * dance (e.g. an interrupt thread) and thus is entitled
 294  297           * to the same special dispensation we accord pageout.)
 295  298           */
 296  299          if (init_preserve == 0 || init_preserve >= throttlefree)
 297  300                  pageout_reserve = throttlefree / 2;
 298  301          else
 299  302                  pageout_reserve = init_preserve;
 300  303  
 301  304          /*
 302  305           * Maxpgio thresholds how much paging is acceptable.
 303  306           * This figures that 2/3 busy on an arm is all that is
 304  307           * tolerable for paging.  We assume one operation per disk rev.
 305  308           *
 306  309           * XXX - Does not account for multiple swap devices.
 307  310           */
 308  311          if (init_mpgio == 0)
 309  312                  maxpgio = (DISKRPM * 2) / 3;
 310  313          else
 311  314                  maxpgio = init_mpgio;
 312  315  
 313  316          /*
 314  317           * The clock scan rate varies between fastscan and slowscan
 315  318           * based on the amount of free memory available.  Fastscan
 316  319           * rate should be set based on the number pages that can be
 317  320           * scanned per sec using ~10% of processor time.  Since this
 318  321           * value depends on the processor, MMU, Mhz etc., it is
 319  322           * difficult to determine it in a generic manner for all
 320  323           * architectures.
 321  324           *
 322  325           * Instead of trying to determine the number of pages scanned
 323  326           * per sec for every processor, fastscan is set to be the smaller
 324  327           * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
 325  328           * time is limited to ~4% of processor time.
 326  329           *
 327  330           * Setting fastscan to be 1/2 of memory allows pageout to scan
 328  331           * all of memory in ~2 secs.  This implies that user pages not
 329  332           * accessed within 1 sec (assuming, handspreadpages == fastscan)
 330  333           * can be reclaimed when free memory is very low.  Stealing pages
 331  334           * not accessed within 1 sec seems reasonable and ensures that
 332  335           * active user processes don't thrash.
 333  336           *
 334  337           * Smaller values of fastscan result in scanning fewer pages
 335  338           * every second and consequently pageout may not be able to free
 336  339           * sufficient memory to maintain the minimum threshold.  Larger
 337  340           * values of fastscan result in scanning a lot more pages which
 338  341           * could lead to thrashing and higher CPU usage.
 339  342           *
 340  343           * Fastscan needs to be limited to a maximum value and should not
 341  344           * scale with memory to prevent pageout from consuming too much
 342  345           * time for scanning on slow CPU's and avoid thrashing, as a
 343  346           * result of scanning too many pages, on faster CPU's.
 344  347           * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
 345  348           * (the upper bound for fastscan) based on the average number
 346  349           * of pages that can potentially be scanned in ~1 sec (using ~4%
 347  350           * of the CPU) on some of the following machines that currently
 348  351           * run Solaris 2.x:
 349  352           *
 350  353           *                      average memory scanned in ~1 sec
 351  354           *
 352  355           *      25 Mhz SS1+:            23 Meg
 353  356           *      LX:                     37 Meg
 354  357           *      50 Mhz SC2000:          68 Meg
 355  358           *
 356  359           *      40 Mhz 486:             26 Meg
 357  360           *      66 Mhz 486:             42 Meg
 358  361           *
 359  362           * When free memory falls just below lotsfree, the scan rate
 360  363           * goes from 0 to slowscan (i.e., pageout starts running).  This
 361  364           * transition needs to be smooth and is achieved by ensuring that
 362  365           * pageout scans a small number of pages to satisfy the transient
 363  366           * memory demand.  This is set to not exceed 100 pages/sec (25 per
 364  367           * wakeup) since scanning that many pages has no noticeable impact
 365  368           * on system performance.
 366  369           *
 367  370           * In addition to setting fastscan and slowscan, pageout is
 368  371           * limited to using ~4% of the CPU.  This results in increasing
 369  372           * the time taken to scan all of memory, which in turn means that
 370  373           * user processes have a better opportunity of preventing their
 371  374           * pages from being stolen.  This has a positive effect on
 372  375           * interactive and overall system performance when memory demand
 373  376           * is high.
 374  377           *
 375  378           * Thus, the rate at which pages are scanned for replacement will
 376  379           * vary linearly between slowscan and the number of pages that
 377  380           * can be scanned using ~4% of processor time instead of varying
 378  381           * linearly between slowscan and fastscan.
 379  382           *
 380  383           * Also, the processor time used by pageout will vary from ~1%
 381  384           * at slowscan to ~4% at fastscan instead of varying between
 382  385           * ~1% at slowscan and ~10% at fastscan.
 383  386           *
 384  387           * The values chosen for the various VM parameters (fastscan,
 385  388           * handspreadpages, etc) are not universally true for all machines,
 386  389           * but appear to be a good rule of thumb for the machines we've
 387  390           * tested.  They have the following ranges:
 388  391           *
 389  392           *      cpu speed:      20 to 70 Mhz
 390  393           *      page size:      4K to 8K
 391  394           *      memory size:    16M to 5G
 392  395           *      page scan rate: 4000 - 17400 4K pages per sec
 393  396           *
 394  397           * The values need to be re-examined for machines which don't
 395  398           * fall into the various ranges (e.g., slower or faster CPUs,
 396  399           * smaller or larger pagesizes etc) shown above.
 397  400           *
 398  401           * On an MP machine, pageout is often unable to maintain the
 399  402           * minimum paging thresholds under heavy load.  This is due to
 400  403           * the fact that user processes running on other CPU's can be
 401  404           * dirtying memory at a much faster pace than pageout can find
 402  405           * pages to free.  The memory demands could be met by enabling
 403  406           * more than one CPU to run the clock algorithm in such a manner
 404  407           * that the various clock hands don't overlap.  This also makes
 405  408           * it more difficult to determine the values for fastscan, slowscan
 406  409           * and handspreadpages.
 407  410           *
 408  411           * The swapper is currently used to free up memory when pageout
 409  412           * is unable to meet memory demands by swapping out processes.
 410  413           * In addition to freeing up memory, swapping also reduces the
 411  414           * demand for memory by preventing user processes from running
 412  415           * and thereby consuming memory.
 413  416           */
 414  417          if (init_mfscan == 0) {
 415  418                  if (pageout_new_spread != 0)
 416  419                          maxfastscan = pageout_new_spread;
 417  420                  else
 418  421                          maxfastscan = MAXHANDSPREADPAGES;
 419  422          } else {
 420  423                  maxfastscan = init_mfscan;
 421  424          }
 422  425          if (init_fscan == 0)
 423  426                  fastscan = MIN(looppages / loopfraction, maxfastscan);
 424  427          else
 425  428                  fastscan = init_fscan;
 426  429          if (fastscan > looppages / loopfraction)
 427  430                  fastscan = looppages / loopfraction;
 428  431  
 429  432          /*
 430  433           * Set the slow scan rate to 1/10 of the fast scan rate, but
 431  434           * not to exceed maxslowscan or half of fastscan.
 432  435           */
 433  436          if (init_sscan == 0)
 434  437                  slowscan = MIN(fastscan / 10, maxslowscan);
 435  438          else
 436  439                  slowscan = init_sscan;
 437  440          if (slowscan > fastscan / 2)
 438  441                  slowscan = fastscan / 2;
 439  442  
 440  443          /*
 441  444           * Handspreadpages is distance (in pages) between front and back
 442  445           * pageout daemon hands.  The amount of time to reclaim a page
 443  446           * once pageout examines it increases with this distance and
 444  447           * decreases as the scan rate rises. It must be < the amount
 445  448           * of pageable memory.
 446  449           *
 447  450           * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 448  451           * to be "fastscan" results in the front hand being a few secs
 449  452           * (varies based on the processor speed) ahead of the back hand
 450  453           * at fastscan rates.  This distance can be further reduced, if
 451  454           * necessary, by increasing the processor time used by pageout
 452  455           * to be more than ~4% and preferably not more than ~10%.
 453  456           *
 454  457           * As a result, user processes have a much better chance of
 455  458           * referencing their pages before the back hand examines them.
 456  459           * This also significantly lowers the number of reclaims from
 457  460           * the freelist since pageout does not end up freeing pages which
 458  461           * may be referenced a sec later.
 459  462           */
 460  463          if (init_hspages == 0)
 461  464                  handspreadpages = fastscan;
 462  465          else
 463  466                  handspreadpages = init_hspages;
 464  467  
 465  468          /*
 466  469           * Make sure that back hand follows front hand by at least
 467  470           * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
 468  471           * for the back hand to look at a page during the same wakeup of
 469  472           * the pageout daemon in which the front hand cleared its ref bit.
 470  473           */
 471  474          if (handspreadpages >= looppages)
 472  475                  handspreadpages = looppages - 1;
 473  476  
 474  477          /*
 475  478           * If we have been called to recalculate the parameters,
 476  479           * set a flag to re-evaluate the clock hand pointers.
 477  480           */
 478  481          if (recalc)
 479  482                  reset_hands = 1;
 480  483  }
 481  484  
 482  485  /*
 483  486   * Pageout scheduling.
 484  487   *
 485  488   * Schedpaging controls the rate at which the page out daemon runs by
 486  489   * setting the global variables nscan and desscan RATETOSCHEDPAGING
 487  490   * times a second.  Nscan records the number of pages pageout has examined
 488  491   * in its current pass; schedpaging resets this value to zero each time
 489  492   * it runs.  Desscan records the number of pages pageout should examine
 490  493   * in its next pass; schedpaging sets this value based on the amount of
 491  494   * currently available memory.
 492  495   */
 493  496  
 494  497  #define RATETOSCHEDPAGING       4               /* hz that is */
 495  498  
 496  499  static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
 497  500  
 498  501  /*
 499  502   * Pool of available async pageout putpage requests.
 500  503   */
 501  504  static struct async_reqs *push_req;
 502  505  static struct async_reqs *req_freelist; /* available req structs */
 503  506  static struct async_reqs *push_list;    /* pending reqs */
 504  507  static kmutex_t push_lock;              /* protects req pool */
 505  508  static kcondvar_t push_cv;
 506  509  
 507  510  static int async_list_size = 256;       /* number of async request structs */
 508  511  
 509  512  static void pageout_scanner(void);
 510  513  
 511  514  /*
 512  515   * If a page is being shared more than "po_share" times
 513  516   * then leave it alone- don't page it out.
 514  517   */
 515  518  #define MIN_PO_SHARE    (8)
 516  519  #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
 517  520  ulong_t po_share = MIN_PO_SHARE;
 518  521  
 519  522  /*
 520  523   * Schedule rate for paging.
 521  524   * Rate is linear interpolation between
 522  525   * slowscan with lotsfree and fastscan when out of memory.
 523  526   */
 524  527  static void
 525  528  schedpaging(void *arg)
 526  529  {
 527  530          spgcnt_t vavail;
 528  531  
 529  532          if (freemem < lotsfree + needfree + kmem_reapahead)
 530  533                  kmem_reap();
 531  534  
 532  535          if (freemem < lotsfree + needfree)
 533  536                  seg_preap();
 534  537  
 535  538          if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 536  539                  kcage_cageout_wakeup();
 537  540  
 538  541          if (mutex_tryenter(&pageout_mutex)) {
 539  542                  /* pageout() not running */
 540  543                  nscan = 0;
 541  544                  vavail = freemem - deficit;
 542  545                  if (pageout_new_spread != 0)
 543  546                          vavail -= needfree;
 544  547                  if (vavail < 0)
 545  548                          vavail = 0;
 546  549                  if (vavail > lotsfree)
 547  550                          vavail = lotsfree;
 548  551  
 549  552                  /*
 550  553                   * Fix for 1161438 (CRS SPR# 73922).  All variables
 551  554                   * in the original calculation for desscan were 32 bit signed
 552  555                   * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 553  556                   * more of memory, the calculation can overflow.  When this
 554  557                   * happens, desscan becomes negative and pageout_scanner()
 555  558                   * stops paging out.
 556  559                   */
 557  560                  if ((needfree) && (pageout_new_spread == 0)) {
 558  561                          /*
 559  562                           * If we've not yet collected enough samples to
 560  563                           * calculate a spread, use the old logic of kicking
 561  564                           * into high gear anytime needfree is non-zero.
 562  565                           */
 563  566                          desscan = fastscan / RATETOSCHEDPAGING;
 564  567                  } else {
 565  568                          /*
 566  569                           * Once we've calculated a spread based on system
 567  570                           * memory and usage, just treat needfree as another
 568  571                           * form of deficit.
 569  572                           */
 570  573                          spgcnt_t faststmp, slowstmp, result;
 571  574  
 572  575                          slowstmp = slowscan * vavail;
 573  576                          faststmp = fastscan * (lotsfree - vavail);
 574  577                          result = (slowstmp + faststmp) /
 575  578                              nz(lotsfree) / RATETOSCHEDPAGING;
 576  579                          desscan = (pgcnt_t)result;
 577  580                  }
 578  581  
 579  582                  pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
 580  583                      (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
 581  584  
 582  585                  if (freemem < lotsfree + needfree ||
 583  586                      pageout_sample_cnt < pageout_sample_lim) {
 584  587                          TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
 585  588                              "pageout_cv_signal:freemem %ld", freemem);
 586  589                          cv_signal(&proc_pageout->p_cv);
 587  590                  } else {
 588  591                          /*
 589  592                           * There are enough free pages, no need to
 590  593                           * kick the scanner thread.  And next time
 591  594                           * around, keep more of the `highly shared'
 592  595                           * pages.
 593  596                           */
 594  597                          cv_signal_pageout();
 595  598                          if (po_share > MIN_PO_SHARE) {
 596  599                                  po_share >>= 1;
 597  600                          }
 598  601                  }
 599  602                  mutex_exit(&pageout_mutex);
 600  603          }
 601  604  
 602  605          /*
 603  606           * Signal threads waiting for available memory.
 604  607           * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 605  608           * in this case it is not needed - the waiters will be woken up during
 606  609           * the next invocation of this function.
 607  610           */
 608  611          if (kmem_avail() > 0)
 609  612                  cv_broadcast(&memavail_cv);
 610  613  
 611  614          (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
 612  615  }
 613  616  
 614  617  pgcnt_t         pushes;
 615  618  ulong_t         push_list_size;         /* # of requests on pageout queue */
 616  619  
 617  620  #define FRONT   1
 618  621  #define BACK    2
 619  622  
 620  623  int dopageout = 1;      /* must be non-zero to turn page stealing on */
 621  624  
 622  625  /*
 623  626   * The page out daemon, which runs as process 2.
 624  627   *
 625  628   * As long as there are at least lotsfree pages,
 626  629   * this process is not run.  When the number of free
 627  630   * pages stays in the range desfree to lotsfree,
 628  631   * this daemon runs through the pages in the loop
 629  632   * at a rate determined in schedpaging().  Pageout manages
 630  633   * two hands on the clock.  The front hand moves through
 631  634   * memory, clearing the reference bit,
 632  635   * and stealing pages from procs that are over maxrss.
 633  636   * The back hand travels a distance behind the front hand,
 634  637   * freeing the pages that have not been referenced in the time
 635  638   * since the front hand passed.  If modified, they are pushed to
 636  639   * swap before being freed.
 637  640   *
 638  641   * There are 2 threads that act on behalf of the pageout process.
 639  642   * One thread scans pages (pageout_scanner) and frees them up if
 640  643   * they don't require any VOP_PUTPAGE operation. If a page must be
 641  644   * written back to its backing store, the request is put on a list
 642  645   * and the other (pageout) thread is signaled. The pageout thread
 643  646   * grabs VOP_PUTPAGE requests from the list, and processes them.
 644  647   * Some filesystems may require resources for the VOP_PUTPAGE
 645  648   * operations (like memory) and hence can block the pageout
 646  649   * thread, but the scanner thread can still operate. There is still
 647  650   * no guarantee that memory deadlocks cannot occur.
 648  651   *
 649  652   * For now, this thing is in very rough form.
 650  653   */
 651  654  void
 652  655  pageout()
 653  656  {
 654  657          struct async_reqs *arg;
 655  658          pri_t pageout_pri;
 656  659          int i;
 657  660          pgcnt_t max_pushes;
 658  661          callb_cpr_t cprinfo;
 659  662  
 660  663          proc_pageout = ttoproc(curthread);
 661  664          proc_pageout->p_cstime = 0;
 662  665          proc_pageout->p_stime =  0;
 663  666          proc_pageout->p_cutime =  0;
 664  667          proc_pageout->p_utime = 0;
 665  668          bcopy("pageout", PTOU(curproc)->u_psargs, 8);
 666  669          bcopy("pageout", PTOU(curproc)->u_comm, 7);
 667  670  
 668  671          /*
 669  672           * Create pageout scanner thread
 670  673           */
 671  674          mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
 672  675          mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
 673  676  
 674  677          /*
 675  678           * Allocate and initialize the async request structures
 676  679           * for pageout.
 677  680           */
 678  681          push_req = (struct async_reqs *)
 679  682              kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
 680  683  
 681  684          req_freelist = push_req;
 682  685          for (i = 0; i < async_list_size - 1; i++)
 683  686                  push_req[i].a_next = &push_req[i + 1];
 684  687  
 685  688          pageout_pri = curthread->t_pri;
 686  689  
 687  690          /* Create the pageout scanner thread. */
 688  691          (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
 689  692              pageout_pri - 1);
 690  693  
 691  694          /*
 692  695           * kick off pageout scheduler.
 693  696           */
 694  697          schedpaging(NULL);
 695  698  
 696  699          /*
 697  700           * Create kernel cage thread.
 698  701           * The kernel cage thread is started under the pageout process
 699  702           * to take advantage of the less restricted page allocation
 700  703           * in page_create_throttle().
 701  704           */
 702  705          kcage_cageout_init();
 703  706  
 704  707          /*
 705  708           * Limit pushes to avoid saturating pageout devices.
 706  709           */
 707  710          max_pushes = maxpgio / RATETOSCHEDPAGING;
 708  711          CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
 709  712  
 710  713          for (;;) {
 711  714                  mutex_enter(&push_lock);
 712  715  
 713  716                  while ((arg = push_list) == NULL || pushes > max_pushes) {
 714  717                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
 715  718                          cv_wait(&push_cv, &push_lock);
 716  719                          pushes = 0;
 717  720                          CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
 718  721                  }
 719  722                  push_list = arg->a_next;
 720  723                  arg->a_next = NULL;
 721  724                  mutex_exit(&push_lock);
 722  725  
 723  726                  if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
 724  727                      arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
 725  728                          pushes++;
 726  729                  }
 727  730  
 728  731                  /* vp held by checkpage() */
 729  732                  VN_RELE(arg->a_vp);
 730  733  
 731  734                  mutex_enter(&push_lock);
 732  735                  arg->a_next = req_freelist;     /* back on freelist */
 733  736                  req_freelist = arg;
 734  737                  push_list_size--;
 735  738                  mutex_exit(&push_lock);
 736  739          }
 737  740  }
 738  741  
 739  742  /*
 740  743   * Kernel thread that scans pages looking for ones to free
 741  744   */
 742  745  static void
 743  746  pageout_scanner(void)
 744  747  {
 745  748          struct page *fronthand, *backhand;
 746  749          uint_t count;
 747  750          callb_cpr_t cprinfo;
 748  751          pgcnt_t nscan_limit;
 749  752          pgcnt_t pcount;
 750  753  
 751  754          CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
 752  755          mutex_enter(&pageout_mutex);
 753  756  
 754  757          /*
 755  758           * The restart case does not attempt to point the hands at roughly
 756  759           * the right point on the assumption that after one circuit things
 757  760           * will have settled down - and restarts shouldn't be that often.
 758  761           */
 759  762  
 760  763          /*
 761  764           * Set the two clock hands to be separated by a reasonable amount,
 762  765           * but no more than 360 degrees apart.
 763  766           */
 764  767          backhand = page_first();
 765  768          if (handspreadpages >= total_pages)
 766  769                  fronthand = page_nextn(backhand, total_pages - 1);
 767  770          else
 768  771                  fronthand = page_nextn(backhand, handspreadpages);
 769  772  
 770  773          min_pageout_ticks = MAX(1,
 771  774              ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
 772  775          max_pageout_ticks = MAX(min_pageout_ticks,
 773  776              ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
 774  777  
 775  778  loop:
 776  779          cv_signal_pageout();
 777  780  
 778  781          CALLB_CPR_SAFE_BEGIN(&cprinfo);
 779  782          cv_wait(&proc_pageout->p_cv, &pageout_mutex);
 780  783          CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
 781  784  
 782  785          if (!dopageout)
 783  786                  goto loop;
 784  787  
 785  788          if (reset_hands) {
 786  789                  reset_hands = 0;
 787  790  
 788  791                  backhand = page_first();
 789  792                  if (handspreadpages >= total_pages)
 790  793                          fronthand = page_nextn(backhand, total_pages - 1);
 791  794                  else
 792  795                          fronthand = page_nextn(backhand, handspreadpages);
 793  796          }
 794  797  
 795  798          CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
 796  799          count = 0;
 797  800  
 798  801          TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
 799  802              "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
 800  803              freemem, lotsfree, nscan, desscan);
 801  804  
 802  805          /* Kernel probe */
 803  806          TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
 804  807              tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
 805  808  
 806  809          pcount = 0;
 807  810          if (pageout_sample_cnt < pageout_sample_lim) {
 808  811                  nscan_limit = total_pages;
 809  812          } else {
 810  813                  nscan_limit = desscan;
 811  814          }
 812  815          pageout_lbolt = ddi_get_lbolt();
 813  816          sample_start = gethrtime();
 814  817  
 815  818          /*
 816  819           * Scan the appropriate number of pages for a single duty cycle.
 817  820           * However, stop scanning as soon as there is enough free memory.
 818  821           * For a short while, we will be sampling the performance of the
 819  822           * scanner and need to keep running just to get sample data, in
 820  823           * which case we keep going and don't pay attention to whether
 821  824           * or not there is enough free memory.
 822  825           */
 823  826  
 824  827          while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
 825  828              pageout_sample_cnt < pageout_sample_lim)) {
 826  829                  int rvfront, rvback;
 827  830  
 828  831                  /*
 829  832                   * Check to see if we have exceeded our %CPU budget
 830  833                   * for this wakeup, but not on every single page visited,
 831  834                   * just every once in a while.
 832  835                   */
 833  836                  if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
 834  837                          pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
 835  838                          if (pageout_cycle_ticks >= pageout_ticks) {
 836  839                                  ++pageout_timeouts;
 837  840                                  break;
 838  841                          }
 839  842                  }
 840  843  
 841  844                  /*
 842  845                   * If checkpage manages to add a page to the free list,
 843  846                   * we give ourselves another couple of trips around the loop.
 844  847                   */
 845  848                  if ((rvfront = checkpage(fronthand, FRONT)) == 1)
 846  849                          count = 0;
 847  850                  if ((rvback = checkpage(backhand, BACK)) == 1)
 848  851                          count = 0;
 849  852  
 850  853                  ++pcount;
 851  854  
 852  855                  /*
 853  856                   * protected by pageout_mutex instead of cpu_stat_lock
 854  857                   */
 855  858                  CPU_STATS_ADDQ(CPU, vm, scan, 1);
 856  859  
 857  860                  /*
 858  861                   * Don't include ineligible pages in the number scanned.
 859  862                   */
 860  863                  if (rvfront != -1 || rvback != -1)
 861  864                          nscan++;
 862  865  
 863  866                  backhand = page_next(backhand);
 864  867  
 865  868                  /*
 866  869                   * backhand update and wraparound check are done separately
 867  870                   * because lint barks when it finds an empty "if" body
 868  871                   */
 869  872  
 870  873                  if ((fronthand = page_next(fronthand)) == page_first()) {
 871  874                          TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
 872  875                              "pageout_hand_wrap:freemem %ld whichhand %d",
 873  876                              freemem, FRONT);
 874  877  
 875  878                          /*
 876  879                           * protected by pageout_mutex instead of cpu_stat_lock
 877  880                           */
 878  881                          CPU_STATS_ADDQ(CPU, vm, rev, 1);
 879  882                          if (++count > 1) {
 880  883                                  /*
 881  884                                   * Extremely unlikely, but it happens.
 882  885                                   * We went around the loop at least once
 883  886                                   * and didn't get far enough.
 884  887                                   * If we are still skipping `highly shared'
 885  888                                   * pages, skip fewer of them.  Otherwise,
 886  889                                   * give up till the next clock tick.
 887  890                                   */
 888  891                                  if (po_share < MAX_PO_SHARE) {
 889  892                                          po_share <<= 1;
 890  893                                  } else {
 891  894                                          /*
 892  895                                           * Really a "goto loop", but
 893  896                                           * if someone is TRACing or
 894  897                                           * TNF_PROBE_ing, at least
 895  898                                           * make records to show
 896  899                                           * where we are.
 897  900                                           */
 898  901                                          break;
 899  902                                  }
 900  903                          }
 901  904                  }
 902  905          }
 903  906  
 904  907          sample_end = gethrtime();
 905  908  
 906  909          TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
 907  910              "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
 908  911              freemem, lotsfree, nscan, desscan, count);
 909  912  
 910  913          /* Kernel probe */
 911  914          TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
 912  915              tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
 913  916  
 914  917          if (pageout_sample_cnt < pageout_sample_lim) {
 915  918                  pageout_sample_pages += pcount;
 916  919                  pageout_sample_etime += sample_end - sample_start;
 917  920                  ++pageout_sample_cnt;
 918  921          }
 919  922          if (pageout_sample_cnt >= pageout_sample_lim &&
 920  923              pageout_new_spread == 0) {
 921  924                  pageout_rate = (hrrate_t)pageout_sample_pages *
 922  925                      (hrrate_t)(NANOSEC) / pageout_sample_etime;
 923  926                  pageout_new_spread = pageout_rate / 10;
 924  927                  setupclock(1);
 925  928          }
 926  929  
 927  930          goto loop;
 928  931  }
 929  932  
 930  933  /*
 931  934   * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 932  935   * system (u., page table) or free, then leave it alone.  Otherwise,
 933  936   * if we are running the front hand, turn off the page's reference bit.
 934  937   * If the proc is over maxrss, we take it.  If running the back hand,
 935  938   * check whether the page has been reclaimed.  If not, free the page,
 936  939   * pushing it to disk first if necessary.
 937  940   *
 938  941   * Return values:
 939  942   *      -1 if the page is not a candidate at all,
 940  943   *       0 if not freed, or
 941  944   *       1 if we freed it.
 942  945   */
 943  946  static int
 944  947  checkpage(struct page *pp, int whichhand)
 945  948  {
 946  949          int ppattr;
 947  950          int isfs = 0;
 948  951          int isexec = 0;
 949  952          int pagesync_flag;
 950  953  
 951  954          /*
 952  955           * Skip pages:
 953  956           *      - associated with the kernel vnode since
 954  957           *          they are always "exclusively" locked.
 955  958           *      - that are free
 956  959           *      - that are shared more than po_share'd times
 957  960           *      - its already locked
 958  961           *
 959  962           * NOTE:  These optimizations assume that reads are atomic.
 960  963           */
 961  964  
 962  965          if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
 963  966              pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
 964  967              hat_page_checkshare(pp, po_share)) {
 965  968                  return (-1);
 966  969          }
 967  970  
 968  971          if (!page_trylock(pp, SE_EXCL)) {
 969  972                  /*
 970  973                   * Skip the page if we can't acquire the "exclusive" lock.
 971  974                   */
 972  975                  return (-1);
 973  976          } else if (PP_ISFREE(pp)) {
 974  977                  /*
 975  978                   * It became free between the above check and our actually
 976  979                   * locking the page.  Oh, well there will be other pages.
 977  980                   */
 978  981                  page_unlock(pp);
 979  982                  return (-1);
 980  983          }
 981  984  
 982  985          /*
 983  986           * Reject pages that cannot be freed. The page_struct_lock
 984  987           * need not be acquired to examine these
 985  988           * fields since the page has an "exclusive" lock.
 986  989           */
 987  990          if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
 988  991                  page_unlock(pp);
 989  992                  return (-1);
 990  993          }
 991  994  
 992  995          /*
 993  996           * Maintain statistics for what we are freeing
 994  997           */
 995  998  
 996  999          if (pp->p_vnode != NULL) {
 997 1000                  if (pp->p_vnode->v_flag & VVMEXEC)
 998 1001                          isexec = 1;
 999 1002  
1000 1003                  if (!IS_SWAPFSVP(pp->p_vnode))
1001 1004                          isfs = 1;
1002 1005          }
1003 1006  
1004 1007          /*
1005 1008           * Turn off REF and MOD bits with the front hand.
1006 1009           * The back hand examines the REF bit and always considers
1007 1010           * SHARED pages as referenced.
1008 1011           */
1009 1012          if (whichhand == FRONT)
1010 1013                  pagesync_flag = HAT_SYNC_ZERORM;
1011 1014          else
1012 1015                  pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1013 1016                      HAT_SYNC_STOPON_SHARED;
1014 1017  
1015 1018          ppattr = hat_pagesync(pp, pagesync_flag);
1016 1019  
1017 1020  recheck:
1018 1021          /*
1019 1022           * If page is referenced; make unreferenced but reclaimable.
1020 1023           * If this page is not referenced, then it must be reclaimable
1021 1024           * and we can add it to the free list.
1022 1025           */
1023 1026          if (ppattr & P_REF) {
1024 1027                  TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1025 1028                      "pageout_isref:pp %p whichhand %d", pp, whichhand);
1026 1029                  if (whichhand == FRONT) {
1027 1030                          /*
1028 1031                           * Checking of rss or madvise flags needed here...
1029 1032                           *
1030 1033                           * If not "well-behaved", fall through into the code
1031 1034                           * for not referenced.
1032 1035                           */
1033 1036                          hat_clrref(pp);
1034 1037                  }
1035 1038                  /*
1036 1039                   * Somebody referenced the page since the front
1037 1040                   * hand went by, so it's not a candidate for
1038 1041                   * freeing up.
1039 1042                   */
1040 1043                  page_unlock(pp);
1041 1044                  return (0);
1042 1045          }
1043 1046  
1044 1047          VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1045 1048  
1046 1049          /*
1047 1050           * If large page, attempt to demote it. If successfully demoted,
1048 1051           * retry the checkpage.
1049 1052           */
1050 1053          if (pp->p_szc != 0) {
1051 1054                  if (!page_try_demote_pages(pp)) {
1052 1055                          VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1053 1056                          page_unlock(pp);
1054 1057                          return (-1);
1055 1058                  }
1056 1059                  ASSERT(pp->p_szc == 0);
1057 1060                  VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1058 1061                  /*
1059 1062                   * since page_try_demote_pages() could have unloaded some
1060 1063                   * mappings it makes sense to reload ppattr.
1061 1064                   */
1062 1065                  ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1063 1066          }
1064 1067  
1065 1068          /*
1066 1069           * If the page is currently dirty, we have to arrange
1067 1070           * to have it cleaned before it can be freed.
1068 1071           *
1069 1072           * XXX - ASSERT(pp->p_vnode != NULL);
1070 1073           */
1071 1074          if ((ppattr & P_MOD) && pp->p_vnode) {
1072 1075                  struct vnode *vp = pp->p_vnode;
1073 1076                  u_offset_t offset = pp->p_offset;
1074 1077  
1075 1078                  /*
1076 1079                   * XXX - Test for process being swapped out or about to exit?
1077 1080                   * [Can't get back to process(es) using the page.]
1078 1081                   */
1079 1082  
1080 1083                  /*
1081 1084                   * Hold the vnode before releasing the page lock to
1082 1085                   * prevent it from being freed and re-used by some
1083 1086                   * other thread.
1084 1087                   */
1085 1088                  VN_HOLD(vp);
1086 1089                  page_unlock(pp);
1087 1090  
1088 1091                  /*
1089 1092                   * Queue i/o request for the pageout thread.
1090 1093                   */
1091 1094                  if (!queue_io_request(vp, offset)) {
1092 1095                          VN_RELE(vp);
1093 1096                          return (0);
1094 1097                  }
1095 1098                  return (1);
1096 1099          }
1097 1100  
1098 1101          /*
1099 1102           * Now we unload all the translations,
1100 1103           * and put the page back on to the free list.
1101 1104           * If the page was used (referenced or modified) after
1102 1105           * the pagesync but before it was unloaded we catch it
1103 1106           * and handle the page properly.
1104 1107           */
1105 1108          TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1106 1109              "pageout_free:pp %p whichhand %d", pp, whichhand);
1107 1110          (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1108 1111          ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1109 1112          if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1110 1113                  goto recheck;
1111 1114  
1112 1115          /*LINTED: constant in conditional context*/
1113 1116          VN_DISPOSE(pp, B_FREE, 0, kcred);
1114 1117  
1115 1118          CPU_STATS_ADD_K(vm, dfree, 1);
1116 1119  
1117 1120          if (isfs) {
1118 1121                  if (isexec) {
1119 1122                          CPU_STATS_ADD_K(vm, execfree, 1);
1120 1123                  } else {
1121 1124                          CPU_STATS_ADD_K(vm, fsfree, 1);
1122 1125                  }
1123 1126          } else {
1124 1127                  CPU_STATS_ADD_K(vm, anonfree, 1);
1125 1128          }
1126 1129  
1127 1130          return (1);             /* freed a page! */
1128 1131  }
1129 1132  
1130 1133  /*
1131 1134   * Queue async i/o request from pageout_scanner and segment swapout
1132 1135   * routines on one common list.  This ensures that pageout devices (swap)
1133 1136   * are not saturated by pageout_scanner or swapout requests.
1134 1137   * The pageout thread empties this list by initiating i/o operations.
1135 1138   */
1136 1139  int
1137 1140  queue_io_request(vnode_t *vp, u_offset_t off)
1138 1141  {
1139 1142          struct async_reqs *arg;
1140 1143  
1141 1144          /*
1142 1145           * If we cannot allocate an async request struct,
1143 1146           * skip this page.
1144 1147           */
1145 1148          mutex_enter(&push_lock);
1146 1149          if ((arg = req_freelist) == NULL) {
1147 1150                  mutex_exit(&push_lock);
1148 1151                  return (0);
1149 1152          }
1150 1153          req_freelist = arg->a_next;             /* adjust freelist */
1151 1154          push_list_size++;
1152 1155  
1153 1156          arg->a_vp = vp;
1154 1157          arg->a_off = off;
1155 1158          arg->a_len = PAGESIZE;
1156 1159          arg->a_flags = B_ASYNC | B_FREE;
1157 1160          arg->a_cred = kcred;            /* always held */
1158 1161  
1159 1162          /*
1160 1163           * Add to list of pending write requests.
1161 1164           */
1162 1165          arg->a_next = push_list;
1163 1166          push_list = arg;
1164 1167  
1165 1168          if (req_freelist == NULL) {
1166 1169                  /*
1167 1170                   * No free async requests left. The lock is held so we
1168 1171                   * might as well signal the pusher thread now.
1169 1172                   */
1170 1173                  cv_signal(&push_cv);
1171 1174          }
1172 1175          mutex_exit(&push_lock);
1173 1176          return (1);
1174 1177  }
1175 1178  
1176 1179  /*
1177 1180   * Wakeup pageout to initiate i/o if push_list is not empty.
1178 1181   */
1179 1182  void
1180 1183  cv_signal_pageout()
1181 1184  {
1182 1185          if (push_list != NULL) {
1183 1186                  mutex_enter(&push_lock);
1184 1187                  cv_signal(&push_cv);
1185 1188                  mutex_exit(&push_lock);
1186 1189          }
1187 1190  }

↓ open down ↓

1082 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX