13097 improve VM tunables for modern systems (fix mismerge)
--- old/usr/src/uts/common/os/vm_pageout.c
+++ new/usr/src/uts/common/os/vm_pageout.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2021 Oxide Computer Company
24 24 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 29 * Use is subject to license terms.
30 30 * Copyright 2018 Joyent, Inc.
31 31 */
32 32
33 33 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
34 34 /* All Rights Reserved */
35 35
36 36 /*
37 37 * University Copyright- Copyright (c) 1982, 1986, 1988
38 38 * The Regents of the University of California
39 39 * All Rights Reserved
40 40 *
41 41 * University Acknowledgment- Portions of this document are derived from
42 42 * software developed by the University of California, Berkeley, and its
43 43 * contributors.
44 44 */
45 45
46 46 #include <sys/types.h>
47 47 #include <sys/t_lock.h>
48 48 #include <sys/param.h>
49 49 #include <sys/buf.h>
50 50 #include <sys/uio.h>
51 51 #include <sys/proc.h>
52 52 #include <sys/systm.h>
53 53 #include <sys/mman.h>
54 54 #include <sys/cred.h>
55 55 #include <sys/vnode.h>
56 56 #include <sys/vm.h>
57 57 #include <sys/vmparam.h>
58 58 #include <sys/vtrace.h>
59 59 #include <sys/cmn_err.h>
60 60 #include <sys/cpuvar.h>
61 61 #include <sys/user.h>
62 62 #include <sys/kmem.h>
63 63 #include <sys/debug.h>
64 64 #include <sys/callb.h>
65 65 #include <sys/tnf_probe.h>
66 66 #include <sys/mem_cage.h>
67 67 #include <sys/time.h>
68 68 #include <sys/zone.h>
69 69 #include <sys/stdbool.h>
70 70
71 71 #include <vm/hat.h>
72 72 #include <vm/as.h>
73 73 #include <vm/seg.h>
74 74 #include <vm/page.h>
75 75 #include <vm/pvn.h>
76 76 #include <vm/seg_kmem.h>
77 77
78 78 /*
79 79 * FREE MEMORY MANAGEMENT
80 80 *
81 81 * Management of the pool of free pages is a tricky business. There are
82 82 * several critical threshold values which constrain our allocation of new
83 83 * pages and inform the rate of paging out of memory to swap. These threshold
84 84 * values, and the behaviour they induce, are described below in descending
85 85 * order of size -- and thus increasing order of severity!
86 86 *
87 87 * +---------------------------------------------------- physmem (all memory)
88 88 * |
89 89 * | Ordinarily there are no particular constraints placed on page
90 90 * v allocation. The page scanner is not running and page_create_va()
91 91 * | will effectively grant all page requests (whether from the kernel
92 92 * | or from user processes) without artificial delay.
93 93 * |
94 94 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
95 95 * |
96 96 * | When we have less than "lotsfree" pages, pageout_scanner() is
97 97 * v signalled by schedpaging() to begin looking for pages that can
98 98 * | be evicted to disk to bring us back above lotsfree. At this
99 99 * | stage there is still no constraint on allocation of free pages.
100 100 * |
101 101 * | For small systems, we set a lower bound of 16MB for lotsfree;
102 102 * v this is the natural value for a system with 1GB memory. This is
103 103 * | to ensure that the pageout reserve pool contains at least 4MB
104 104 * | for use by ZFS.
105 105 * |
106 106 * | For systems with a large amount of memory, we constrain lotsfree
107 107 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as
108 108 * v at some point the required slack relates more closely to the
109 109 * | rate at which paging can occur than to the total amount of memory.
110 110 * |
111 111 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
112 112 * |
113 113 * | When we drop below desfree, a number of kernel facilities will
114 114 * v wait before allocating more memory, under the assumption that
115 115 * | pageout or reaping will make progress and free up some memory.
116 116 * | This behaviour is not especially coordinated; look for comparisons
117 117 * | of desfree and freemem.
118 118 * |
119 119 * | In addition to various attempts at advisory caution, clock()
120 120 * | will wake up the thread that is ordinarily parked in sched().
121 121 * | This routine is responsible for the heavy-handed swapping out
122 122 * v of entire processes in an attempt to arrest the slide of free
123 123 * | memory. See comments in sched.c for more details.
124 124 * |
125 125 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
126 126 * |
127 127 * | These two separate tunables have, by default, the same value.
128 128 * v Various parts of the kernel use minfree to signal the need for
129 129 * | more aggressive reclamation of memory, and sched() is more
130 130 * | aggressive at swapping processes out.
131 131 * |
132 132 * | If free memory falls below throttlefree, page_create_va() will
133 133 * | use page_create_throttle() to begin holding most requests for
134 134 * | new pages while pageout and reaping free up memory. Sleeping
135 135 * v allocations (e.g., KM_SLEEP) are held here while we wait for
136 136 * | more memory. Non-sleeping allocations are generally allowed to
137 137 * | proceed, unless their priority is explicitly lowered with
138 138 * | KM_NORMALPRI.
139 139 * |
140 140 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
141 141 * |
142 142 * | When we hit throttlefree, the situation is already dire. The
143 143 * v system is generally paging out memory and swapping out entire
144 144 * | processes in order to free up memory for continued operation.
145 145 * |
146 146 * | Unfortunately, evicting memory to disk generally requires short
147 147 * | term use of additional memory; e.g., allocation of buffers for
148 148 * | storage drivers, updating maps of free and used blocks, etc.
149 149 * | As such, pageout_reserve is the number of pages that we keep in
150 150 * | special reserve for use by pageout() and sched() and by any
151 151 * v other parts of the kernel that need to be working for those to
152 152 * | make forward progress such as the ZFS I/O pipeline.
153 153 * |
154 154 * | When we are below pageout_reserve, we fail or hold any allocation
155 155 * | that has not explicitly requested access to the reserve pool.
156 156 * | Access to the reserve is generally granted via the KM_PUSHPAGE
157 157 * | flag, or by marking a thread T_PUSHPAGE such that all allocations
158 158 * | can implicitly tap the reserve. For more details, see the
159 159 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
160 160 * | and VM_PUSHPAGE allocation flags, and page_create_throttle().
161 161 * |
162 162 * +---------------------------------------------------------- no free memory
163 163 * |
164 164 * | If we have arrived here, things are very bad indeed. It is
165 165 * v surprisingly difficult to tell if this condition is even fatal,
166 166 * | as enough memory may have been granted to pageout() and to the
167 167 * | ZFS I/O pipeline that requests for eviction that have already been
168 168 * | made will complete and free up memory some time soon.
169 169 * |
170 170 * | If free memory does not materialise, the system generally remains
171 171 * | deadlocked. The pageout_deadman() below is run once per second
172 172 * | from clock(), seeking to limit the amount of time a single request
173 173 * v to page out can be blocked before the system panics to get a crash
174 174 * | dump and return to service.
175 175 * |
176 176 * +-------------------------------------------------------------------------
177 177 */
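
To make the cascade above concrete, here is a minimal user-space sketch (an illustration only, not part of this change) that reproduces the default threshold arithmetic for a hypothetical 128 GiB machine, using the fractions and clamps documented in this comment:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t mb = 1024ULL * 1024ULL;
    	uint64_t physmem = 128ULL * 1024ULL * mb;	/* hypothetical 128 GiB */

    	/* lotsfree: 1/64 of memory, clamped to [16 MB, 2 GB]. */
    	uint64_t lotsfree = physmem / 64;
    	if (lotsfree < 16 * mb)
    		lotsfree = 16 * mb;
    	if (lotsfree > 2048 * mb)
    		lotsfree = 2048 * mb;

    	uint64_t desfree = lotsfree / 2;		/* 1/2 of lotsfree */
    	uint64_t minfree = 3 * desfree / 4;		/* 3/4 of desfree */
    	uint64_t throttlefree = minfree;		/* same default */
    	uint64_t pageout_reserve = 3 * throttlefree / 4;

    	/* Prints 2048, 1024, 768, 768 and 576 MB respectively. */
    	printf("%llu %llu %llu %llu %llu\n",
    	    (unsigned long long)(lotsfree / mb),
    	    (unsigned long long)(desfree / mb),
    	    (unsigned long long)(minfree / mb),
    	    (unsigned long long)(throttlefree / mb),
    	    (unsigned long long)(pageout_reserve / mb));
    	return (0);
    }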
178 178
179 179 /*
180 180 * The following parameters control operation of the page replacement
181 181 * algorithm. They are initialized to 0, and then computed at boot time based
182 182 * on the size of the system; see setupclock(). If they are patched non-zero
183 183 * in a loaded vmunix they are left alone and may thus be changed per system
184 184 * using "mdb -kw" on the loaded system.
185 185 */
186 186 pgcnt_t slowscan = 0;
187 187 pgcnt_t fastscan = 0;
188 188
189 189 static pgcnt_t handspreadpages = 0;
190 190
191 191 /*
192 192 * looppages:
193 193 * Cached copy of the total number of pages in the system (total_pages).
194 194 *
195 195 * loopfraction:
196 196 * Divisor used to relate fastscan to looppages in setupclock().
197 197 */
198 198 static uint_t loopfraction = 2;
199 199 static pgcnt_t looppages;
200 200
201 201 static uint_t min_percent_cpu = 4;
202 202 static uint_t max_percent_cpu = 80;
203 203 static pgcnt_t maxfastscan = 0;
204 204 static pgcnt_t maxslowscan = 100;
205 205
206 206 #define MEGABYTES (1024ULL * 1024ULL)
207 207
208 208 /*
209 209 * pageout_threshold_style:
210 210 * set to 1 to use the previous default threshold size calculation;
211 211 * i.e., each threshold is half of the next largest value.
212 212 */
213 213 uint_t pageout_threshold_style = 0;
214 214
215 215 /*
216 216 * The operator may override these tunables to request a different minimum or
217 217 * maximum lotsfree value, or to change the divisor we use for automatic
218 218 * sizing.
219 219 *
220 220 * By default, we make lotsfree 1/64th of the total memory in the machine. The
221 221 * minimum and maximum are specified in bytes, rather than pages; a zero value
222 222 * means the default values (below) are used.
223 223 */
224 224 uint_t lotsfree_fraction = 64;
225 225 pgcnt_t lotsfree_min = 0;
226 226 pgcnt_t lotsfree_max = 0;
227 227
228 228 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
229 229 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
230 230
231 231 /*
232 232 * If these tunables are set to non-zero values in /etc/system, and provided
233 233 * the value is not larger than the threshold above, the specified value will
234 234 * be used directly without any additional calculation or adjustment. The boot
235 235 * time value of these overrides is preserved in the "clockinit" struct. More
236 236 * detail is available in the comment at the top of the file.
237 237 */
238 238 pgcnt_t maxpgio = 0;
239 239 pgcnt_t minfree = 0;
240 240 pgcnt_t desfree = 0;
241 241 pgcnt_t lotsfree = 0;
242 242 pgcnt_t needfree = 0;
243 243 pgcnt_t throttlefree = 0;
244 244 pgcnt_t pageout_reserve = 0;
245 +pri_t pageout_pri;
245 246
246 247 pgcnt_t deficit;
247 248 pgcnt_t nscan;
248 249 pgcnt_t desscan;
249 250
250 251 /* kstats */
251 252 uint64_t low_mem_scan;
252 253 uint64_t zone_cap_scan;
253 -uint64_t n_throttle;
254 254
255 +#define MAX_PSCAN_THREADS 16
256 +
255 257 /*
256 258 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
257 259 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
258 260 * that gives the equivalent of some underlying %CPU duty cycle.
259 261 *
260 262 * min_pageout_nsec:
261 263 * nanoseconds/wakeup equivalent of min_percent_cpu.
262 264 *
263 265 * max_pageout_nsec:
264 266 * nanoseconds/wakeup equivalent of max_percent_cpu.
265 267 *
266 268 * pageout_nsec:
267 269 * Number of nanoseconds budgeted for each wakeup cycle.
268 270 * Computed each time around by schedpaging().
269 271 * Varies between min_pageout_nsec and max_pageout_nsec,
270 272 * depending on memory pressure or zones over their cap.
271 273 *
272 274 * zone_pageout_nsec:
273 - * Number of nanoseconds budget for each cycle when a zone
274 - * is over its memory cap. If this is zero, then the value
275 - * of max_pageout_nsec is used instead.
 275 +	 *	Number of nanoseconds budgeted for each cycle when a zone
276 + * is over its memory cap. If this is zero, then the value
277 + * of max_pageout_nsec is used instead.
276 278 */
277 -
278 279 static hrtime_t min_pageout_nsec;
279 280 static hrtime_t max_pageout_nsec;
280 281 static hrtime_t pageout_nsec;
281 282 static hrtime_t zone_pageout_nsec;
282 283
283 -#define MAX_PSCAN_THREADS 16
284 -static boolean_t reset_hands[MAX_PSCAN_THREADS];
284 +static boolean_t reset_hands[MAX_PSCAN_THREADS];
285 285
286 +#define PAGES_POLL_MASK 1023
287 +#define SCHEDPAGING_HZ 4
288 +
286 289 /*
287 - * These can be tuned in /etc/system or set with mdb.
288 - * 'des_page_scanners' is the desired number of page scanner threads. The
289 - * system will bring the actual number of threads into line with the desired
290 - * number. If des_page_scanners is set to an invalid value, the system will
291 - * correct the setting.
290 + * despagescanners:
291 + * The desired number of page scanner threads. The value can be set in
292 + * /etc/system or tuned directly with 'mdb -kw'. The system will bring
293 + * the actual number of threads into line with the desired number. If set
294 + * to an invalid value, the system will correct the setting.
292 295 */
293 -uint_t des_page_scanners;
294 -uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
296 +uint_t despagescanners = 0;
295 297
296 -uint_t n_page_scanners;
297 -static pgcnt_t pscan_region_sz; /* informational only */
298 -
299 -#define PAGES_POLL_MASK 1023
300 -
301 298 /*
302 299 * pageout_sample_lim:
303 300 * The limit on the number of samples needed to establish a value for new
304 301 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
305 302 * handspreadpages.
306 303 *
307 304 * pageout_sample_cnt:
308 305 * Current sample number. Once the sample gets large enough, set new
309 306 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
310 307 *
311 308 * pageout_sample_pages:
312 309 * The accumulated number of pages scanned during sampling.
313 310 *
314 311 * pageout_sample_etime:
315 312 * The accumulated nanoseconds for the sample.
316 313 *
317 314 * pageout_rate:
318 315 * Rate in pages/nanosecond, computed at the end of sampling.
319 316 *
320 317 * pageout_new_spread:
321 318 * Initially zero while the system scan rate is measured by
322 319 * pageout_scanner(), which then sets this value once per system boot after
323 320 * enough samples have been recorded (pageout_sample_cnt). Once set, this
324 321 * new value is used for fastscan and handspreadpages.
325 322 */
326 -
327 323 typedef hrtime_t hrrate_t;
328 324
329 325 static uint64_t pageout_sample_lim = 4;
330 326 static uint64_t pageout_sample_cnt = 0;
331 327 static pgcnt_t pageout_sample_pages = 0;
328 +static hrtime_t pageout_sample_etime = 0;
332 329 static hrrate_t pageout_rate = 0;
333 330 static pgcnt_t pageout_new_spread = 0;
334 331
335 -static hrtime_t pageout_sample_etime = 0;
336 -
337 -/* True if page scanner is first starting up */
332 +/* True if the page scanner is first starting up */
338 333 #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
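
For orientation, the accumulators above are consumed at the end of sampling roughly along these lines (a sketch; the actual expressions live in pageout_scanner(), outside this hunk):

    	/*
    	 * Sketch: after pageout_sample_lim samples, the scan rate is the
    	 * accumulated page count over the accumulated scan time,
    	 * normalised to one second, and the spread is a tenth of that,
    	 * reflecting the ~10% CPU budget discussed in setupclock().
    	 */
    	pageout_rate = (hrrate_t)pageout_sample_pages *
    	    (hrrate_t)NANOSEC / pageout_sample_etime;
    	pageout_new_spread = pageout_rate / 10;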
339 334
335 +/* The current number of page scanner threads */
336 +static uint_t n_page_scanners = 1;
337 +/* The number of page scanner threads that are actively scanning. */
338 +static uint_t pageouts_running;
339 +
340 340 /*
341 341 * Record number of times a pageout_scanner() wakeup cycle finished because it
342 342 * timed out (exceeded its CPU budget), rather than because it visited
343 343 * its budgeted number of pages. This is only done when scanning under low
344 344 * free memory conditions, not when scanning for zones over their cap.
345 345 */
346 346 uint64_t pageout_timeouts = 0;
347 347
348 348 #ifdef VM_STATS
349 349 static struct pageoutvmstats_str {
350 350 ulong_t checkpage[3];
351 351 } pageoutvmstats;
352 352 #endif /* VM_STATS */
353 353
354 354 /*
355 355 * Threads waiting for free memory use this condition variable and lock until
356 356 * memory becomes available.
357 357 */
358 358 kmutex_t memavail_lock;
359 359 kcondvar_t memavail_cv;
360 360
361 361 typedef enum pageout_hand {
362 362 POH_FRONT = 1,
363 363 POH_BACK,
364 364 } pageout_hand_t;
365 365
366 366 typedef enum {
367 367 CKP_INELIGIBLE,
368 368 CKP_NOT_FREED,
369 369 CKP_FREED,
370 370 } checkpage_result_t;
371 371
372 372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
373 373
374 374 static struct clockinit {
375 375 bool ci_init;
376 376 pgcnt_t ci_lotsfree_min;
377 377 pgcnt_t ci_lotsfree_max;
378 378 pgcnt_t ci_lotsfree;
379 379 pgcnt_t ci_desfree;
380 380 pgcnt_t ci_minfree;
381 381 pgcnt_t ci_throttlefree;
382 382 pgcnt_t ci_pageout_reserve;
383 383 pgcnt_t ci_maxpgio;
384 384 pgcnt_t ci_maxfastscan;
385 385 pgcnt_t ci_fastscan;
386 386 pgcnt_t ci_slowscan;
387 387 pgcnt_t ci_handspreadpages;
388 + uint_t ci_despagescanners;
388 389 } clockinit = { .ci_init = false };
389 390
390 -static pgcnt_t
391 +static inline pgcnt_t
391 392 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
392 393 {
393 394 if (value < minimum) {
394 395 return (minimum);
395 396 } else if (value > maximum) {
396 397 return (maximum);
397 398 } else {
398 399 return (value);
399 400 }
400 401 }
401 402
402 403 static pgcnt_t
403 404 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
404 405 {
405 406 if (initval == 0 || initval >= initval_ceiling) {
406 407 return (defval);
407 408 } else {
408 409 return (initval);
409 410 }
410 411 }
411 412
412 413 /*
413 414 * Local boolean to control scanning when zones are over their cap. Avoids
414 415 * accessing the zone_num_over_cap variable except within schedpaging(), which
415 416 * only runs periodically. This is here only to reduce our access to
416 417 * zone_num_over_cap, since it is already accessed a lot during paging, and
417 418 * the page scanner accesses the zones_over variable on each page during a
418 419 * scan. There is no lock needed for zone_num_over_cap since schedpaging()
419 420 * doesn't modify the variable, it only cares if the variable is 0 or non-0.
420 421 */
421 422 static boolean_t zones_over = B_FALSE;
422 423
423 424 /*
425 + * On large memory systems, multiple instances of the page scanner are run,
426 + * each responsible for a separate region of memory. This speeds up page
427 + * invalidation under low memory conditions.
428 + *
429 + * despagescanners can be set in /etc/system or via mdb and it will
430 + * be used as a guide for how many page scanners to create; the value
 431 + * will be adjusted if it is not sensible. Otherwise, the number of
 432 + * page scanners is determined dynamically from the system's memory size.
433 + */
434 +static void
435 +recalc_pagescanners(void)
436 +{
437 + pgcnt_t sz;
438 + uint_t des;
439 +
440 + /* If the initial calibration has not been done, take no action. */
441 + if (pageout_new_spread == 0)
442 + return;
443 +
444 + /*
445 + * If the desired number of scanners is set in /etc/system
446 + * then try to use it.
447 + */
448 + if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
449 + despagescanners = clockinit.ci_despagescanners;
450 +
451 + if (despagescanners != 0) {
452 + /*
453 + * We have a desired number of page scanners, either from
454 + * /etc/system or set via mdb. Try and use it (it will be
455 + * clamped below).
456 + */
457 + des = despagescanners;
458 + } else {
459 + /*
460 + * Calculate the number of desired scanners based on the
461 + * system's memory size.
462 + *
463 + * A 64GiB region size is used as the basis for calculating how
464 + * many scanner threads should be created. For systems with up
465 + * to 64GiB of RAM, a single thread is used; for very large
466 + * memory systems the threads are limited to MAX_PSCAN_THREADS.
467 + */
468 + sz = btop(64ULL << 30);
469 +
470 + if (sz > looppages) {
471 + des = 1;
472 + } else {
473 + pgcnt_t tmp = sz;
474 +
475 + for (des = 1; tmp < looppages; des++)
476 + tmp += sz;
477 + }
478 + }
479 +
480 + /*
 481 +	 * Clamp the number of scanners so that it does not exceed
 482 +	 * MAX_PSCAN_THREADS, and so that each scanner covers at least
 483 +	 * 10% more than handspreadpages.
484 + */
485 + des = clamp(des, 1,
486 + looppages / (handspreadpages + handspreadpages / 10));
487 + despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
488 +}
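
As a worked example of the sizing loop above (a sketch with assumed values, not kernel code), a hypothetical 256 GiB system with 4 KiB pages arrives at four scanner threads before the clamps are applied:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t pagesize = 4096;			/* assumed page size */
    	uint64_t looppages = (256ULL << 30) / pagesize;	/* hypothetical RAM */
    	uint64_t sz = (64ULL << 30) / pagesize;		/* 64 GiB basis */
    	uint64_t des = 1, tmp = sz;

    	if (sz <= looppages) {
    		/* One thread per started 64 GiB region. */
    		for (des = 1; tmp < looppages; des++)
    			tmp += sz;
    	}
    	printf("desired scanners: %llu\n", (unsigned long long)des); /* 4 */
    	return (0);
    }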
489 +
490 +/*
424 491 * Set up the paging constants for the clock algorithm used by
425 492 * pageout_scanner(), and by the virtual memory system overall. See the
426 493 * comments at the top of this file for more information about the threshold
427 494 * values and system responses to memory pressure.
428 495 *
429 496 * This routine is called once by main() at startup, after the initial size of
430 497 * physical memory is determined. It may be called again later if memory is
431 498 * added to or removed from the system, or if new measurements of the page scan
432 499 * rate become available.
433 500 */
434 501 void
435 502 setupclock(void)
436 503 {
437 - uint_t i;
438 - pgcnt_t sz, tmp;
439 - pgcnt_t defval;
440 504 bool half = (pageout_threshold_style == 1);
441 505 bool recalc = true;
442 506
443 507 looppages = total_pages;
444 508
445 509 /*
446 510 * The operator may have provided specific values for some of the
447 511 * tunables via /etc/system. On our first call, we preserve those
448 512 * values so that they can be used for subsequent recalculations.
449 513 *
450 514 * A value of zero for any tunable means we will use the default
451 515 * sizing.
452 516 */
453 -
454 517 if (!clockinit.ci_init) {
455 518 clockinit.ci_init = true;
456 519
457 520 clockinit.ci_lotsfree_min = lotsfree_min;
458 521 clockinit.ci_lotsfree_max = lotsfree_max;
459 522 clockinit.ci_lotsfree = lotsfree;
460 523 clockinit.ci_desfree = desfree;
461 524 clockinit.ci_minfree = minfree;
462 525 clockinit.ci_throttlefree = throttlefree;
463 526 clockinit.ci_pageout_reserve = pageout_reserve;
464 527 clockinit.ci_maxpgio = maxpgio;
465 528 clockinit.ci_maxfastscan = maxfastscan;
466 529 clockinit.ci_fastscan = fastscan;
467 530 clockinit.ci_slowscan = slowscan;
468 531 clockinit.ci_handspreadpages = handspreadpages;
532 + clockinit.ci_despagescanners = despagescanners;
469 533
470 534 /*
471 535 * The first call does not trigger a recalculation, only
472 536 * subsequent calls.
473 537 */
474 538 recalc = false;
475 539 }
476 540
477 541 /*
478 542 * Configure paging threshold values. For more details on what each
479 543 * threshold signifies, see the comments at the top of this file.
480 544 */
481 545 lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
482 546 btop(LOTSFREE_MAX_DEFAULT));
483 547 lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
484 548 btop(LOTSFREE_MIN_DEFAULT));
485 549
486 550 lotsfree = tune(clockinit.ci_lotsfree, looppages,
487 551 clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
488 552
489 553 desfree = tune(clockinit.ci_desfree, lotsfree,
490 554 lotsfree / 2);
491 555
492 556 minfree = tune(clockinit.ci_minfree, desfree,
493 557 half ? desfree / 2 : 3 * desfree / 4);
494 558
495 559 throttlefree = tune(clockinit.ci_throttlefree, desfree,
496 560 minfree);
497 561
498 562 pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
499 563 half ? throttlefree / 2 : 3 * throttlefree / 4);
500 564
501 565 /*
502 566 * Maxpgio thresholds how much paging is acceptable.
503 567 * This figures that 2/3 busy on an arm is all that is
504 568 * tolerable for paging. We assume one operation per disk rev.
505 569 *
506 570 * XXX - Does not account for multiple swap devices.
507 571 */
508 572 if (clockinit.ci_maxpgio == 0) {
509 573 maxpgio = (DISKRPM * 2) / 3;
510 574 } else {
511 575 maxpgio = clockinit.ci_maxpgio;
512 576 }
513 577
514 578 /*
515 579 * The clock scan rate varies between fastscan and slowscan
516 580 * based on the amount of free memory available. Fastscan
517 581 * rate should be set based on the number pages that can be
518 582 * scanned per sec using ~10% of processor time. Since this
519 583 * value depends on the processor, MMU, Mhz etc., it is
520 584 * difficult to determine it in a generic manner for all
521 585 * architectures.
522 586 *
523 587 * Instead of trying to determine the number of pages scanned
524 588 * per sec for every processor, fastscan is set to be the smaller
525 589 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
526 590 * time is limited to ~4% of processor time.
527 591 *
528 592 * Setting fastscan to be 1/2 of memory allows pageout to scan
529 593 * all of memory in ~2 secs. This implies that user pages not
530 594 * accessed within 1 sec (assuming, handspreadpages == fastscan)
531 595 * can be reclaimed when free memory is very low. Stealing pages
532 596 * not accessed within 1 sec seems reasonable and ensures that
533 597 * active user processes don't thrash.
534 598 *
535 599 * Smaller values of fastscan result in scanning fewer pages
536 600 * every second and consequently pageout may not be able to free
537 601 * sufficient memory to maintain the minimum threshold. Larger
538 602 * values of fastscan result in scanning a lot more pages which
539 603 * could lead to thrashing and higher CPU usage.
540 604 *
541 605 * Fastscan needs to be limited to a maximum value and should not
542 606 * scale with memory to prevent pageout from consuming too much
543 607 * time for scanning on slow CPU's and avoid thrashing, as a
544 608 * result of scanning too many pages, on faster CPU's.
545 609 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
546 610 * (the upper bound for fastscan) based on the average number
547 611 * of pages that can potentially be scanned in ~1 sec (using ~4%
548 612 * of the CPU) on some of the following machines that currently
549 613 * run Solaris 2.x:
550 614 *
551 615 * average memory scanned in ~1 sec
552 616 *
553 617 * 25 Mhz SS1+: 23 Meg
554 618 * LX: 37 Meg
555 619 * 50 Mhz SC2000: 68 Meg
556 620 *
557 621 * 40 Mhz 486: 26 Meg
558 622 * 66 Mhz 486: 42 Meg
559 623 *
560 624 * When free memory falls just below lotsfree, the scan rate
561 625 * goes from 0 to slowscan (i.e., pageout starts running). This
562 626 * transition needs to be smooth and is achieved by ensuring that
563 627 * pageout scans a small number of pages to satisfy the transient
564 628 * memory demand. This is set to not exceed 100 pages/sec (25 per
 565 629	 * wakeup) since scanning that many pages has no noticeable impact
566 630 * on system performance.
567 631 *
568 632 * In addition to setting fastscan and slowscan, pageout is
569 633 * limited to using ~4% of the CPU. This results in increasing
570 634 * the time taken to scan all of memory, which in turn means that
571 635 * user processes have a better opportunity of preventing their
572 636 * pages from being stolen. This has a positive effect on
573 637 * interactive and overall system performance when memory demand
574 638 * is high.
575 639 *
576 640 * Thus, the rate at which pages are scanned for replacement will
577 641 * vary linearly between slowscan and the number of pages that
578 642 * can be scanned using ~4% of processor time instead of varying
579 643 * linearly between slowscan and fastscan.
580 644 *
581 645 * Also, the processor time used by pageout will vary from ~1%
582 646 * at slowscan to ~4% at fastscan instead of varying between
583 647 * ~1% at slowscan and ~10% at fastscan.
584 648 *
585 649 * The values chosen for the various VM parameters (fastscan,
586 650 * handspreadpages, etc) are not universally true for all machines,
587 651 * but appear to be a good rule of thumb for the machines we've
588 652 * tested. They have the following ranges:
589 653 *
590 654 * cpu speed: 20 to 70 Mhz
591 655 * page size: 4K to 8K
592 656 * memory size: 16M to 5G
593 657 * page scan rate: 4000 - 17400 4K pages per sec
594 658 *
595 659 * The values need to be re-examined for machines which don't
596 660 * fall into the various ranges (e.g., slower or faster CPUs,
597 661 * smaller or larger pagesizes etc) shown above.
598 662 *
599 663 * On an MP machine, pageout is often unable to maintain the
600 664 * minimum paging thresholds under heavy load. This is due to
601 665 * the fact that user processes running on other CPU's can be
602 666 * dirtying memory at a much faster pace than pageout can find
603 667 * pages to free. The memory demands could be met by enabling
604 668 * more than one CPU to run the clock algorithm in such a manner
605 669 * that the various clock hands don't overlap. This also makes
606 670 * it more difficult to determine the values for fastscan, slowscan
607 671 * and handspreadpages.
608 672 *
609 673 * The swapper is currently used to free up memory when pageout
610 674 * is unable to meet memory demands by swapping out processes.
611 675 * In addition to freeing up memory, swapping also reduces the
612 676 * demand for memory by preventing user processes from running
613 677 * and thereby consuming memory.
614 678 */
615 679 if (clockinit.ci_maxfastscan == 0) {
616 680 if (pageout_new_spread != 0) {
617 681 maxfastscan = pageout_new_spread;
618 682 } else {
619 683 maxfastscan = MAXHANDSPREADPAGES;
620 684 }
621 685 } else {
622 686 maxfastscan = clockinit.ci_maxfastscan;
623 687 }
624 688
625 689 if (clockinit.ci_fastscan == 0) {
626 690 fastscan = MIN(looppages / loopfraction, maxfastscan);
627 691 } else {
628 692 fastscan = clockinit.ci_fastscan;
629 693 }
630 694
631 695 if (fastscan > looppages / loopfraction) {
632 696 fastscan = looppages / loopfraction;
633 697 }
634 698
635 699 /*
636 700 * Set slow scan time to 1/10 the fast scan time, but
637 701 * not to exceed maxslowscan.
638 702 */
639 703 if (clockinit.ci_slowscan == 0) {
640 704 slowscan = MIN(fastscan / 10, maxslowscan);
641 705 } else {
642 706 slowscan = clockinit.ci_slowscan;
643 707 }
644 708
645 709 if (slowscan > fastscan / 2) {
646 710 slowscan = fastscan / 2;
647 711 }
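
As a worked example (assuming 4 KiB pages, the default maxslowscan of 100, and the MAXHANDSPREADPAGES ceiling of 64 MB that applies before the scan rate has been sampled), a 16 GiB machine starts out with:

    	looppages = 16 GiB / 4 KiB            = 4194304 pages
    	fastscan  = MIN(looppages / 2, 16384) =   16384 pages/sec
    	slowscan  = MIN(fastscan / 10, 100)   =     100 pages/sec

Once sampling completes, pageout_new_spread takes over from MAXHANDSPREADPAGES as the maxfastscan ceiling.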
648 712
649 713 /*
650 - * Handspreadpages is distance (in pages) between front and back
714 + * Handspreadpages is the distance (in pages) between front and back
651 715 * pageout daemon hands. The amount of time to reclaim a page
652 716 * once pageout examines it increases with this distance and
653 717 * decreases as the scan rate rises. It must be < the amount
654 718 * of pageable memory.
655 719 *
656 720 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
657 721 * to be "fastscan" results in the front hand being a few secs
658 722 * (varies based on the processor speed) ahead of the back hand
659 723 * at fastscan rates. This distance can be further reduced, if
660 724 * necessary, by increasing the processor time used by pageout
 661 725	 * to be more than ~4% and preferably not more than ~10%.
662 726 *
663 727 * As a result, user processes have a much better chance of
664 728 * referencing their pages before the back hand examines them.
665 729 * This also significantly lowers the number of reclaims from
666 730 * the freelist since pageout does not end up freeing pages which
667 731 * may be referenced a sec later.
668 732 */
669 733 if (clockinit.ci_handspreadpages == 0) {
670 734 handspreadpages = fastscan;
671 735 } else {
672 736 handspreadpages = clockinit.ci_handspreadpages;
673 737 }
674 738
675 739 /*
676 740 * Make sure that back hand follows front hand by at least
677 741 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
678 742 * back hand to look at a page during the same wakeup of the pageout
679 743 * daemon in which the front hand cleared its ref bit.
680 744 */
681 745 if (handspreadpages >= looppages) {
682 746 handspreadpages = looppages - 1;
683 747 }
684 748
685 - if (!recalc) {
686 - /*
687 - * Setup basic values at initialization.
688 - */
689 - pscan_region_sz = total_pages;
690 - des_page_scanners = n_page_scanners = 1;
691 - reset_hands[0] = B_TRUE;
749 + /*
750 + * Establish the minimum and maximum length of time to be spent
751 + * scanning pages per wakeup, limiting the scanner duty cycle. The
752 + * input percentage values (0-100) must be converted to a fraction of
753 + * the number of nanoseconds in a second of wall time, then further
754 + * scaled down by the number of scanner wakeups in a second.
755 + */
756 + min_pageout_nsec = MAX(1,
757 + NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
758 + max_pageout_nsec = MAX(min_pageout_nsec,
759 + NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
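
With the defaults above (min_percent_cpu = 4, max_percent_cpu = 80) and SCHEDPAGING_HZ = 4, this works out to:

    	min_pageout_nsec = NANOSEC *  4 / 100 / 4 =  10,000,000 ns (10 ms)
    	max_pageout_nsec = NANOSEC * 80 / 100 / 4 = 200,000,000 ns (200 ms)

i.e. each of the four wakeups per second may spend between 10 ms and 200 ms scanning, bounding the scanner between roughly 4% and 80% of one CPU.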
760 +
761 + /*
762 + * If not called for recalculation, return and skip the remaining
763 + * steps.
764 + */
765 + if (!recalc)
692 766 return;
693 - }
694 767
695 768 /*
696 - * Recalculating
697 - *
698 - * We originally set the number of page scanners to 1. Now that we
699 - * know what the handspreadpages is for a scanner, figure out how many
700 - * scanners we should run. We want to ensure that the regions don't
701 - * overlap and that they are not touching.
702 - *
703 - * A default 64GB region size is used as the initial value to calculate
704 - * how many scanner threads we should create on lower memory systems.
705 - * The idea is to limit the number of threads to a practical value
706 - * (e.g. a 64GB machine really only needs one scanner thread). For very
707 - * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
708 - * threads.
709 - *
710 - * The scanner threads themselves are evenly spread out around the
711 - * memory "clock" in pageout_scanner when we reset the hands, and each
712 - * thread will scan all of memory.
769 + * Set a flag to re-evaluate the clock hand positions.
713 770 */
714 - sz = (btop(64ULL * 0x40000000ULL));
715 - if (sz < handspreadpages) {
716 - /*
717 - * 64GB is smaller than the separation between the front
718 - * and back hands; use double handspreadpages.
719 - */
720 - sz = handspreadpages << 1;
721 - }
722 - if (sz > total_pages) {
723 - sz = total_pages;
724 - }
725 - /* Record region size for inspection with mdb, otherwise unused */
726 - pscan_region_sz = sz;
771 + for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
772 + reset_hands[i] = B_TRUE;
727 773
728 - tmp = sz;
729 - for (i = 1; tmp < total_pages; i++) {
730 - tmp += sz;
731 - }
732 -
733 - if (i > MAX_PSCAN_THREADS)
734 - i = MAX_PSCAN_THREADS;
735 -
736 - des_page_scanners = i;
774 + recalc_pagescanners();
737 775 }
738 776
739 777 /*
740 778 * Pageout scheduling.
741 779 *
742 780 * Schedpaging controls the rate at which the page out daemon runs by
743 781 * setting the global variables nscan and desscan SCHEDPAGING_HZ
744 782 * times a second. Nscan records the number of pages pageout has examined
745 783 * in its current pass; schedpaging() resets this value to zero each time
746 784 * it runs. Desscan records the number of pages pageout should examine
747 785 * in its next pass; schedpaging() sets this value based on the amount of
748 786 * currently available memory.
749 787 */
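
Concretely, once the scan-rate sampling is complete, the budget computed in schedpaging() is a linear interpolation between the two rates (a worked example using the illustrative figures from earlier):

    	desscan = (slowscan * vavail + fastscan * (lotsfree - vavail))
    	    / lotsfree / SCHEDPAGING_HZ

    	e.g. fastscan = 16384, slowscan = 100, vavail = lotsfree / 2:
    	desscan = (16384 + 100) / 2 / 4 ≈ 2060 pages per wakeup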
750 -#define SCHEDPAGING_HZ 4
751 788
752 -static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
789 +static kmutex_t pageout_mutex;
753 790
754 791 /*
755 792 * Pool of available async pageout putpage requests.
756 793 */
757 794 static struct async_reqs *push_req;
758 795 static struct async_reqs *req_freelist; /* available req structs */
759 796 static struct async_reqs *push_list; /* pending reqs */
760 797 static kmutex_t push_lock; /* protects req pool */
761 798 static kcondvar_t push_cv;
762 799
763 800 /*
764 801 * If pageout() is stuck on a single push for this many seconds,
765 802 * pageout_deadman() will assume the system has hit a memory deadlock. If set
766 803 * to 0, the deadman will have no effect.
767 804 *
768 805 * Note that we are only looking for stalls in the calls that pageout() makes
769 806 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
770 807 * I/O, which should not take long unless the underlying strategy call blocks
771 808 * indefinitely for memory. The actual I/O request happens (or fails) later.
772 809 */
773 810 uint_t pageout_deadman_seconds = 90;
774 811
775 812 static uint_t pageout_stucktime = 0;
776 813 static bool pageout_pushing = false;
777 814 static uint64_t pageout_pushcount = 0;
778 815 static uint64_t pageout_pushcount_seen = 0;
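
For reference, a hedged sketch of how these counters are intended to be consumed (the actual pageout_deadman() body is outside this hunk, so the details here are assumptions):

    	/*
    	 * Sketch of the once-per-second deadman check: if the same
    	 * VOP_PUTPAGE push is still in flight after
    	 * pageout_deadman_seconds consecutive checks, panic for a crash
    	 * dump rather than hang forever.
    	 */
    	if (pageout_pushing && pageout_pushcount == pageout_pushcount_seen) {
    		if (++pageout_stucktime >= pageout_deadman_seconds)
    			panic("pageout deadman: stuck pushing one page "
    			    "for %u seconds", pageout_deadman_seconds);
    	} else {
    		pageout_stucktime = 0;
    		pageout_pushcount_seen = pageout_pushcount;
    	}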
779 816
780 -static int async_list_size = 256; /* number of async request structs */
817 +static int async_list_size = 8192; /* number of async request structs */
781 818
782 819 static void pageout_scanner(void *);
783 820
784 821 /*
785 822 * If a page is being shared more than "po_share" times
786 823 * then leave it alone- don't page it out.
787 824 */
788 825 #define MIN_PO_SHARE (8)
789 826 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
790 827 ulong_t po_share = MIN_PO_SHARE;
791 828
792 829 /*
793 830 * Schedule rate for paging.
794 831 * Rate is linear interpolation between
795 832 * slowscan with lotsfree and fastscan when out of memory.
796 833 */
797 834 static void
798 835 schedpaging(void *arg)
799 836 {
800 837 spgcnt_t vavail;
801 838
802 839 if (freemem < lotsfree + needfree + kmem_reapahead)
803 840 kmem_reap();
804 841
805 842 if (freemem < lotsfree + needfree)
806 843 seg_preap();
807 844
808 845 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
809 846 kcage_cageout_wakeup();
810 847
811 - (void) atomic_swap_ulong(&nscan, 0);
812 - vavail = freemem - deficit;
813 - if (pageout_new_spread != 0)
814 - vavail -= needfree;
815 - if (vavail < 0)
816 - vavail = 0;
817 - if (vavail > lotsfree)
818 - vavail = lotsfree;
848 + if (mutex_tryenter(&pageout_mutex)) {
819 849
820 - /*
821 - * Fix for 1161438 (CRS SPR# 73922). All variables
822 - * in the original calculation for desscan were 32 bit signed
823 - * ints. As freemem approaches 0x0 on a system with 1 Gig or
824 - * more of memory, the calculation can overflow. When this
825 - * happens, desscan becomes negative and pageout_scanner()
826 - * stops paging out.
827 - */
828 - if (needfree > 0 && pageout_new_spread == 0) {
829 - /*
830 - * If we've not yet collected enough samples to
831 - * calculate a spread, kick into high gear anytime
832 - * needfree is non-zero. Note that desscan will not be
833 - * the limiting factor for systems with larger memory;
834 - * the %CPU will limit the scan. That will also be
835 - * maxed out below.
836 - */
837 - desscan = fastscan / SCHEDPAGING_HZ;
838 - } else {
839 - /*
840 - * Once we've calculated a spread based on system
841 - * memory and usage, just treat needfree as another
842 - * form of deficit.
843 - */
844 - spgcnt_t faststmp, slowstmp, result;
850 + if (pageouts_running != 0)
851 + goto out;
845 852
846 - slowstmp = slowscan * vavail;
847 - faststmp = fastscan * (lotsfree - vavail);
848 - result = (slowstmp + faststmp) /
849 - nz(lotsfree) / SCHEDPAGING_HZ;
850 - desscan = (pgcnt_t)result;
851 - }
853 + /* No pageout scanner threads running. */
854 + nscan = 0;
855 + vavail = freemem - deficit;
856 + if (pageout_new_spread != 0)
857 + vavail -= needfree;
858 + vavail = clamp(vavail, 0, lotsfree);
852 859
853 - /*
854 - * If we've not yet collected enough samples to calculate a
855 - * spread, also kick %CPU to the max.
856 - */
857 - if (pageout_new_spread == 0) {
858 - pageout_nsec = max_pageout_nsec;
859 - } else {
860 - pageout_nsec = min_pageout_nsec +
861 - (lotsfree - vavail) *
862 - (max_pageout_nsec - min_pageout_nsec) /
863 - nz(lotsfree);
864 - }
860 + if (needfree > 0 && pageout_new_spread == 0) {
861 + /*
862 + * If we've not yet collected enough samples to
863 + * calculate a spread, use the old logic of kicking
864 + * into high gear anytime needfree is non-zero.
865 + */
866 + desscan = fastscan / SCHEDPAGING_HZ;
867 + } else {
868 + /*
869 + * Once we've calculated a spread based on system
870 + * memory and usage, just treat needfree as another
871 + * form of deficit.
872 + */
873 + spgcnt_t faststmp, slowstmp, result;
865 874
866 - if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
867 - /*
868 - * We have finished the pagescan initialization and the desired
869 - * number of page scanners has changed, either because
870 - * initialization just finished, because of a memory DR, or
871 - * because des_page_scanners has been modified on the fly (i.e.
872 - * by mdb). If we need more scanners, start them now, otherwise
873 - * the excess scanners will terminate on their own when they
874 - * reset their hands.
875 - */
876 - uint_t i;
877 - uint_t curr_nscan = n_page_scanners;
878 - pgcnt_t max = total_pages / handspreadpages;
875 + slowstmp = slowscan * vavail;
876 + faststmp = fastscan * (lotsfree - vavail);
877 + result = (slowstmp + faststmp) /
878 + nz(lotsfree) / SCHEDPAGING_HZ;
879 + desscan = (pgcnt_t)result;
880 + }
879 881
880 - if (des_page_scanners > max)
881 - des_page_scanners = max;
882 + pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
883 + (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
882 884
883 - if (des_page_scanners > MAX_PSCAN_THREADS) {
884 - des_page_scanners = MAX_PSCAN_THREADS;
885 - } else if (des_page_scanners == 0) {
886 - des_page_scanners = 1;
887 - }
885 + DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
886 + pageout_nsec);
888 887
889 - /*
890 - * Each thread has its own entry in the reset_hands array, so
891 - * we don't need any locking in pageout_scanner to check the
892 - * thread's reset_hands entry. Thus, we use a pre-allocated
893 - * fixed size reset_hands array and upper limit on the number
894 - * of pagescan threads.
895 - *
896 - * The reset_hands entries need to be true before we start new
897 - * scanners, but if we're reducing, we don't want a race on the
898 - * recalculation for the existing threads, so we set
899 - * n_page_scanners first.
900 - */
901 - n_page_scanners = des_page_scanners;
902 - for (i = 0; i < MAX_PSCAN_THREADS; i++) {
903 - reset_hands[i] = B_TRUE;
904 - }
888 + if (pageout_new_spread != 0 && despagescanners != 0 &&
889 + despagescanners != n_page_scanners) {
890 + /*
891 + * We have finished the pagescan initialisation and the
892 + * desired number of page scanners has changed, either
893 + * because initialisation just finished, because of a
894 + * memory DR, or because despagescanners has been
895 + * modified on the fly (i.e. by mdb).
896 + */
897 + uint_t i, curr_nscan = n_page_scanners;
905 898
906 - if (des_page_scanners > curr_nscan) {
907 - /* Create additional pageout scanner threads. */
908 - for (i = curr_nscan; i < des_page_scanners; i++) {
909 - (void) lwp_kernel_create(proc_pageout,
910 - pageout_scanner, (void *)(uintptr_t)i,
911 - TS_RUN, curthread->t_pri);
899 + /* Re-validate despagescanners */
900 + recalc_pagescanners();
901 +
902 + n_page_scanners = despagescanners;
903 +
904 + for (i = 0; i < MAX_PSCAN_THREADS; i++)
905 + reset_hands[i] = B_TRUE;
906 +
907 + /* If we need more scanners, start them now. */
908 + if (n_page_scanners > curr_nscan) {
909 + for (i = curr_nscan; i < n_page_scanners; i++) {
910 + (void) lwp_kernel_create(proc_pageout,
911 + pageout_scanner,
912 + (void *)(uintptr_t)i, TS_RUN,
913 + pageout_pri);
914 + }
912 915 }
916 +
917 + /*
918 + * If the number of scanners has decreased, trigger a
919 + * wakeup so that the excess threads will terminate.
920 + */
921 + if (n_page_scanners < curr_nscan) {
922 + WAKE_PAGEOUT_SCANNER();
923 + }
913 924 }
914 - }
915 925
916 - zones_over = B_FALSE;
926 + zones_over = B_FALSE;
917 927
918 - if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
919 - if (!PAGE_SCAN_STARTUP)
928 + if (PAGE_SCAN_STARTUP) {
929 + /*
930 + * We still need to measure the rate at which the
931 + * system is able to scan pages of memory. Each of
932 + * these initial samples is a scan of as much system
933 + * memory as practical, regardless of whether or not we
934 + * are experiencing memory pressure.
935 + */
936 + desscan = total_pages;
937 + pageout_nsec = max_pageout_nsec;
938 +
939 + DTRACE_PROBE(schedpage__wake__sample);
940 + WAKE_PAGEOUT_SCANNER();
941 + } else if (freemem < lotsfree + needfree) {
942 + /*
943 + * We need more memory.
944 + */
920 945 low_mem_scan++;
921 - /*
922 - * Either we need more memory, or we still need to
923 - * measure the average scan rate. Wake the scanner.
924 - */
925 - DTRACE_PROBE(schedpage__wake__low);
926 - WAKE_PAGEOUT_SCANNER();
927 946
928 - } else if (zone_num_over_cap > 0) {
929 - /* One or more zones are over their cap. */
947 + DTRACE_PROBE(schedpage__wake__low);
948 + WAKE_PAGEOUT_SCANNER();
949 + } else if (zone_num_over_cap > 0) {
950 + /*
 951 +		 * One or more zones are over their cap.
952 + */
930 953
931 - /* No page limit */
932 - desscan = total_pages;
954 + /* No page limit */
955 + desscan = total_pages;
933 956
934 - /*
935 - * Increase the scanning CPU% to the max. This implies
936 - * 80% of one CPU/sec if the scanner can run each
937 - * opportunity. Can also be tuned via setting
938 - * zone_pageout_nsec in /etc/system or with mdb.
939 - */
940 - pageout_nsec = (zone_pageout_nsec != 0) ?
941 - zone_pageout_nsec : max_pageout_nsec;
957 + /*
958 + * Increase the scanning CPU% to the max. This implies
959 + * 80% of one CPU/sec if the scanner can run each
960 + * opportunity. Can also be tuned via setting
961 + * zone_pageout_nsec in /etc/system or with mdb.
962 + */
963 + pageout_nsec = (zone_pageout_nsec != 0) ?
964 + zone_pageout_nsec : max_pageout_nsec;
942 965
943 - zones_over = B_TRUE;
944 - zone_cap_scan++;
966 + zones_over = B_TRUE;
967 + zone_cap_scan++;
945 968
946 - DTRACE_PROBE(schedpage__wake__zone);
947 - WAKE_PAGEOUT_SCANNER();
948 -
949 - } else {
950 - /*
951 - * There are enough free pages, no need to
952 - * kick the scanner thread. And next time
953 - * around, keep more of the `highly shared'
954 - * pages.
955 - */
956 - cv_signal_pageout();
957 -
958 - mutex_enter(&pageout_mutex);
959 - if (po_share > MIN_PO_SHARE) {
960 - po_share >>= 1;
969 + DTRACE_PROBE(schedpage__wake__zone);
970 + WAKE_PAGEOUT_SCANNER();
971 + } else {
972 + /*
973 + * There are enough free pages, no need to
974 + * kick the scanner thread. And next time
975 + * around, keep more of the `highly shared'
976 + * pages.
977 + */
978 + cv_signal_pageout();
979 + if (po_share > MIN_PO_SHARE) {
980 + po_share >>= 1;
981 + }
961 982 }
983 +out:
962 984 mutex_exit(&pageout_mutex);
963 985 }
964 986
965 987 /*
966 988 * Signal threads waiting for available memory.
967 989 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
968 990 * in this case it is not needed - the waiters will be waken up during
969 991 * the next invocation of this function.
970 992 */
971 993 if (kmem_avail() > 0)
972 994 cv_broadcast(&memavail_cv);
973 995
974 996 (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
975 997 }
976 998
977 999 pgcnt_t pushes;
978 1000 ulong_t push_list_size; /* # of requests on pageout queue */
979 1001
980 1002 /*
981 1003 * Paging out should always be enabled. This tunable exists to hold pageout
982 1004 * for debugging purposes. If set to 0, pageout_scanner() will go back to
983 1005 * sleep each time it is woken by schedpaging().
984 1006 */
985 1007 uint_t dopageout = 1;
986 1008
987 1009 /*
988 1010 * The page out daemon, which runs as process 2.
989 1011 *
990 - * Page out occurs when either:
991 - * a) there is less than lotsfree pages,
992 - * b) there are one or more zones over their physical memory cap.
1012 + * The daemon treats physical memory as a circular array of pages and scans
1013 + * the pages using a 'two-handed clock' algorithm. The front hand moves
1014 + * through the pages, clearing the reference bit. The back hand travels a
1015 + * distance (handspreadpages) behind the front hand, freeing the pages that
1016 + * have not been referenced in the time since the front hand passed. If
1017 + * modified, they are first written to their backing store before being
1018 + * freed.
993 1019 *
994 - * The daemon treats physical memory as a circular array of pages and scans the
995 - * pages using a 'two-handed clock' algorithm. The front hand moves through
996 - * the pages, clearing the reference bit. The back hand travels a distance
997 - * (handspreadpages) behind the front hand, freeing the pages that have not
998 - * been referenced in the time since the front hand passed. If modified, they
999 - * are first written to their backing store before being freed.
1020 + * In order to make page invalidation more responsive on machines with
1021 + * larger memory, multiple pageout_scanner threads may be created. In this
1022 + * case, each thread is given a segment of the memory "clock face" so that
1023 + * memory can be reclaimed more quickly.
1000 1024 *
1001 - * In order to make page invalidation more responsive on machines with larger
1002 - * memory, multiple pageout_scanner threads may be created. In this case, the
1003 - * threads are evenly distributed around the the memory "clock face" so that
1004 - * memory can be reclaimed more quickly (that is, there can be large regions in
1005 - * which no pages can be reclaimed by a single thread, leading to lag which
1006 - * causes undesirable behavior such as htable stealing).
1025 + * As long as there are at least lotsfree pages and no zones over their
1026 + * cap, the pageout_scanner threads are not run. When the threads are
1027 + * scanning because free memory is short, all pages are considered for
1028 + * pageout; when they are scanning because zones are over their cap,
1029 + * only pages belonging to an over-cap zone are considered for pageout.
1007 1030 *
1008 - * As long as there are at least lotsfree pages, or no zones over their cap,
1009 - * then pageout_scanner threads are not run. When pageout_scanner threads are
1010 - * running for case (a), all pages are considered for pageout. For case (b),
1011 - * only pages belonging to a zone over its cap will be considered for pageout.
1012 - *
1013 - * There are multiple threads that act on behalf of the pageout process.
1014 - * A set of threads scan pages (pageout_scanner) and frees them up if
1015 - * they don't require any VOP_PUTPAGE operation. If a page must be
1016 - * written back to its backing store, the request is put on a list
1017 - * and the other (pageout) thread is signaled. The pageout thread
1018 - * grabs VOP_PUTPAGE requests from the list, and processes them.
1019 - * Some filesystems may require resources for the VOP_PUTPAGE
1020 - * operations (like memory) and hence can block the pageout
1021 - * thread, but the pageout_scanner threads can still operate. There is still
1022 - * no guarantee that memory deadlocks cannot occur.
1023 - *
1024 - * The pageout_scanner parameters are determined in schedpaging().
1031 + * There are multiple threads that act on behalf of the pageout process. A
1032 + * set of threads scans pages (pageout_scanner) and frees them up if they
1033 + * don't require any VOP_PUTPAGE operation. If a page must be written back
1034 + * to its backing store, the request is put on a list and the other
1035 + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
1036 + * requests from the list, and processes them. Some filesystems may require
1037 + * resources for the VOP_PUTPAGE operations (like memory) and hence can
1038 + * block the pageout thread, but the scanner thread can still operate.
1039 + * There is still no guarantee that memory deadlocks cannot occur.
1025 1040 */
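
The heart of the two-handed clock described above can be sketched as follows (a simplification for illustration only, ignoring the duty cycle, sampling, and per-thread sector logic of the real pageout_scanner()):

    static void
    clock_sketch(page_t *start, pgcnt_t npages, pgcnt_t spread)
    {
    	/* The front hand leads the back hand by "spread" pages. */
    	page_t *fronthand = page_nextn(start, spread);
    	page_t *backhand = start;

    	for (pgcnt_t n = 0; n < npages; n++) {
    		/* The front hand clears the referenced bit... */
    		(void) checkpage(fronthand, POH_FRONT);
    		/* ...and the back hand frees pages still unreferenced. */
    		(void) checkpage(backhand, POH_BACK);
    		fronthand = page_next(fronthand);
    		backhand = page_next(backhand);
    	}
    }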
1026 1041 void
1027 1042 pageout()
1028 1043 {
1029 1044 struct async_reqs *arg;
1030 - pri_t pageout_pri;
1031 1045 int i;
1032 1046 pgcnt_t max_pushes;
1033 1047 callb_cpr_t cprinfo;
1034 1048
1035 1049 proc_pageout = ttoproc(curthread);
1036 1050 proc_pageout->p_cstime = 0;
1037 1051 proc_pageout->p_stime = 0;
1038 1052 proc_pageout->p_cutime = 0;
1039 1053 proc_pageout->p_utime = 0;
1040 1054 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1041 1055 bcopy("pageout", PTOU(curproc)->u_comm, 7);
1042 1056
1043 1057 /*
1044 1058 * Create pageout scanner thread
1045 1059 */
1046 1060 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1047 1061 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1048 1062
1049 1063 /*
1050 1064 * Allocate and initialize the async request structures
1051 1065 * for pageout.
1052 1066 */
1053 1067 push_req = (struct async_reqs *)
1054 1068 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1055 1069
1056 1070 req_freelist = push_req;
1057 1071 for (i = 0; i < async_list_size - 1; i++) {
1058 1072 push_req[i].a_next = &push_req[i + 1];
1059 1073 }
1060 1074
1061 - pageout_pri = curthread->t_pri;
1075 + pageout_pri = curthread->t_pri - 1;
1062 1076
1063 - /* Create the (first) pageout scanner thread. */
1064 - (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
1065 - pageout_pri - 1);
1077 + /* Create the first pageout scanner thread. */
1078 + (void) lwp_kernel_create(proc_pageout, pageout_scanner,
1079 + (void *)0, /* this is instance 0, not NULL */
1080 + TS_RUN, pageout_pri);
1066 1081
1067 1082 /*
1068 1083 * kick off pageout scheduler.
1069 1084 */
1070 1085 schedpaging(NULL);
1071 1086
1072 1087 /*
1073 1088 * Create kernel cage thread.
1074 1089 * The kernel cage thread is started under the pageout process
1075 1090 * to take advantage of the less restricted page allocation
1076 1091 * in page_create_throttle().
1077 1092 */
1078 1093 kcage_cageout_init();
1079 1094
1080 1095 /*
1081 1096 * Limit pushes to avoid saturating pageout devices.
1082 1097 */
1083 1098 max_pushes = maxpgio / SCHEDPAGING_HZ;
1084 1099 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1085 1100
1086 1101 for (;;) {
1087 1102 mutex_enter(&push_lock);
1088 1103
1089 1104 while ((arg = push_list) == NULL || pushes > max_pushes) {
1090 1105 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1091 1106 cv_wait(&push_cv, &push_lock);
1092 1107 pushes = 0;
1093 1108 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1094 1109 }
1095 1110 push_list = arg->a_next;
1096 1111 arg->a_next = NULL;
1097 1112 pageout_pushing = true;
1098 1113 mutex_exit(&push_lock);
1099 1114
1100 1115 DTRACE_PROBE(pageout__push);
1116 +
1101 1117 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1102 1118 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1103 1119 pushes++;
1104 1120 }
1105 1121
1106 1122 /* vp held by checkpage() */
1107 1123 VN_RELE(arg->a_vp);
1108 1124
1109 1125 mutex_enter(&push_lock);
1110 1126 pageout_pushing = false;
1111 1127 pageout_pushcount++;
1112 1128 arg->a_next = req_freelist; /* back on freelist */
1113 1129 req_freelist = arg;
1114 1130 push_list_size--;
1115 1131 mutex_exit(&push_lock);
1116 1132 }
1117 1133 }
1118 1134
1119 1135 /*
1120 1136 * Kernel thread that scans pages looking for ones to free
1121 1137 */
1122 1138 static void
1123 1139 pageout_scanner(void *a)
1124 1140 {
1125 - struct page *fronthand, *backhand;
1126 - uint_t laps, iter = 0;
1141 + struct page *fronthand, *backhand, *fronthandstart;
1142 + struct page *regionstart, *regionend;
1143 + uint_t laps;
1127 1144 callb_cpr_t cprinfo;
1128 - pgcnt_t nscan_cnt, nscan_limit;
1145 + pgcnt_t nscan_cnt, tick;
1129 1146 pgcnt_t pcount;
1130 - uint_t inst = (uint_t)(uintptr_t)a;
1147 + bool bhwrapping, fhwrapping;
1131 1148 hrtime_t sample_start, sample_end;
1132 - kmutex_t pscan_mutex;
1133 - bool sampling;
1149 + uint_t inst = (uint_t)(uintptr_t)a;
1134 1150
1135 1151 VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1136 1152
1137 - mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
1153 + CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
1154 + mutex_enter(&pageout_mutex);
1138 1155
1139 - CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
1140 - mutex_enter(&pscan_mutex);
1141 -
1142 1156 /*
1143 - * Establish the minimum and maximum length of time to be spent
1144 - * scanning pages per wakeup, limiting the scanner duty cycle. The
1145 - * input percentage values (0-100) must be converted to a fraction of
1146 - * the number of nanoseconds in a second of wall time, then further
1147 - * scaled down by the number of scanner wakeups in a second:
1157 + * The restart case does not attempt to point the hands at roughly
1158 + * the right point on the assumption that after one circuit things
1159 + * will have settled down, and restarts shouldn't be that often.
1148 1160 */
1149 - min_pageout_nsec = MAX(1,
1150 - NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
1151 - max_pageout_nsec = MAX(min_pageout_nsec,
1152 - NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
1161 + reset_hands[inst] = B_TRUE;
1153 1162
1163 + pageouts_running++;
1164 + mutex_exit(&pageout_mutex);
1165 +
1154 1166 loop:
1155 1167 cv_signal_pageout();
1156 1168
1169 + mutex_enter(&pageout_mutex);
1170 + pageouts_running--;
1157 1171 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1158 - cv_wait(&proc_pageout->p_cv, &pscan_mutex);
1159 - CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
1172 + cv_wait(&proc_pageout->p_cv, &pageout_mutex);
1173 + CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
1174 + pageouts_running++;
1175 + mutex_exit(&pageout_mutex);
1160 1176
1161 1177 /*
1162 - * Check if pageout has been disabled for debugging purposes:
1178 + * Check if pageout has been disabled for debugging purposes.
1163 1179 */
1164 1180 if (!dopageout) {
1165 1181 goto loop;
1166 1182 }
1167 1183
1168 1184 /*
1169 - * One may reset the clock hands for debugging purposes. Hands will
1170 - * also be reset if memory is added to or removed from the system.
1185 + * One may reset the clock hands and scanned region for debugging
1186 + * purposes. Hands will also be reset on first thread startup, if
1187 + * the number of scanning threads (n_page_scanners) changes, or if
1188 + * memory is added to, or removed from, the system.
1171 1189 */
1172 1190 if (reset_hands[inst]) {
1173 1191 struct page *first;
1174 - pgcnt_t offset = total_pages / n_page_scanners;
1175 1192
1176 1193 reset_hands[inst] = B_FALSE;
1194 +
1177 1195 if (inst >= n_page_scanners) {
1178 1196 /*
1179 - * The desired number of page scanners has been
1180 - * reduced and this instance is no longer wanted.
1181 - * Exit the lwp.
1182 - */
1197 + * The desired number of page scanners has been
1198 + * reduced and this instance is no longer wanted.
1199 + * Exit the lwp.
1200 + */
1183 1201 VERIFY3U(inst, !=, 0);
1184 - mutex_exit(&pscan_mutex);
1202 + DTRACE_PROBE1(pageout__exit, uint_t, inst);
1203 + mutex_enter(&pageout_mutex);
1204 + pageouts_running--;
1205 + mutex_exit(&pageout_mutex);
1185 1206 mutex_enter(&curproc->p_lock);
1186 1207 lwp_exit();
1208 + /* NOTREACHED */
1187 1209 }
1188 1210
1211 + first = page_first();
1212 +
1189 1213 /*
1190 - * The reset case repositions the hands at the proper place
1191 - * on the memory clock face to prevent creep into another
1192 - * thread's active region or when the number of threads has
1193 - * changed.
1194 - *
1195 - * Set the two clock hands to be separated by a reasonable
1196 - * amount, but no more than 360 degrees apart.
1197 - *
1198 - * If inst == 0, backhand starts at first page, otherwise
1199 - * it is (inst * offset) around the memory "clock face" so that
1200 - * we spread out each scanner instance evenly.
1214 + * Each scanner thread gets its own sector of the memory
1215 + * clock face.
1201 1216 */
1202 - first = page_first();
1203 - backhand = page_nextn(first, offset * inst);
1204 - if (handspreadpages >= total_pages) {
1205 - fronthand = page_nextn(backhand, total_pages - 1);
1217 + pgcnt_t span, offset;
1218 +
1219 + span = looppages / n_page_scanners;
1220 + VERIFY3U(span, >, handspreadpages);
1221 +
1222 + offset = inst * span;
1223 + regionstart = page_nextn(first, offset);
1224 + if (inst == n_page_scanners - 1) {
1225 + /* The last instance goes up to the last page */
1226 + regionend = page_nextn(first, looppages - 1);
1206 1227 } else {
1207 - fronthand = page_nextn(backhand, handspreadpages);
1228 + regionend = page_nextn(regionstart, span - 1);
1208 1229 }
1230 +
1231 + backhand = regionstart;
1232 + fronthand = page_nextn(backhand, handspreadpages);
1233 + tick = 1;
1234 +
1235 + bhwrapping = fhwrapping = B_FALSE;
1236 +
1237 + DTRACE_PROBE4(pageout__reset, uint_t, inst,
1238 + pgcnt_t, regionstart, pgcnt_t, regionend,
1239 + pgcnt_t, fronthand);
1209 1240 }
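Reviewer note: for anyone checking the sector arithmetic in the reset path, this is a small userland model of how looppages is divided among scanner instances. The page counts are made up; the real code walks page_t structures with page_nextn() rather than using raw indices.

	#include <stdio.h>

	int
	main(void)
	{
		unsigned long looppages = 1000000;	/* assumed clock-face size */
		unsigned int n_page_scanners = 3;	/* assumed scanner count */
		unsigned long span = looppages / n_page_scanners;

		for (unsigned int inst = 0; inst < n_page_scanners; inst++) {
			unsigned long start = inst * span;
			unsigned long end = (inst == n_page_scanners - 1) ?
			    looppages - 1 : start + span - 1;

			/* The last instance absorbs any remainder pages. */
			printf("scanner %u: pages %lu .. %lu\n", inst, start, end);
		}
		return (0);
	}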
1210 1241
1242 + /*
1243 + * This CPU kstat is only incremented here and we're obviously
1244 + * on this CPU, so no lock.
1245 + */
1211 1246 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1212 1247
1213 1248 /*
1214 1249 * Keep track of the number of times we have scanned all the way around
1215 - * the loop:
1250 + * the loop on this wakeup.
1216 1251 */
1217 1252 laps = 0;
1218 1253
1219 1254 /*
1220 1255 * Track the number of pages visited during this scan so that we can
1221 1256 * periodically measure our duty cycle.
1222 1257 */
1223 - pcount = 0;
1224 1258 nscan_cnt = 0;
1259 + pcount = 0;
1225 1260
1226 - if (PAGE_SCAN_STARTUP) {
1227 - /*
1228 - * We need to measure the rate at which the system is able to
1229 - * scan pages of memory. Each of these initial samples is a
1230 - * scan of all system memory, regardless of whether or not we
1231 - * are experiencing memory pressure.
1232 - */
1233 - nscan_limit = total_pages;
1234 - sampling = true;
1235 - } else {
1236 - nscan_limit = desscan;
1237 - sampling = false;
1238 - }
1261 + DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
1262 + hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
1239 1263
1240 - DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
1241 - page_t *, backhand, page_t *, fronthand);
1264 + /*
1265 + * Record the initial position of the front hand for this cycle so
1266 + * that we can detect when the hand wraps around.
1267 + */
1268 + fronthandstart = fronthand;
1242 1269
1243 1270 sample_start = gethrtime();
1244 1271
1245 1272 /*
1246 1273 * Scan the appropriate number of pages for a single duty cycle.
1247 - * Only scan while at least one of these is true:
1248 - * 1) one or more zones is over its cap
1249 - * 2) there is not enough free memory
1250 - * 3) during page scan startup when determining sample data
1251 1274 */
1252 - while (nscan_cnt < nscan_limit) {
1275 + while (nscan_cnt < desscan) {
1253 1276 checkpage_result_t rvfront, rvback;
1254 1277
1255 - if (!sampling && !zones_over &&
1256 - freemem >= lotsfree + needfree) {
1278 + /*
1279 + * Only scan while at least one of these is true:
1280 + * 1) one or more zones is over its cap
1281 + * 2) there is not enough free memory
1282 + * 3) during page scan startup when determining sample data
1283 + */
1284 + if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
1285 + !zones_over) {
1257 1286 /*
1258 1287 * We are not sampling and enough memory has become
1259 1288 * available that scanning is no longer required.
1260 1289 */
1290 + DTRACE_PROBE1(pageout__memfree, uint_t, inst);
1261 1291 break;
1262 1292 }
1263 1293
1264 - DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
1294 + DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
1265 1295
1266 1296 /*
1267 1297 * Periodically check to see if we have exceeded the CPU duty
1268 1298 * cycle for a single wakeup.
1269 1299 */
1270 1300 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1271 1301 hrtime_t pageout_cycle_nsec;
1272 1302
1273 1303 pageout_cycle_nsec = gethrtime() - sample_start;
1274 1304 if (pageout_cycle_nsec >= pageout_nsec) {
1275 - /*
1276 - * This is where we normally break out of the
1277 - * loop when scanning zones or sampling.
1278 - */
1279 - if (!zones_over) {
1305 + if (!zones_over)
1280 1306 atomic_inc_64(&pageout_timeouts);
1281 - }
1282 1307 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1283 1308 break;
1284 1309 }
1285 1310 }
1286 1311
1287 1312 /*
1288 1313 * If checkpage manages to add a page to the free list,
1289 1314 * we give ourselves another couple of trips around the loop.
1290 1315 */
1291 1316 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1292 1317 laps = 0;
1293 1318 }
1294 1319 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1295 1320 laps = 0;
1296 1321 }
1297 1322
1298 1323 ++pcount;
1299 1324
1300 1325 /*
1301 1326 * This CPU kstat is only incremented here and we're obviously
1302 1327 * on this CPU, so no lock.
1303 1328 */
1304 1329 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305 1330
1306 1331 /*
1307 1332 * Don't include ineligible pages in the number scanned.
1308 1333 */
1309 1334 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310 1335 nscan_cnt++;
1311 1336 }
1312 1337
1313 - backhand = page_next(backhand);
1314 - fronthand = page_next(fronthand);
1338 + if (bhwrapping) {
1339 + backhand = regionstart;
1340 + bhwrapping = B_FALSE;
1341 + } else {
1342 + backhand = page_nextn(backhand, tick);
1343 + if (backhand == regionend)
1344 + bhwrapping = B_TRUE;
1345 + }
1315 1346
1347 + if (fhwrapping) {
1348 + fronthand = regionstart;
1349 + fhwrapping = B_FALSE;
1350 + } else {
1351 + fronthand = page_nextn(fronthand, tick);
1352 + if (fronthand == regionend)
1353 + fhwrapping = B_TRUE;
1354 + }
1355 +
1316 1356 /*
1317 - * The front hand has wrapped around to the first page in the
1318 - * loop.
1357 + * The front hand has wrapped around during this wakeup.
1319 1358 */
1320 - if (fronthand == page_first()) {
1321 - DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
1359 + if (fronthand == fronthandstart) {
1360 + laps++;
1361 + DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
1362 + uint_t, laps);
1322 1363
1323 1364 /*
1324 - * Every 64 wraps we reposition our hands within our
1325 - * region to prevent creep into another thread.
1326 - */
1327 - if ((++iter % pageout_reset_cnt) == 0)
1328 - reset_hands[inst] = B_TRUE;
1329 -
1330 - /*
1331 1365 * This CPU kstat is only incremented here and we're
1332 1366 * obviously on this CPU, so no lock.
1333 1367 */
1334 1368 CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335 1369
1336 1370 /*
1337 1371 		 * If scanning because the system is low on memory,
1338 1372 		 * then when we wraparound memory we want to try to
1339 1373 		 * reclaim more pages.
1340 1374 		 * If scanning only because zones are over their cap,
1341 1375 		 * then wrapping is common and we simply keep going.
1342 - */
1343 - if (freemem < lotsfree + needfree && ++laps > 1) {
1375 + */
1376 + if (laps > 1 && freemem < lotsfree + needfree) {
1344 1377 /*
1345 - * The system is low on memory.
1346 1378 * Extremely unlikely, but it happens.
1347 1379 * We went around the loop at least once
1348 1380 * and didn't get far enough.
1349 1381 * If we are still skipping `highly shared'
1350 1382 * pages, skip fewer of them. Otherwise,
1351 1383 * give up till the next clock tick.
1352 1384 */
1353 - mutex_enter(&pageout_mutex);
1354 1385 if (po_share < MAX_PO_SHARE) {
1355 1386 po_share <<= 1;
1356 - mutex_exit(&pageout_mutex);
1357 1387 } else {
1358 - mutex_exit(&pageout_mutex);
1359 1388 break;
1360 1389 }
1361 1390 }
1362 1391 }
1363 1392 }
1364 1393
1394 + sample_end = gethrtime();
1365 1395 atomic_add_long(&nscan, nscan_cnt);
1366 1396
1367 - sample_end = gethrtime();
1397 + DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
1398      +	    pgcnt_t, nscan_cnt, pgcnt_t, pcount);
1368 1399
1369 - DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370 - uint_t, inst);
1371 -
1372 1400 /*
1373 - * The following two blocks are only relevant when the scanner is
1374 - * first started up. After the scanner runs for a while, neither of
1375 - * the conditions will ever be true again.
1376 - *
1377 1401 * The global variables used below are only modified by this thread and
1378 1402 * only during initial scanning when there is a single page scanner
1379 - * thread running. Thus, we don't use any locking.
1403 + * thread running.
1380 1404 */
1381 1405 if (pageout_new_spread == 0) {
1382 1406 VERIFY3U(inst, ==, 0);
1407 +
1383 1408 if (PAGE_SCAN_STARTUP) {
1384 1409 /*
1385 1410 * Continue accumulating samples until we have enough
1386 - * to get a reasonable value for average scan rate:
1411 + * to get a reasonable value for average scan rate.
1387 1412 */
1388 1413 pageout_sample_pages += pcount;
1389 1414 pageout_sample_etime += sample_end - sample_start;
1390 1415 ++pageout_sample_cnt;
1391 1416 }
1392 1417
1393 1418 if (!PAGE_SCAN_STARTUP) {
1394 1419 /*
1395 1420 * We have enough samples, set the spread.
1396 1421 */
1397 1422 pageout_rate = (hrrate_t)pageout_sample_pages *
1398 1423 (hrrate_t)(NANOSEC) / pageout_sample_etime;
1399 1424 pageout_new_spread = pageout_rate / 10;
1400 1425 setupclock();
1401 1426 }
1402 1427 }
1403 1428
1404 1429 goto loop;
1405 1430 }
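Reviewer note: the startup sampling above reduces to two divisions. A sketch of the arithmetic with invented sample numbers follows; in the kernel the inputs come from timing full scans of memory, not from constants.

	#include <stdio.h>
	#include <stdint.h>
	#include <inttypes.h>

	#define	NANOSEC	1000000000LL

	int
	main(void)
	{
		/* Assumed sample: ~4.2M pages visited in 1.6 seconds of scanning. */
		uint64_t sample_pages = 4194304;
		uint64_t sample_etime = 16 * NANOSEC / 10;

		uint64_t rate = sample_pages * NANOSEC / sample_etime; /* pages/sec */
		uint64_t spread = rate / 10;

		printf("scan rate %" PRIu64 " pages/sec, spread %" PRIu64 " pages\n",
		    rate, spread);
		return (0);
	}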
1406 1431
1407 1432 /*
1408 1433 * The pageout deadman is run once per second by clock().
1409 1434 */
1410 1435 void
1411 1436 pageout_deadman(void)
1412 1437 {
1413 1438 if (panicstr != NULL) {
1414 1439 /*
1415 1440 * There is no pageout after panic.
1416 1441 */
1417 1442 return;
1418 1443 }
1419 1444
1420 1445 if (pageout_deadman_seconds == 0) {
1421 1446 /*
1422 1447 * The deadman is not enabled.
1423 1448 */
1424 1449 return;
1425 1450 }
1426 1451
1427 1452 if (!pageout_pushing) {
1428 1453 goto reset;
1429 1454 }
1430 1455
1431 1456 /*
1432 1457 * We are pushing a page. Check to see if it is the same call we saw
1433 1458 * last time we looked:
1434 1459 */
1435 1460 if (pageout_pushcount != pageout_pushcount_seen) {
1436 1461 /*
1437 1462 * It is a different call from the last check, so we are not
1438 1463 * stuck.
1439 1464 */
1440 1465 goto reset;
1441 1466 }
1442 1467
1443 1468 if (++pageout_stucktime >= pageout_deadman_seconds) {
1444 1469 panic("pageout_deadman: stuck pushing the same page for %d "
1445 1470 "seconds (freemem is %lu)", pageout_deadman_seconds,
1446 1471 freemem);
1447 1472 }
1448 1473
1449 1474 return;
1450 1475
1451 1476 reset:
1452 1477 /*
1453 1478 * Reset our tracking state to reflect that we are not stuck:
1454 1479 */
1455 1480 pageout_stucktime = 0;
1456 1481 pageout_pushcount_seen = pageout_pushcount;
1457 1482 }
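Reviewer note: a compact model of the progress check above. The deadman only counts a second as "stuck" while a push is in flight and the push counter has not advanced since the last sample; the names and the tunable value below are illustrative stand-ins.

	#include <stdio.h>
	#include <stdbool.h>

	static unsigned deadman_seconds = 90;	/* assumed tunable value */
	static unsigned long pushcount, pushcount_seen, stucktime;
	static bool pushing;

	/* Called once per simulated second, mirroring pageout_deadman(). */
	static void
	deadman_tick(void)
	{
		if (pushing && pushcount == pushcount_seen) {
			if (++stucktime >= deadman_seconds)
				printf("stuck for %lu seconds: would panic\n",
				    stucktime);
			return;
		}
		stucktime = 0;			/* progress was made: reset */
		pushcount_seen = pushcount;
	}

	int
	main(void)
	{
		pushing = true;
		for (int sec = 0; sec < 3; sec++)
			deadman_tick();		/* pushcount never moves */
		printf("stucktime after 3 ticks: %lu\n", stucktime);
		return (0);
	}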
1458 1483
1459 1484 /*
1460 1485 * Look at the page at hand. If it is locked (e.g., for physical i/o),
1461 1486 * system (u., page table) or free, then leave it alone. Otherwise,
1462 1487 * if we are running the front hand, turn off the page's reference bit.
1463 1488 * If the proc is over maxrss, we take it. If running the back hand,
1464 1489 * check whether the page has been reclaimed. If not, free the page,
1465 1490 * pushing it to disk first if necessary.
1466 1491 *
1467 1492 * Return values:
1468 1493 * CKP_INELIGIBLE if the page is not a candidate at all,
1469 1494 * CKP_NOT_FREED if the page was not freed, or
1470 1495 * CKP_FREED if we freed it.
1471 1496 */
1472 1497 static checkpage_result_t
1473 1498 checkpage(struct page *pp, pageout_hand_t whichhand)
1474 1499 {
1475 1500 int ppattr;
1476 1501 int isfs = 0;
1477 1502 int isexec = 0;
1478 1503 int pagesync_flag;
1479 1504 zoneid_t zid = ALL_ZONES;
1480 1505
1481 1506 /*
1482 1507 * Skip pages:
1483 1508 * - associated with the kernel vnode since
1484 1509 * they are always "exclusively" locked.
1485 1510 * - that are free
1486 1511 * - that are shared more than po_share'd times
1487 1512 	 *	- that are already locked
1488 1513 *
1489 1514 * NOTE: These optimizations assume that reads are atomic.
1490 1515 */
1491 1516
1492 1517 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1493 1518 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1494 1519 hat_page_checkshare(pp, po_share)) {
1495 1520 return (CKP_INELIGIBLE);
1496 1521 }
1497 1522
1498 1523 if (!page_trylock(pp, SE_EXCL)) {
1499 1524 /*
1500 1525 * Skip the page if we can't acquire the "exclusive" lock.
1501 1526 */
1502 1527 return (CKP_INELIGIBLE);
1503 1528 } else if (PP_ISFREE(pp)) {
1504 1529 /*
1505 1530 * It became free between the above check and our actually
1506 1531 * locking the page. Oh well, there will be other pages.
1507 1532 */
1508 1533 page_unlock(pp);
1509 1534 return (CKP_INELIGIBLE);
1510 1535 }
1511 1536
1512 1537 /*
1513 1538 * Reject pages that cannot be freed. The page_struct_lock
1514 1539 * need not be acquired to examine these
1515 1540 * fields since the page has an "exclusive" lock.
1516 1541 */
1517 1542 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1518 1543 page_unlock(pp);
1519 1544 return (CKP_INELIGIBLE);
1520 1545 }
1521 1546
1522 1547 if (zones_over) {
1523 1548 ASSERT(pp->p_zoneid == ALL_ZONES ||
1524 1549 pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1525 1550 if (pp->p_zoneid == ALL_ZONES ||
1526 1551 zone_pdata[pp->p_zoneid].zpers_over == 0) {
1527 1552 /*
1528 - * Cross-zone shared page, or zone not over it's cap.
1529 - * Leave the page alone.
1530 - */
1553      +			 * Cross-zone shared page, or zone not over its cap.
1554 + * Leave the page alone.
1555 + */
1531 1556 page_unlock(pp);
1532 1557 return (CKP_INELIGIBLE);
1533 1558 }
1534 1559 zid = pp->p_zoneid;
1535 1560 }
1536 1561
1537 1562 /*
1538 1563 * Maintain statistics for what we are freeing
1539 1564 */
1540 -
1541 1565 if (pp->p_vnode != NULL) {
1542 1566 if (pp->p_vnode->v_flag & VVMEXEC)
1543 1567 isexec = 1;
1544 1568
1545 1569 if (!IS_SWAPFSVP(pp->p_vnode))
1546 1570 isfs = 1;
1547 1571 }
1548 1572
1549 1573 /*
1550 1574 * Turn off REF and MOD bits with the front hand.
1551 1575 * The back hand examines the REF bit and always considers
1552 1576 * SHARED pages as referenced.
1553 1577 */
1554 1578 if (whichhand == POH_FRONT) {
1555 1579 pagesync_flag = HAT_SYNC_ZERORM;
1556 1580 } else {
1557 1581 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1558 1582 HAT_SYNC_STOPON_SHARED;
1559 1583 }
1560 1584
1561 1585 ppattr = hat_pagesync(pp, pagesync_flag);
1562 1586
1563 1587 recheck:
1564 1588 /*
1565 1589 * If page is referenced; make unreferenced but reclaimable.
1566 1590 * If this page is not referenced, then it must be reclaimable
1567 1591 * and we can add it to the free list.
1568 1592 */
1569 1593 if (ppattr & P_REF) {
1570 1594 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1571 1595 pageout_hand_t, whichhand);
1572 1596
1573 1597 if (whichhand == POH_FRONT) {
1574 1598 /*
1575 1599 * Checking of rss or madvise flags needed here...
1576 1600 *
1577 1601 * If not "well-behaved", fall through into the code
1578 1602 * for not referenced.
1579 1603 */
1580 1604 hat_clrref(pp);
1581 1605 }
1582 1606
1583 1607 /*
1584 1608 * Somebody referenced the page since the front
1585 1609 * hand went by, so it's not a candidate for
1586 1610 * freeing up.
1587 1611 */
1588 1612 page_unlock(pp);
1589 1613 return (CKP_NOT_FREED);
1590 1614 }
1591 1615
1592 1616 VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1593 1617
1594 1618 /*
1595 1619 * If large page, attempt to demote it. If successfully demoted,
1596 1620 * retry the checkpage.
1597 1621 */
1598 1622 if (pp->p_szc != 0) {
1599 1623 if (!page_try_demote_pages(pp)) {
1600 1624 VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1601 1625 page_unlock(pp);
1602 1626 return (CKP_INELIGIBLE);
1603 1627 }
1604 1628
1605 1629 ASSERT(pp->p_szc == 0);
1606 1630 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1607 1631
1608 1632 /*
1609 1633 * Since page_try_demote_pages() could have unloaded some
1610 1634 * mappings it makes sense to reload ppattr.
1611 1635 */
1612 1636 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1613 1637 }
1614 1638
1615 1639 /*
1616 1640 * If the page is currently dirty, we have to arrange to have it
1617 1641 * cleaned before it can be freed.
1618 1642 *
1619 1643 * XXX - ASSERT(pp->p_vnode != NULL);
1620 1644 */
1621 1645 if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1622 1646 struct vnode *vp = pp->p_vnode;
1623 1647 u_offset_t offset = pp->p_offset;
1624 1648
1625 1649 /*
1626 1650 * XXX - Test for process being swapped out or about to exit?
1627 1651 * [Can't get back to process(es) using the page.]
1628 1652 */
1629 1653
1630 1654 /*
1631 1655 * Hold the vnode before releasing the page lock to
1632 1656 * prevent it from being freed and re-used by some
1633 1657 * other thread.
1634 1658 */
1635 1659 VN_HOLD(vp);
1636 1660 page_unlock(pp);
1637 1661
1638 1662 /*
1639 1663 * Queue I/O request for the pageout thread.
1640 1664 */
1641 1665 if (!queue_io_request(vp, offset)) {
1642 1666 VN_RELE(vp);
1643 1667 return (CKP_NOT_FREED);
1644 1668 }
1645 1669 if (isfs) {
1646 1670 zone_pageout_stat(zid, ZPO_DIRTY);
1647 1671 } else {
1648 1672 zone_pageout_stat(zid, ZPO_ANONDIRTY);
1649 1673 }
1650 1674 return (CKP_FREED);
1651 1675 }
1652 1676
1653 1677 /*
1654 1678 * Now we unload all the translations and put the page back on to the
1655 1679 * free list. If the page was used (referenced or modified) after the
1656 1680 * pagesync but before it was unloaded we catch it and handle the page
1657 1681 * properly.
1658 1682 */
1659 1683 DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1660 1684 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1661 1685 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1662 1686 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1663 1687 goto recheck;
1664 1688 }
1665 1689
1666 1690 VN_DISPOSE(pp, B_FREE, 0, kcred);
1667 1691
1668 1692 CPU_STATS_ADD_K(vm, dfree, 1);
1669 1693
1670 1694 if (isfs) {
1671 1695 if (isexec) {
1672 1696 CPU_STATS_ADD_K(vm, execfree, 1);
1673 1697 } else {
1674 1698 CPU_STATS_ADD_K(vm, fsfree, 1);
1675 1699 }
1676 1700 zone_pageout_stat(zid, ZPO_FS);
1677 1701 } else {
1678 1702 CPU_STATS_ADD_K(vm, anonfree, 1);
1679 1703 zone_pageout_stat(zid, ZPO_ANON);
1680 1704 }
1681 1705
1682 1706 return (CKP_FREED);
1683 1707 }
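Reviewer note: to see the front/back hand interplay of checkpage() in isolation, here is a toy model in which a page is reduced to a single reference bit. The hand spread, page count, and reference timing are all invented.

	#include <stdbool.h>
	#include <stdio.h>

	#define	NPAGES	16

	int
	main(void)
	{
		bool ref[NPAGES] = { false };
		int spread = 4;			/* stand-in for handspreadpages */

		ref[9] = true;			/* referenced before the scan began */

		for (int i = 0; i < NPAGES; i++) {
			ref[(i + spread) % NPAGES] = false; /* front hand: clear REF */

			if (i == 6)
				ref[6] = true;	/* re-referenced between the hands */

			if (!ref[i])		/* back hand: free if still clear */
				printf("page %2d would be freed\n", i);
		}
		return (0);
	}

In this model, page 9's earlier reference is erased by the front hand and, absent a new touch, the back hand frees it; page 6 survives because it was referenced again within the hand spread.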
1684 1708
1685 1709 /*
1686 1710 * Queue async i/o request from pageout_scanner and segment swapout
1687 1711 * routines on one common list. This ensures that pageout devices (swap)
1688 1712 * are not saturated by pageout_scanner or swapout requests.
1689 1713 * The pageout thread empties this list by initiating i/o operations.
1690 1714 */
1691 1715 int
1692 1716 queue_io_request(vnode_t *vp, u_offset_t off)
1693 1717 {
1694 1718 struct async_reqs *arg;
1695 1719
1696 1720 /*
1697 1721 * If we cannot allocate an async request struct,
1698 1722 * skip this page.
1699 1723 */
1700 1724 mutex_enter(&push_lock);
1701 1725 if ((arg = req_freelist) == NULL) {
1702 1726 mutex_exit(&push_lock);
1703 1727 return (0);
1704 1728 }
1705 1729 req_freelist = arg->a_next; /* adjust freelist */
1706 1730 push_list_size++;
1707 1731
1708 1732 arg->a_vp = vp;
1709 1733 arg->a_off = off;
1710 1734 arg->a_len = PAGESIZE;
1711 1735 arg->a_flags = B_ASYNC | B_FREE;
1712 1736 arg->a_cred = kcred; /* always held */
1713 1737
1714 1738 /*
1715 1739 * Add to list of pending write requests.
1716 1740 */
1717 1741 arg->a_next = push_list;
1718 1742 push_list = arg;
1719 1743
1720 1744 if (req_freelist == NULL) {
1721 1745 /*
1722 1746 * No free async requests left. The lock is held so we
1723 1747 * might as well signal the pusher thread now.
1724 1748 */
1725 1749 cv_signal(&push_cv);
1726 1750 }
1727 1751 mutex_exit(&push_lock);
1728 1752 return (1);
1729 1753 }
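Reviewer note: the request-recycling pattern above can be shown in a few lines of single-threaded C. The structure and names below are illustrative stand-ins for struct async_reqs, req_freelist, and push_list, with the push_lock locking omitted; the point is that queueing never allocates and simply fails when the pool is empty.

	#include <stdio.h>
	#include <stddef.h>

	struct req {
		struct req *next;
		unsigned long off;
	};

	static struct req pool[4];
	static struct req *freelist, *pending;

	static int
	queue_req(unsigned long off)
	{
		struct req *r = freelist;

		if (r == NULL)
			return (0);	/* no free slot: caller skips the page */
		freelist = r->next;
		r->off = off;
		r->next = pending;	/* LIFO pending list, as with push_list */
		pending = r;
		return (1);
	}

	int
	main(void)
	{
		for (size_t i = 0; i < 4; i++) {	/* build the freelist */
			pool[i].next = freelist;
			freelist = &pool[i];
		}
		for (unsigned long off = 0; off < 6; off++)
			printf("queue %lu -> %s\n", off,
			    queue_req(off) ? "ok" : "skipped");
		return (0);
	}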
1730 1754
1731 1755 /*
1732 1756 * Wakeup pageout to initiate i/o if push_list is not empty.
1733 1757 */
1734 1758 void
1735 1759 cv_signal_pageout()
1736 1760 {
1737 1761 if (push_list != NULL) {
1738 1762 mutex_enter(&push_lock);
1739 1763 cv_signal(&push_cv);
1740 1764 mutex_exit(&push_lock);
1741 1765 }
1742 1766 }