--- old/usr/src/cmd/zoneadmd/mcap.c
+++ new/usr/src/cmd/zoneadmd/mcap.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * This file implements the code which runs a thread inside zoneadmd to cap
28 28 * the associated zone's physical memory. A thread to do this is started
29 29 * when the zone boots and is halted when the zone shuts down.
30 30 *
31 31 * Because of the way that the VM system is currently implemented, there is no
32 32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
34 34 * memory cap. Instead, we implement a soft physical memory cap which looks
35 35 * at the zone's overall rss and once it is over the cap, works from the top
36 36 * down (zone to process to page), looking at zone processes, to determine
37 37 * what to try to pageout to get the zone under its memory cap.
38 38 *
39 39 * The code uses the fast, cheap, but potentially very inaccurate sum of the
40 40 * rss values from psinfo_t to first approximate the zone's rss and will
41 41 * fallback to the vm_getusage syscall to determine the zone's rss if needed.
42 42 * It then checks the rss against the zone's zone.max-physical-memory rctl.
43 43 * Once the zone goes over its cap, then this thread will work through the
44 44 * zone's /proc process list, Pgrab-bing each process and stepping through the
45 45 * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
46 46 * to pageout pages, until the zone is again under its cap.
47 47 *
48 48 * Although zone memory capping is implemented as a soft cap by this user-level
49 49 * thread, the interfaces around memory caps that are exposed to the user are
50 50 * the standard ones; an rctl and kstats. This thread uses the rctl value
51 51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 52 * If the implementation ever moves into the kernel, these exposed interfaces
53 53 * do not need to change.
54 54 *
55 55 * The thread adaptively sleeps, periodically checking the state of the
56 56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 57 * more often to check the zone's status. Once the zone is over the cap,
58 58 * the thread will work to pageout until the zone is under the cap, as shown
59 59 * by updated vm_usage data.
60 60 *
61 61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 64 * segvn_softunlock -^
65 65 * On SPARC there is an additional code path which may make this data
66 66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 68 * do more with the data on active pages.
69 69 *
70 70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 71 * cause the thread to start logging its actions into that file (it may take
72 72 * a minute or two if the thread is currently sleeping). Removing that
73 73 * file will cause logging to stop.
74 74 */
75 75
76 76 #include <sys/mman.h>
77 77 #include <sys/param.h>
78 78 #include <sys/stat.h>
79 79 #include <sys/types.h>
80 80 #include <assert.h>
81 81 #include <errno.h>
82 82 #include <fcntl.h>
83 83 #include <libproc.h>
84 84 #include <limits.h>
85 85 #include <procfs.h>
86 86 #include <stdio.h>
87 87 #include <stdlib.h>
88 88 #include <strings.h>
89 89 #include <time.h>
90 90 #include <unistd.h>
91 91 #include <sys/priocntl.h>
92 92 #include <dirent.h>
93 93 #include <zone.h>
94 94 #include <libzonecfg.h>
95 95 #include <thread.h>
96 96 #include <values.h>
97 97 #include <sys/vm_usage.h>
98 98 #include <sys/resource.h>
99 99 #include <sys/debug.h>
100 100 #include <synch.h>
101 101 #include <wait.h>
102 102 #include <libcontract.h>
103 103 #include <libcontract_priv.h>
104 104 #include <sys/contract/process.h>
105 105 #include "zoneadmd.h"
106 106
107 107 /* round up to next y = 2^n */
108 108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109 109
110 110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111 111
112 112 /*
113 113 * zonecfg attribute tunables for memory capping.
114 114 * phys-mcap-cmd
115 115 * type: string
116 116 * specifies a command that can be run when over the cap
117 117 * phys-mcap-no-vmusage
118 118 * type: boolean
119 119 * true disables vm_getusage and just uses zone's proc. rss sum
120 120 * phys-mcap-no-pageout
121 121 * type: boolean
122 122 * true disables pageout when over
123 123 * phys-mcap-no-pf-throttle
124 124 * type: boolean
125 125 * true disables page fault throttling when over
126 126 */
127 127 #define TUNE_CMD "phys-mcap-cmd"
128 128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131 131
132 132 /*
133 + * The large mapping value was derived empirically by seeing that mappings
134 + * much bigger than 16mb sometimes take a relatively long time to invalidate
135 + * (significant fraction of a second).
136 + */
137 +#define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 +#define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 +#define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140 +
141 +/*
133 142 * These are only used in get_mem_info but global. We always need scale_rss and
134 143 * prev_fast_rss to be persistent but we also have the other two global so we
135 144 * can easily see these with mdb.
136 145 */
137 146 uint64_t scale_rss = 0;
138 147 uint64_t prev_fast_rss = 0;
139 148 uint64_t fast_rss = 0;
140 149 uint64_t accurate_rss = 0;
141 150
static char zoneproc[MAXPATHLEN];	/* path of the zone's /proc */
static char debug_log[MAXPATHLEN];	/* path of {zonepath}/mcap_debug.log */
static zoneid_t zid;			/* ID of the zone we're capping */
static mutex_t shutdown_mx;		/* protects shutting_down flag */
static cond_t shutdown_cv;		/* signalled to wake sleep_shutdown */
static int shutting_down = 0;		/* nonzero once zone halt begins */
static thread_t mcap_tid;		/* the memory-capping thread */
static FILE *debug_log_fp = NULL;	/* non-NULL while debug logging on */
static uint64_t zone_rss_cap;		/* RSS cap(KB) */
static char over_cmd[2 * BUFSIZ];	/* same size as zone_attr_value */
static boolean_t skip_vmusage = B_FALSE;	/* TUNE_NVMU */
static boolean_t skip_pageout = B_FALSE;	/* TUNE_NPAGE */
static boolean_t skip_pf_throttle = B_FALSE;	/* TUNE_NPFTHROT */

static int64_t check_suspend();
static void get_mcap_tunables();
160 167
/*
 * Structure to hold current state about a process address space that we're
 * working on.
 */
typedef struct {
	int pr_curr;		/* the # of the mapping we're working on */
	int pr_nmap;		/* number of mappings in address space */
	prmap_t *pr_mapp;	/* process's map array */
} proc_map_t;
170 177
/*
 * 64-bit-aligned image of vmusage_t so a 32-bit zoneadmd can exchange this
 * structure with an amd64 kernel via the SYS_rusagesys syscall.
 */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;
187 194
188 195 /*
189 196 * Output a debug log message.
190 197 */
191 198 /*PRINTFLIKE1*/
192 199 static void
193 200 debug(char *fmt, ...)
194 201 {
195 202 va_list ap;
196 203
197 204 if (debug_log_fp == NULL)
198 205 return;
199 206
200 207 va_start(ap, fmt);
201 208 (void) vfprintf(debug_log_fp, fmt, ap);
202 209 va_end(ap);
203 210 (void) fflush(debug_log_fp);
204 211 }
205 212
206 213 /*
207 214 * Like sleep(3C) but can be interupted by cond_signal which is posted when
208 215 * we're shutting down the mcap thread.
209 216 */
210 217 static void
211 218 sleep_shutdown(int secs)
212 219 {
213 220 timestruc_t to;
214 221
215 222 to.tv_sec = secs;
216 223 to.tv_nsec = 0;
217 224
218 225 (void) mutex_lock(&shutdown_mx);
219 226 if (!shutting_down)
220 227 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
221 228 (void) mutex_unlock(&shutdown_mx);
222 229 }
223 230
224 231 static boolean_t
225 232 proc_issystem(pid_t pid)
226 233 {
227 234 char pc_clname[PC_CLNMSZ];
228 235
229 236 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
230 237 PC_KY_NULL) != -1)
231 238 return (strcmp(pc_clname, "SYS") == 0);
232 239
233 240 return (B_TRUE);
234 241 }
235 242
/*
 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 * Best-effort: all failures are silent except a logged debug message when
 * the command itself fails.
 */
static void
run_over_cmd()
{
	int ctfd;
	int err;
	pid_t childpid;
	siginfo_t info;
	ctid_t ct;

	/*
	 * Before we enter the zone, we need to create a new process contract
	 * for the child, as required by zone_enter().
	 */
	if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
		return;
	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
	    ct_tmpl_activate(ctfd) != 0) {
		(void) close(ctfd);
		return;
	}

	childpid = fork();
	switch (childpid) {
	case -1:	/* fork failed; deactivate template and bail */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		break;
	case 0:	/* Child */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		if (zone_enter(zid) == -1)
			_exit(errno);
		err = system(over_cmd);
		_exit(err);
		break;
	default:	/* Parent */
		/* remember the child's contract so we can abandon it below */
		if (contract_latest(&ct) == -1)
			ct = -1;
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		err = waitid(P_PID, childpid, &info, WEXITED);
		(void) contract_abandon_id(ct);
		if (err == -1 || info.si_status != 0)
			debug("over_cmd failed");
		break;
	}
}
289 296
290 297 /*
291 298 * Get the next mapping.
292 299 */
293 300 static prmap_t *
294 301 nextmapping(proc_map_t *pmp)
295 302 {
296 303 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
297 304 return (NULL);
298 305
299 306 return (&pmp->pr_mapp[pmp->pr_curr++]);
300 307 }
301 308
/*
 * Initialize the proc_map_t to access the first mapping of an address space.
 * Reads the whole of {zoneproc}/{pid}/map into a malloc'd buffer; on
 * success returns the first mapping (via nextmapping), otherwise NULL.
 * Caller is responsible for freeing pmp->pr_mapp.
 */
static prmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
	int fd;
	int res;
	struct stat st;
	char pathbuf[MAXPATHLEN];

	bzero(pmp, sizeof (proc_map_t));
	pmp->pr_nmap = -1;

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
		return (NULL);

redo:
	errno = 0;
	if (fstat(fd, &st) != 0)
		goto done;

	if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		goto done;
	}
	(void) bzero(pmp->pr_mapp, st.st_size);

	errno = 0;
	if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
		free(pmp->pr_mapp);
		pmp->pr_mapp = NULL;
		/*
		 * A short read or E2BIG means the map changed size between
		 * the fstat and the pread; re-stat and retry with a buffer
		 * of the new size.
		 */
		if (res > 0 || errno == E2BIG) {
			goto redo;
		} else {
			debug("pid %ld cannot read xmap\n", pid);
			goto done;
		}
	}

	pmp->pr_nmap = st.st_size / sizeof (prmap_t);

done:
	(void) close(fd);
	/* pr_mapp is NULL on the failure paths, so this returns NULL there */
	return (nextmapping(pmp));
}
349 356
350 357 /*
351 358 * Attempt to invalidate the entire mapping from within the given process's
352 359 * address space. May return nonzero with errno as:
353 360 * ESRCH - process not found
354 361 * ENOMEM - segment not found
355 362 * EINVAL - mapping exceeds a single segment
356 363 */
357 364 static int
358 365 pageout_mapping(pid_t pid, prmap_t *pmp)
359 366 {
360 367 int res;
361 368
362 369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
363 370 return (0);
364 371
365 372 errno = 0;
366 373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
367 374 pmp->pr_size);
368 375
369 376 return (res);
370 377 }
371 378
/*
 * Work through a process paging out mappings until the whole address space was
 * examined or the excess is < 0. Return our estimate of the updated excess.
 *
 * "excess" is the number of KB we believe the zone is over its cap; each
 * mapping invalidation is credited/debited by re-reading the process's rss
 * from psinfo and folding the delta back into the excess.
 */
static int64_t
pageout_process(pid_t pid, int64_t excess)
{
	int psfd;
	prmap_t *pmap;
	proc_map_t cur;
	int64_t sum_d_rss, d_rss;
	int64_t old_rss;
	int map_cnt;
	psinfo_t psinfo;
	char pathbuf[MAXPATHLEN];

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
	    pid);
	if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
		return (excess);

	cur.pr_mapp = NULL;

	if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
		goto done;

	old_rss = (int64_t)psinfo.pr_rssize;
	map_cnt = 0;

	/* If unscannable, skip it. */
	if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
		debug("pid %ld: system process, skipping %s\n",
		    pid, psinfo.pr_psargs);
		goto done;
	}

	/* If tiny RSS (16KB), skip it. */
	if (old_rss <= 16) {
		debug("pid %ld: skipping, RSS %lldKB %s\n",
		    pid, old_rss, psinfo.pr_psargs);
		goto done;
	}

	/* Get segment residency information. */
	pmap = init_map(&cur, pid);

	/* Skip process if it has no mappings. */
	if (pmap == NULL) {
		debug("pid %ld: map unreadable; ignoring\n", pid);
		goto done;
	}

	debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
	    pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);

	/*
	 * Within the process's address space, attempt to page out mappings.
	 */
	sum_d_rss = 0;
	while (excess > 0 && pmap != NULL && !shutting_down) {
		/* invalidate the entire mapping */
		if (pageout_mapping(pid, pmap) < 0)
			debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
			    pid, (void *)pmap->pr_vaddr,
			    (long)pmap->pr_size / 1024L, errno);

		map_cnt++;

		/*
		 * Re-check the process rss and get the delta.
		 */
		if (pread(psfd, &psinfo, sizeof (psinfo), 0)
		    != sizeof (psinfo)) {
			/* process went away; assume its whole rss was freed */
			excess -= old_rss;
			goto done;
		}

		d_rss = (int64_t)psinfo.pr_rssize - old_rss;
		old_rss = (int64_t)psinfo.pr_rssize;
		sum_d_rss += d_rss;

		/*
		 * d_rss hopefully should be negative (or 0 if nothing
		 * invalidated) but can be positive if more got paged in.
		 */
		excess += d_rss;

		if (excess <= 0) {
			debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
			    "excess %lldKB\n", pid, map_cnt,
			    (unsigned long long)sum_d_rss, (long long)excess);
			map_cnt = 0;

			/*
			 * If we're actually under, this will suspend checking
			 * in the middle of this process's address space.
			 */
			excess = check_suspend();
			if (shutting_down)
				goto done;

			/*
			 * since we might have suspended, re-read process's rss
			 */
			if (pread(psfd, &psinfo, sizeof (psinfo), 0)
			    != sizeof (psinfo)) {
				excess -= old_rss;
				goto done;
			}

			old_rss = (int64_t)psinfo.pr_rssize;

			debug("pid %ld: resume pageout; excess %lld\n", pid,
			    (long long)excess);
			sum_d_rss = 0;
		}

		pmap = nextmapping(&cur);
	}

	debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
	    pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);

done:
	if (cur.pr_mapp != NULL)
		free(cur.pr_mapp);

	(void) close(psfd);

	if (shutting_down)
		return (0);

	return (excess);
}
506 513
/*
 * Get the zone's RSS data (in KB).  Returns 0 when shutting down or on any
 * failure to obtain usage data.
 */
static uint64_t
get_mem_info()
{
	uint64_t n = 1;
	zsd_vmusage64_t buf;
	uint64_t tmp_rss;
	DIR *pdir = NULL;
	struct dirent *dent;

	/*
	 * Start by doing the fast, cheap RSS calculation using the rss value
	 * in psinfo_t. Because that's per-process, it can lead to double
	 * counting some memory and overestimating how much is being used, but
	 * as long as that's not over the cap, then we don't need do the
	 * expensive calculation.
	 *
	 * If we have to do the expensive calculation, we remember the scaling
	 * factor so that we can try to use that on subsequent iterations for
	 * the fast rss.
	 */
	if (shutting_down)
		return (0);

	if ((pdir = opendir(zoneproc)) == NULL)
		return (0);

	accurate_rss = 0;
	fast_rss = 0;
	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
		pid_t pid;
		int psfd;
		int64_t rss;
		char pathbuf[MAXPATHLEN];
		psinfo_t psinfo;

		if (strcmp(".", dent->d_name) == 0 ||
		    strcmp("..", dent->d_name) == 0)
			continue;

		/* skip init and zsched (pids 0 and 1 in the zone's /proc) */
		pid = atoi(dent->d_name);
		if (pid == 0 || pid == 1)
			continue;

		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
		    zoneproc, pid);

		rss = 0;
		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
			    sizeof (psinfo))
				rss = (int64_t)psinfo.pr_rssize;

			(void) close(psfd);
		}

		fast_rss += rss;
	}

	(void) closedir(pdir);

	if (shutting_down)
		return (0);

	debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
	    scale_rss, prev_fast_rss);

	/* see if we can get by with a scaled fast rss */
	tmp_rss = fast_rss;
	if (scale_rss > 1 && prev_fast_rss > 0) {
		/*
		 * Only scale the fast value if it hasn't ballooned too much
		 * to trust.
		 */
		if (fast_rss / prev_fast_rss < 2) {
			fast_rss /= scale_rss;
			debug("scaled fast rss: %lluKB\n", fast_rss);
		}
	}

	if (fast_rss <= zone_rss_cap || skip_vmusage) {
		uint64_t zone_rss_bytes;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
		return (fast_rss);
	}

	buf.vmu_id = zid;

	/* get accurate usage (cached data may be up to 5 seconds old) */
	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
	    (uintptr_t)&buf, (uintptr_t)&n) != 0) {
		debug("vmusage failed\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (n > 1) {
		/* This should never happen */
		debug("vmusage returned more than one result\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (buf.vmu_id != zid) {
		/* This should never happen */
		debug("vmusage returned the incorrect zone\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	accurate_rss = buf.vmu_rss_all / 1024;

	/* calculate scaling factor to use for fast_rss from now on */
	if (accurate_rss > 0) {
		scale_rss = fast_rss / accurate_rss;
		debug("new scaling factor: %llu\n", scale_rss);
		/* remember the fast rss when we had to get the accurate rss */
		prev_fast_rss = tmp_rss;
	}

	debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
	    scale_rss, prev_fast_rss);
	return (accurate_rss);
}
636 643
/*
 * Needed to read the zones physical-memory-cap rctl.  Walks the zone's
 * /proc looking for any process we can Pgrab and plant an agent LWP in;
 * returns the grabbed handle (with agent created) or NULL.  Caller must
 * Pdestroy_agent() and Prelease() the handle.
 */
static struct ps_prochandle *
grab_zone_proc()
{
	DIR *dirp;
	struct dirent *dentp;
	struct ps_prochandle *ph = NULL;
	int tmp;

	if ((dirp = opendir(zoneproc)) == NULL)
		return (NULL);

	while (!shutting_down && (dentp = readdir(dirp))) {
		int pid;

		if (strcmp(".", dentp->d_name) == 0 ||
		    strcmp("..", dentp->d_name) == 0)
			continue;

		pid = atoi(dentp->d_name);
		/* attempt to grab process */
		if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
			/* run-on-last-close so the victim resumes if we die */
			if (Psetflags(ph, PR_RLC) == 0) {
				if (Pcreate_agent(ph) == 0) {
					(void) closedir(dirp);
					return (ph);
				}
			}
			Prelease(ph, 0);
		}
	}

	(void) closedir(dirp);
	return (NULL);
}
674 681
/*
 * Read the zone.max-physical-memory rctl (in bytes) by running an agent in
 * one of the zone's processes.  Returns UINT64_MAX if the cap cannot be
 * determined.
 */
static uint64_t
get_zone_cap()
{
	rctlblk_t *rblk;
	uint64_t mcap;
	struct ps_prochandle *ph;

	if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
		return (UINT64_MAX);

	if ((ph = grab_zone_proc()) == NULL) {
		free(rblk);
		return (UINT64_MAX);
	}

	if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
	    RCTL_FIRST)) {
		Pdestroy_agent(ph);
		Prelease(ph, 0);
		free(rblk);
		return (UINT64_MAX);
	}

	Pdestroy_agent(ph);
	Prelease(ph, 0);

	mcap = rctlblk_get_value(rblk);
	free(rblk);
	return (mcap);
}
705 712
/*
 * check_suspend is invoked at the beginning of every pass through the process
 * list or after we've paged out enough so that we think the excess is under
 * the cap. The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap. The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * Depending on the percentage of penetration of the zone's rss into the
 * cap we sleep for longer or shorter amounts. This reduces the impact of this
 * work on the system, which is important considering that each zone will be
 * monitoring its rss.
 */
static int64_t
check_suspend()
{
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;		/* extra 5% to page out when over */
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS(KB) */

		/*
		 * Check if the debug log file exists and enable or disable
		 * debug accordingly.
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			/*
			 * We allow the memory cap tunables to be changed on
			 * the fly.
			 */
			get_mcap_tunables();

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info();

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If necessary, updates stats.
		 */

		/*
		 * If it looks like we did some paging out since last over the
		 * cap then update the kstat so we can approximate how much was
		 * paged out.
		 */
		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			uint64_t n = 1;

			/* Increment "nover" kstat. */
			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

			if (!skip_pf_throttle) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up. If we are persistently over the
				 * cap the delay ramps up to a max of 2000usecs.
				 * Note that for delays less than 1 tick
				 * (i.e. all of these) we busy-wait in as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
				    &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, then we want to
			 * page out a little extra instead of stopping
			 * right at the cap. To do this we add 5% to
			 * the excess so that pageout_process will work
			 * a little longer before stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage. Also, scale the acceptable age of
		 * cached results from vm_getusage. We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
		} else {
			sleep_time = 30;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
	}

	return (0);
}
897 904
/*
 * (Re)load the phys-mcap-* tunables from the zone's configuration into the
 * over_cmd/skip_* globals.  Called periodically so the tunables can be
 * changed on the fly.
 */
static void
get_mcap_tunables()
{
	zone_dochandle_t handle;
	struct zone_attrtab attr;

	/* Clear over_cmd even if we can't read the config below. */
	over_cmd[0] = '\0';
	if ((handle = zonecfg_init_handle()) == NULL)
		return;

	if (zonecfg_get_handle(zone_name, handle) != Z_OK)
		goto done;

	/* Reset to defaults in case rebooting and settings have changed */
	over_cmd[0] = '\0';
	skip_vmusage = B_FALSE;
	skip_pageout = B_FALSE;
	skip_pf_throttle = B_FALSE;

	if (zonecfg_setattrent(handle) != Z_OK)
		goto done;
	while (zonecfg_getattrent(handle, &attr) == Z_OK) {
		if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
			(void) strlcpy(over_cmd, attr.zone_attr_value,
			    sizeof (over_cmd));
		} else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_vmusage = B_TRUE;
		} else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_pageout = B_TRUE;
		} else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_pf_throttle = B_TRUE;
		}
	}
	(void) zonecfg_endattrent(handle);

done:
	zonecfg_fini_handle(handle);
}
939 946
940 947 /* ARGSUSED */
941 948 static int
942 949 chk_proc_fs(void *data, const char *spec, const char *dir,
943 950 const char *fstype, const char *opt)
944 951 {
945 952 if (fstype != NULL && strcmp(fstype, "proc") == 0)
946 953 *((boolean_t *)data) = B_TRUE;
947 954
948 955 return (0);
949 956 }
950 957
951 958 static boolean_t
952 959 has_proc()
953 960 {
954 961 brand_handle_t bh;
955 962 boolean_t fnd = B_FALSE;
956 963
957 964 if ((bh = brand_open(brand_name)) != NULL) {
958 965 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
959 966 }
960 967
961 968 brand_close(bh);
962 969 return (fnd);
963 970 }
964 971
/*
 * We run this loop for brands with no /proc to simply update the RSS, using
 * the cheap GZ /proc data, every 5 minutes.
 */
static void
no_procfs()
{
	DIR *pdir = NULL;
	struct dirent *dent;
	uint64_t zone_rss_bytes;

	(void) sleep_shutdown(30);
	while (!shutting_down) {
		/*
		 * Just do the fast, cheap RSS calculation using the rss value
		 * in psinfo_t. Because that's per-process, it can lead to
		 * double counting some memory and overestimating how much is
		 * being used. Since there is no /proc in the zone, we use the
		 * GZ /proc and check for the correct zone.
		 */
		if ((pdir = opendir("/proc")) == NULL)
			return;

		fast_rss = 0;
		while (!shutting_down && (dent = readdir(pdir)) != NULL) {
			pid_t pid;
			int psfd;
			int64_t rss;
			char pathbuf[MAXPATHLEN];
			psinfo_t psinfo;

			if (strcmp(".", dent->d_name) == 0 ||
			    strcmp("..", dent->d_name) == 0)
				continue;

			/* skip sched and init */
			pid = atoi(dent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			(void) snprintf(pathbuf, sizeof (pathbuf),
			    "/proc/%d/psinfo", pid);

			rss = 0;
			if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
				if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
				    sizeof (psinfo)) {
					/* only count our zone's processes */
					if (psinfo.pr_zoneid == zid)
						rss = (int64_t)psinfo.pr_rssize;
				}

				(void) close(psfd);
			}

			fast_rss += rss;
		}

		(void) closedir(pdir);

		if (shutting_down)
			return;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);

		(void) sleep_shutdown(300);
	}
}
1033 1040
1034 1041 /*
1035 1042 * Thread that checks zone's memory usage and when over the cap, goes through
1036 1043 * the zone's process list trying to pageout processes to get under the cap.
1037 1044 */
1038 1045 static void
1039 1046 mcap_zone()
1040 1047 {
1041 1048 DIR *pdir = NULL;
1042 1049 int64_t excess;
1043 1050
1044 1051 debug("thread startup\n");
1045 1052
1046 1053 get_mcap_tunables();
1047 1054
1048 1055 /*
1049 1056 * If the zone has no /proc filesystem, we can't use the fast algorithm
1050 1057 * to check RSS or pageout any processes. All we can do is periodically
1051 1058 * update it's RSS kstat using the expensive sycall.
1052 1059 */
1053 1060 if (!has_proc()) {
1054 1061 no_procfs();
1055 1062 debug("thread shutdown\n");
1056 1063 return;
1057 1064 }
1058 1065
1059 1066 /*
1060 1067 * When first starting it is likely lots of other zones are starting
1061 1068 * too because the system is booting. Since we just started the zone
1062 1069 * we're not worried about being over the cap right away, so we let
1063 1070 * things settle a bit and tolerate some older data here to minimize
1064 1071 * the load on the system.
1065 1072 */
1066 1073 (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
1067 1074
1068 1075 /* Wait until zone's /proc is mounted */
1069 1076 while (!shutting_down) {
1070 1077 struct stat st;
1071 1078
1072 1079 if (stat(zoneproc, &st) == 0 &&
1073 1080 strcmp(st.st_fstype, "proc") == 0)
1074 1081 break;
1075 1082 sleep_shutdown(5);
1076 1083 }
1077 1084
1078 1085 /* Open zone's /proc and walk entries. */
1079 1086 while (!shutting_down) {
1080 1087 if ((pdir = opendir(zoneproc)) != NULL)
1081 1088 break;
1082 1089 sleep_shutdown(5);
1083 1090 }
1084 1091
1085 1092 while (!shutting_down) {
1086 1093 struct dirent *dirent;
1087 1094
1088 1095 /* Wait until we've gone over the cap. */
1089 1096 excess = check_suspend();
1090 1097
1091 1098 debug("starting to scan, excess %lldk\n", (long long)excess);
1092 1099
1093 1100 if (over_cmd[0] != '\0') {
1094 1101 uint64_t zone_rss; /* total RSS(KB) */
1095 1102
1096 1103 debug("run phys_mcap_cmd: %s\n", over_cmd);
1097 1104 run_over_cmd();
1098 1105
1099 1106 zone_rss = get_mem_info();
1100 1107 excess = zone_rss - zone_rss_cap;
1101 1108 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
1102 1109 zone_rss, zone_rss_cap, excess);
1103 1110 if (excess <= 0)
1104 1111 continue;
1105 1112 }
1106 1113
1107 1114 while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
1108 1115 pid_t pid;
1109 1116
1110 1117 if (strcmp(".", dirent->d_name) == 0 ||
1111 1118 strcmp("..", dirent->d_name) == 0)
1112 1119 continue;
1113 1120
1114 1121 pid = atoi(dirent->d_name);
1115 1122 if (pid == 0 || pid == 1)
1116 1123 continue;
1117 1124
1118 1125 if (skip_pageout)
1119 1126 (void) sleep_shutdown(2);
1120 1127 else
1121 1128 excess = pageout_process(pid, excess);
1122 1129
1123 1130 if (excess <= 0) {
1124 1131 debug("apparently under; excess %lld\n",
1125 1132 (long long)excess);
1126 1133 /* Double check the current excess */
1127 1134 excess = check_suspend();
1128 1135 }
1129 1136 }
1130 1137
1131 1138 debug("process pass done; excess %lld\n", (long long)excess);
1132 1139 rewinddir(pdir);
1133 1140
1134 1141 if (skip_pageout)
1135 1142 (void) sleep_shutdown(120);
1136 1143 }
1137 1144
1138 1145 if (pdir != NULL)
1139 1146 (void) closedir(pdir);
|
↓ open down ↓ |
693 lines elided |
↑ open up ↑ |
1140 1147 debug("thread shutdown\n");
1141 1148 }
1142 1149
1143 1150 void
1144 1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1145 1152 {
1146 1153 int res;
1147 1154
1148 1155 shutting_down = 0;
1149 1156 zid = id;
1150 - logp = zlogp;
1151 1157
1152 1158 /* all but the lx brand currently use /proc */
1153 1159 if (strcmp(brand_name, "lx") == 0) {
1154 1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1155 1161 "%s/root/native/proc", zonepath);
1156 1162 } else {
1157 1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1158 1164 zonepath);
1159 1165 }
1160 1166
1161 1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1162 1168 zonepath);
1163 1169
1164 1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1165 1171 &mcap_tid);
1166 1172 if (res != 0) {
1167 1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1168 1174 res);
1169 1175 mcap_tid = 0;
1170 1176 }
1171 1177 }
1172 1178
1173 1179 void
1174 1180 destroy_mcap_thread()
1175 1181 {
1176 1182 if (mcap_tid != 0) {
1177 1183 shutting_down = 1;
1178 1184 (void) cond_signal(&shutdown_cv);
1179 1185 (void) thr_join(mcap_tid, NULL, NULL);
1180 1186 mcap_tid = 0;
1181 1187 }
1182 1188 }