Print this page
Reduce lint
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/cmd/zoneadmd/mcap.c
+++ new/usr/src/cmd/zoneadmd/mcap.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * This file implements the code which runs a thread inside zoneadmd to cap
28 28 * the associated zone's physical memory. A thread to do this is started
29 29 * when the zone boots and is halted when the zone shuts down.
30 30 *
31 31 * Because of the way that the VM system is currently implemented, there is no
32 32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
34 34 * memory cap. Instead, we implement a soft physical memory cap which looks
35 35 * at the zone's overall rss and once it is over the cap, works from the top
36 36 * down (zone to process to page), looking at zone processes, to determine
37 37 * what to try to pageout to get the zone under its memory cap.
38 38 *
39 39 * The code uses the fast, cheap, but potentially very inaccurate sum of the
40 40 * rss values from psinfo_t to first approximate the zone's rss and will
41 41 * fallback to the vm_getusage syscall to determine the zone's rss if needed.
42 42 * It then checks the rss against the zone's zone.max-physical-memory rctl.
43 43 * Once the zone goes over its cap, then this thread will work through the
44 44 * zone's /proc process list, Pgrab-bing each process and stepping through the
45 45 * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
46 46 * to pageout pages, until the zone is again under its cap.
47 47 *
48 48 * Although zone memory capping is implemented as a soft cap by this user-level
49 49 * thread, the interfaces around memory caps that are exposed to the user are
50 50 * the standard ones; an rctl and kstats. This thread uses the rctl value
51 51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 52 * If the implementation ever moves into the kernel, these exposed interfaces
53 53 * do not need to change.
54 54 *
55 55 * The thread adaptively sleeps, periodically checking the state of the
56 56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 57 * more often to check the zone's status. Once the zone is over the cap,
58 58 * the thread will work to pageout until the zone is under the cap, as shown
59 59 * by updated vm_usage data.
60 60 *
61 61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 64 * segvn_softunlock -^
65 65 * On SPARC there is an additional code path which may make this data
66 66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 68 * do more with the data on active pages.
69 69 *
70 70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 71 * cause the thread to start logging its actions into that file (it may take
72 72 * a minute or two if the thread is currently sleeping). Removing that
73 73 * file will cause logging to stop.
74 74 */
75 75
76 76 #include <sys/mman.h>
77 77 #include <sys/param.h>
78 78 #include <sys/stat.h>
79 79 #include <sys/types.h>
80 80 #include <assert.h>
81 81 #include <errno.h>
82 82 #include <fcntl.h>
83 83 #include <libproc.h>
84 84 #include <limits.h>
85 85 #include <procfs.h>
86 86 #include <stdio.h>
87 87 #include <stdlib.h>
88 88 #include <strings.h>
89 89 #include <time.h>
90 90 #include <unistd.h>
91 91 #include <sys/priocntl.h>
92 92 #include <dirent.h>
93 93 #include <zone.h>
94 94 #include <libzonecfg.h>
95 95 #include <thread.h>
96 96 #include <values.h>
97 97 #include <sys/vm_usage.h>
98 98 #include <sys/resource.h>
99 99 #include <sys/debug.h>
100 100 #include <synch.h>
101 101 #include <wait.h>
102 102 #include <libcontract.h>
103 103 #include <libcontract_priv.h>
104 104 #include <sys/contract/process.h>
105 105 #include "zoneadmd.h"
106 106
107 107 /* round up to next y = 2^n */
108 108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109 109
110 110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111 111
112 112 /*
113 113 * zonecfg attribute tunables for memory capping.
114 114 * phys-mcap-cmd
115 115 * type: string
116 116 * specifies a command that can be run when over the cap
117 117 * phys-mcap-no-vmusage
118 118 * type: boolean
119 119 * true disables vm_getusage and just uses zone's proc. rss sum
120 120 * phys-mcap-no-pageout
121 121 * type: boolean
122 122 * true disables pageout when over
123 123 * phys-mcap-no-pf-throttle
124 124 * type: boolean
125 125 * true disables page fault throttling when over
126 126 */
127 127 #define TUNE_CMD "phys-mcap-cmd"
128 128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131 131
132 132 /*
133 133 * The large mapping value was derived empirically by seeing that mappings
134 134 * much bigger than 16mb sometimes take a relatively long time to invalidate
135 135 * (significant fraction of a second).
136 136 */
137 137 #define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 138 #define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 139 #define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140 140
141 141 /*
142 142 * These are only used in get_mem_info but global. We always need scale_rss and
143 143 * prev_fast_rss to be persistent but we also have the other two global so we
144 144 * can easily see these with mdb.
145 145 */
146 146 uint64_t scale_rss = 0;
147 147 uint64_t prev_fast_rss = 0;
148 148 uint64_t fast_rss = 0;
149 149 uint64_t accurate_rss = 0;
150 150
151 151 static char zoneproc[MAXPATHLEN];
152 152 static char debug_log[MAXPATHLEN];
153 153 static zoneid_t zid;
154 154 static mutex_t shutdown_mx;
|
↓ open down ↓ |
154 lines elided |
↑ open up ↑ |
155 155 static cond_t shutdown_cv;
156 156 static int shutting_down = 0;
157 157 static thread_t mcap_tid;
158 158 static FILE *debug_log_fp = NULL;
159 159 static uint64_t zone_rss_cap; /* RSS cap(KB) */
160 160 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
161 161 static boolean_t skip_vmusage = B_FALSE;
162 162 static boolean_t skip_pageout = B_FALSE;
163 163 static boolean_t skip_pf_throttle = B_FALSE;
164 164
165 -static zlog_t *logp;
166 -
167 165 static int64_t check_suspend();
168 166 static void get_mcap_tunables();
169 167
170 168 /*
171 169 * Structure to hold current state about a process address space that we're
172 170 * working on.
173 171 */
174 172 typedef struct {
175 173 int pr_curr; /* the # of the mapping we're working on */
176 174 int pr_nmap; /* number of mappings in address space */
177 175 prmap_t *pr_mapp; /* process's map array */
178 176 } proc_map_t;
179 177
180 178 typedef struct zsd_vmusage64 {
181 179 id_t vmu_zoneid;
182 180 uint_t vmu_type;
183 181 id_t vmu_id;
184 182 /*
185 183 * An amd64 kernel will align the following uint64_t members, but a
186 184 * 32bit i386 process will not without help.
187 185 */
188 186 int vmu_align_next_members_on_8_bytes;
189 187 uint64_t vmu_rss_all;
190 188 uint64_t vmu_rss_private;
191 189 uint64_t vmu_rss_shared;
192 190 uint64_t vmu_swap_all;
193 191 uint64_t vmu_swap_private;
194 192 uint64_t vmu_swap_shared;
195 193 } zsd_vmusage64_t;
196 194
197 195 /*
198 196 * Output a debug log message.
199 197 */
200 198 /*PRINTFLIKE1*/
201 199 static void
202 200 debug(char *fmt, ...)
203 201 {
204 202 va_list ap;
205 203
206 204 if (debug_log_fp == NULL)
207 205 return;
208 206
209 207 va_start(ap, fmt);
210 208 (void) vfprintf(debug_log_fp, fmt, ap);
211 209 va_end(ap);
212 210 (void) fflush(debug_log_fp);
213 211 }
214 212
215 213 /*
216 214 * Like sleep(3C) but can be interupted by cond_signal which is posted when
217 215 * we're shutting down the mcap thread.
218 216 */
219 217 static void
220 218 sleep_shutdown(int secs)
221 219 {
222 220 timestruc_t to;
223 221
224 222 to.tv_sec = secs;
225 223 to.tv_nsec = 0;
226 224
227 225 (void) mutex_lock(&shutdown_mx);
228 226 if (!shutting_down)
229 227 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
230 228 (void) mutex_unlock(&shutdown_mx);
231 229 }
232 230
233 231 static boolean_t
234 232 proc_issystem(pid_t pid)
235 233 {
236 234 char pc_clname[PC_CLNMSZ];
237 235
238 236 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
239 237 PC_KY_NULL) != -1)
240 238 return (strcmp(pc_clname, "SYS") == 0);
241 239
242 240 return (B_TRUE);
243 241 }
244 242
/*
 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 * Failures are silent except for a debug log entry; this is a best-effort
 * notification hook, not a critical path.
 */
static void
run_over_cmd()
{
	int ctfd;		/* process contract template fd */
	int err;
	pid_t childpid;
	siginfo_t info;
	ctid_t ct;

	/*
	 * Before we enter the zone, we need to create a new process contract
	 * for the child, as required by zone_enter().
	 */
	if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
		return;
	/*
	 * Make the child's contract non-critical and non-informative, fatal
	 * only on hardware error, and scoped to the child's process group.
	 */
	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
	    ct_tmpl_activate(ctfd) != 0) {
		(void) close(ctfd);
		return;
	}

	childpid = fork();
	switch (childpid) {
	case -1:	/* fork failed; deactivate template and give up */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		break;
	case 0: /* Child */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		if (zone_enter(zid) == -1)
			_exit(errno);
		err = system(over_cmd);
		_exit(err);
		break;
	default: /* Parent */
		/* Pick up the contract created by the fork so we can drop it */
		if (contract_latest(&ct) == -1)
			ct = -1;
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		err = waitid(P_PID, childpid, &info, WEXITED);
		(void) contract_abandon_id(ct);
		if (err == -1 || info.si_status != 0)
			debug("over_cmd failed");
		break;
	}
}
298 296
299 297 /*
300 298 * Get the next mapping.
301 299 */
302 300 static prmap_t *
303 301 nextmapping(proc_map_t *pmp)
304 302 {
305 303 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
306 304 return (NULL);
307 305
308 306 return (&pmp->pr_mapp[pmp->pr_curr++]);
309 307 }
310 308
/*
 * Initialize the proc_map_t to access the first mapping of an address space.
 * Reads the process's /proc map file into a freshly allocated array and
 * returns the first mapping (via nextmapping), or NULL on any failure.
 * The caller owns cur->pr_mapp and must free() it.
 */
static prmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
	int fd;
	int res;
	struct stat st;
	char pathbuf[MAXPATHLEN];

	bzero(pmp, sizeof (proc_map_t));
	pmp->pr_nmap = -1;

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
		return (NULL);

redo:
	errno = 0;
	if (fstat(fd, &st) != 0)
		goto done;

	if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		goto done;
	}
	(void) bzero(pmp->pr_mapp, st.st_size);

	errno = 0;
	if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
		free(pmp->pr_mapp);
		pmp->pr_mapp = NULL;
		/*
		 * A partial read or E2BIG suggests the map changed size
		 * between fstat and pread (the address space is live), so
		 * restat and retry; any other failure is terminal.
		 */
		if (res > 0 || errno == E2BIG) {
			goto redo;
		} else {
			debug("pid %ld cannot read xmap\n", pid);
			goto done;
		}
	}

	pmp->pr_nmap = st.st_size / sizeof (prmap_t);

done:
	(void) close(fd);
	/* On failure pr_mapp is NULL, so nextmapping() yields NULL here. */
	return (nextmapping(pmp));
}
358 356
359 357 /*
360 358 * Attempt to invalidate the entire mapping from within the given process's
361 359 * address space. May return nonzero with errno as:
362 360 * ESRCH - process not found
363 361 * ENOMEM - segment not found
364 362 * EINVAL - mapping exceeds a single segment
365 363 */
366 364 static int
367 365 pageout_mapping(pid_t pid, prmap_t *pmp)
368 366 {
369 367 int res;
370 368
371 369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
372 370 return (0);
373 371
374 372 errno = 0;
375 373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
376 374 pmp->pr_size);
377 375
378 376 return (res);
379 377 }
380 378
|
↓ open down ↓ |
204 lines elided |
↑ open up ↑ |
/*
 * Work through a process paging out mappings until the whole address space was
 * examined or the excess is < 0. Return our estimate of the updated excess.
 *
 * "excess" is the number of KB the zone is over its cap; each mapping
 * invalidation is credited against it using the delta in the process's
 * psinfo rss. Returns 0 immediately if we are shutting down.
 */
static int64_t
pageout_process(pid_t pid, int64_t excess)
{
	int psfd;		/* fd for the process's psinfo file */
	prmap_t *pmap;
	proc_map_t cur;
	int64_t sum_d_rss, d_rss;	/* cumulative / per-step rss delta */
	int64_t old_rss;
	int map_cnt;
	psinfo_t psinfo;
	char pathbuf[MAXPATHLEN];

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
	    pid);
	if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
		return (excess);

	cur.pr_mapp = NULL;

	if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
		goto done;

	old_rss = (int64_t)psinfo.pr_rssize;
	map_cnt = 0;

	/* If unscannable, skip it. */
	if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
		debug("pid %ld: system process, skipping %s\n",
		    pid, psinfo.pr_psargs);
		goto done;
	}

	/* If tiny RSS (16KB), skip it. */
	if (old_rss <= 16) {
		debug("pid %ld: skipping, RSS %lldKB %s\n",
		    pid, old_rss, psinfo.pr_psargs);
		goto done;
	}

	/* Get segment residency information. */
	pmap = init_map(&cur, pid);

	/* Skip process if it has no mappings. */
	if (pmap == NULL) {
		debug("pid %ld: map unreadable; ignoring\n", pid);
		goto done;
	}

	debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
	    pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);

	/*
	 * Within the process's address space, attempt to page out mappings.
	 */
	sum_d_rss = 0;
	while (excess > 0 && pmap != NULL && !shutting_down) {
		/* invalidate the entire mapping */
		if (pageout_mapping(pid, pmap) < 0)
			debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
			    pid, (void *)pmap->pr_vaddr,
			    (long)pmap->pr_size / 1024L, errno);

		map_cnt++;

		/*
		 * Re-check the process rss and get the delta.
		 */
		if (pread(psfd, &psinfo, sizeof (psinfo), 0)
		    != sizeof (psinfo)) {
			/*
			 * Can't re-read psinfo (process presumably exited);
			 * credit its last-seen rss against the excess.
			 */
			excess -= old_rss;
			goto done;
		}

		d_rss = (int64_t)psinfo.pr_rssize - old_rss;
		old_rss = (int64_t)psinfo.pr_rssize;
		sum_d_rss += d_rss;

		/*
		 * d_rss hopefully should be negative (or 0 if nothing
		 * invalidated) but can be positive if more got paged in.
		 */
		excess += d_rss;

		if (excess <= 0) {
			debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
			    "excess %lldKB\n", pid, map_cnt,
			    (unsigned long long)sum_d_rss, (long long)excess);
			map_cnt = 0;

			/*
			 * If we're actually under, this will suspend checking
			 * in the middle of this process's address space.
			 */
			excess = check_suspend();
			if (shutting_down)
				goto done;

			/*
			 * since we might have suspended, re-read process's rss
			 */
			if (pread(psfd, &psinfo, sizeof (psinfo), 0)
			    != sizeof (psinfo)) {
				excess -= old_rss;
				goto done;
			}

			old_rss = (int64_t)psinfo.pr_rssize;

			debug("pid %ld: resume pageout; excess %lld\n", pid,
			    (long long)excess);
			sum_d_rss = 0;
		}

		pmap = nextmapping(&cur);
	}

	debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
	    pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);

done:
	if (cur.pr_mapp != NULL)
		free(cur.pr_mapp);

	(void) close(psfd);

	if (shutting_down)
		return (0);

	return (excess);
}
515 513
/*
 * Get the zone's RSS data, in KB. Returns 0 on shutdown or error.
 *
 * Updates the file-scope fast_rss/accurate_rss/scale_rss/prev_fast_rss
 * globals (kept global for mdb visibility — see their declarations).
 */
static uint64_t
get_mem_info()
{
	uint64_t n = 1;			/* in/out result count for vmusage */
	zsd_vmusage64_t buf;
	uint64_t tmp_rss;		/* unscaled fast rss, saved for later */
	DIR *pdir = NULL;
	struct dirent *dent;

	/*
	 * Start by doing the fast, cheap RSS calculation using the rss value
	 * in psinfo_t. Because that's per-process, it can lead to double
	 * counting some memory and overestimating how much is being used, but
	 * as long as that's not over the cap, then we don't need do the
	 * expensive calculation.
	 *
	 * If we have to do the expensive calculation, we remember the scaling
	 * factor so that we can try to use that on subsequent iterations for
	 * the fast rss.
	 */
	if (shutting_down)
		return (0);

	if ((pdir = opendir(zoneproc)) == NULL)
		return (0);

	accurate_rss = 0;
	fast_rss = 0;
	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
		pid_t pid;
		int psfd;
		int64_t rss;
		char pathbuf[MAXPATHLEN];
		psinfo_t psinfo;

		if (strcmp(".", dent->d_name) == 0 ||
		    strcmp("..", dent->d_name) == 0)
			continue;

		/* skip the kernel (0) and init (1) */
		pid = atoi(dent->d_name);
		if (pid == 0 || pid == 1)
			continue;

		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
		    zoneproc, pid);

		/* Unreadable psinfo contributes 0; the process may have gone */
		rss = 0;
		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
			    sizeof (psinfo))
				rss = (int64_t)psinfo.pr_rssize;

			(void) close(psfd);
		}

		fast_rss += rss;
	}

	(void) closedir(pdir);

	if (shutting_down)
		return (0);

	debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
	    scale_rss, prev_fast_rss);

	/* see if we can get by with a scaled fast rss */
	tmp_rss = fast_rss;
	if (scale_rss > 1 && prev_fast_rss > 0) {
		/*
		 * Only scale the fast value if it hasn't ballooned too much
		 * to trust.
		 */
		if (fast_rss / prev_fast_rss < 2) {
			fast_rss /= scale_rss;
			debug("scaled fast rss: %lluKB\n", fast_rss);
		}
	}

	if (fast_rss <= zone_rss_cap || skip_vmusage) {
		uint64_t zone_rss_bytes;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
		return (fast_rss);
	}

	buf.vmu_id = zid;

	/* get accurate usage (cached data may be up to 5 seconds old) */
	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
	    (uintptr_t)&buf, (uintptr_t)&n) != 0) {
		debug("vmusage failed\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (n > 1) {
		/* This should never happen */
		debug("vmusage returned more than one result\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (buf.vmu_id != zid) {
		/* This should never happen */
		debug("vmusage returned the incorrect zone\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	accurate_rss = buf.vmu_rss_all / 1024;

	/* calculate scaling factor to use for fast_rss from now on */
	if (accurate_rss > 0) {
		scale_rss = fast_rss / accurate_rss;
		debug("new scaling factor: %llu\n", scale_rss);
		/* remember the fast rss when we had to get the accurate rss */
		prev_fast_rss = tmp_rss;
	}

	debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
	    scale_rss, prev_fast_rss);
	return (accurate_rss);
}
645 643
646 644 /*
647 645 * Needed to read the zones physical-memory-cap rctl.
648 646 */
649 647 static struct ps_prochandle *
650 648 grab_zone_proc()
651 649 {
652 650 DIR *dirp;
653 651 struct dirent *dentp;
654 652 struct ps_prochandle *ph = NULL;
655 653 int tmp;
656 654
657 655 if ((dirp = opendir(zoneproc)) == NULL)
658 656 return (NULL);
659 657
660 658 while (!shutting_down && (dentp = readdir(dirp))) {
661 659 int pid;
662 660
663 661 if (strcmp(".", dentp->d_name) == 0 ||
664 662 strcmp("..", dentp->d_name) == 0)
665 663 continue;
666 664
667 665 pid = atoi(dentp->d_name);
668 666 /* attempt to grab process */
669 667 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
670 668 if (Psetflags(ph, PR_RLC) == 0) {
671 669 if (Pcreate_agent(ph) == 0) {
672 670 (void) closedir(dirp);
673 671 return (ph);
674 672 }
675 673 }
676 674 Prelease(ph, 0);
677 675 }
678 676 }
679 677
680 678 (void) closedir(dirp);
681 679 return (NULL);
682 680 }
683 681
684 682 static uint64_t
685 683 get_zone_cap()
686 684 {
687 685 rctlblk_t *rblk;
688 686 uint64_t mcap;
689 687 struct ps_prochandle *ph;
690 688
691 689 if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
692 690 return (UINT64_MAX);
693 691
694 692 if ((ph = grab_zone_proc()) == NULL) {
695 693 free(rblk);
696 694 return (UINT64_MAX);
697 695 }
698 696
699 697 if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
700 698 RCTL_FIRST)) {
701 699 Pdestroy_agent(ph);
702 700 Prelease(ph, 0);
703 701 free(rblk);
704 702 return (UINT64_MAX);
705 703 }
706 704
707 705 Pdestroy_agent(ph);
708 706 Prelease(ph, 0);
709 707
710 708 mcap = rctlblk_get_value(rblk);
711 709 free(rblk);
712 710 return (mcap);
713 711 }
714 712
/*
 * check_suspend is invoked at the beginning of every pass through the process
 * list or after we've paged out enough so that we think the excess is under
 * the cap. The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap. The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * Depending on the percentage of penetration of the zone's rss into the
 * cap we sleep for longer or shorter amounts. This reduces the impact of this
 * work on the system, which is important considering that each zone will be
 * monitoring its rss.
 *
 * Returns the excess (KB over the cap, plus a 5% addon) when over the cap,
 * or 0 when shutting down.
 */
static int64_t
check_suspend()
{
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;		/* 5% of cap; extra pageout target */
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS(KB) */

		/*
		 * Check if the debug log files exists and enable or disable
		 * debug.
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			/* Convert bytes to KB; UINT64_MAX means "no cap". */
			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			/*
			 * We allow the memory cap tunables to be changed on
			 * the fly.
			 */
			get_mcap_tunables();

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info();

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If necessary, updates stats.
		 */

		/*
		 * If it looks like we did some paging out since last over the
		 * cap then update the kstat so we can approximate how much was
		 * paged out.
		 */
		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			uint64_t n = 1;

			/* Increment "nover" kstat. */
			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

			if (!skip_pf_throttle) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up. If we are persistently over the
				 * cap the delay ramps up to a max of 2000usecs.
				 * Note that for delays less than 1 tick
				 * (i.e. all of these) we busy-wait in as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
				    &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, then we want to
			 * page out a little extra instead of stopping
			 * right at the cap. To do this we add 5% to
			 * the excess so that pageout_process will work
			 * a little longer before stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage. Also, scale the acceptable age of
		 * cached results from vm_getusage. We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
		} else {
			sleep_time = 30;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
	}

	return (0);
}
906 904
907 905 static void
908 906 get_mcap_tunables()
909 907 {
910 908 zone_dochandle_t handle;
911 909 struct zone_attrtab attr;
912 910
913 911 over_cmd[0] = '\0';
914 912 if ((handle = zonecfg_init_handle()) == NULL)
915 913 return;
916 914
917 915 if (zonecfg_get_handle(zone_name, handle) != Z_OK)
918 916 goto done;
919 917
920 918 /* Reset to defaults in case rebooting and settings have changed */
921 919 over_cmd[0] = '\0';
922 920 skip_vmusage = B_FALSE;
923 921 skip_pageout = B_FALSE;
924 922 skip_pf_throttle = B_FALSE;
925 923
926 924 if (zonecfg_setattrent(handle) != Z_OK)
927 925 goto done;
928 926 while (zonecfg_getattrent(handle, &attr) == Z_OK) {
929 927 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
930 928 (void) strlcpy(over_cmd, attr.zone_attr_value,
931 929 sizeof (over_cmd));
932 930 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
933 931 if (strcmp("true", attr.zone_attr_value) == 0)
934 932 skip_vmusage = B_TRUE;
935 933 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
936 934 if (strcmp("true", attr.zone_attr_value) == 0)
937 935 skip_pageout = B_TRUE;
938 936 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
939 937 if (strcmp("true", attr.zone_attr_value) == 0)
940 938 skip_pf_throttle = B_TRUE;
941 939 }
942 940 }
943 941 (void) zonecfg_endattrent(handle);
944 942
945 943 done:
946 944 zonecfg_fini_handle(handle);
947 945 }
948 946
949 947 /* ARGSUSED */
950 948 static int
951 949 chk_proc_fs(void *data, const char *spec, const char *dir,
952 950 const char *fstype, const char *opt)
953 951 {
954 952 if (fstype != NULL && strcmp(fstype, "proc") == 0)
955 953 *((boolean_t *)data) = B_TRUE;
956 954
957 955 return (0);
958 956 }
959 957
960 958 static boolean_t
961 959 has_proc()
962 960 {
963 961 brand_handle_t bh;
964 962 boolean_t fnd = B_FALSE;
965 963
966 964 if ((bh = brand_open(brand_name)) != NULL) {
967 965 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
968 966 }
969 967
970 968 brand_close(bh);
971 969 return (fnd);
972 970 }
973 971
/*
 * We run this loop for brands with no /proc to simply update the RSS, using
 * the cheap GZ /proc data, every 5 minutes.
 *
 * Runs until shutting_down is set; returns early if the global-zone /proc
 * cannot be opened.
 */
static void
no_procfs()
{
	DIR *pdir = NULL;
	struct dirent *dent;
	uint64_t zone_rss_bytes;

	/* Give the zone time to start up before the first scan. */
	(void) sleep_shutdown(30);
	while (!shutting_down) {
		/*
		 * Just do the fast, cheap RSS calculation using the rss value
		 * in psinfo_t. Because that's per-process, it can lead to
		 * double counting some memory and overestimating how much is
		 * being used. Since there is no /proc in the zone, we use the
		 * GZ /proc and check for the correct zone.
		 */
		if ((pdir = opendir("/proc")) == NULL)
			return;

		fast_rss = 0;
		while (!shutting_down && (dent = readdir(pdir)) != NULL) {
			pid_t pid;
			int psfd;
			int64_t rss;
			char pathbuf[MAXPATHLEN];
			psinfo_t psinfo;

			if (strcmp(".", dent->d_name) == 0 ||
			    strcmp("..", dent->d_name) == 0)
				continue;

			/* skip the kernel (0) and init (1) */
			pid = atoi(dent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			(void) snprintf(pathbuf, sizeof (pathbuf),
			    "/proc/%d/psinfo", pid);

			rss = 0;
			if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
				if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
				    sizeof (psinfo)) {
					/* only count this zone's processes */
					if (psinfo.pr_zoneid == zid)
						rss = (int64_t)psinfo.pr_rssize;
				}

				(void) close(psfd);
			}

			fast_rss += rss;
		}

		(void) closedir(pdir);

		if (shutting_down)
			return;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);

		/* re-scan every 5 minutes */
		(void) sleep_shutdown(300);
	}
}
1042 1040
/*
 * Thread that checks zone's memory usage and when over the cap, goes through
 * the zone's process list trying to pageout processes to get under the cap.
 */
static void
mcap_zone()
{
	DIR *pdir = NULL;
	int64_t excess;		/* amount (KB) the zone is over its cap */

	debug("thread startup\n");

	/* Load the phys-mcap tuning attributes from the zone config. */
	get_mcap_tunables();

	/*
	 * If the zone has no /proc filesystem, we can't use the fast algorithm
	 * to check RSS or pageout any processes. All we can do is periodically
	 * update its RSS kstat using the expensive syscall.
	 */
	if (!has_proc()) {
		no_procfs();
		debug("thread shutdown\n");
		return;
	}

	/*
	 * When first starting it is likely lots of other zones are starting
	 * too because the system is booting. Since we just started the zone
	 * we're not worried about being over the cap right away, so we let
	 * things settle a bit and tolerate some older data here to minimize
	 * the load on the system.
	 */
	(void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */

	/* Wait until zone's /proc is mounted */
	while (!shutting_down) {
		struct stat st;

		if (stat(zoneproc, &st) == 0 &&
		    strcmp(st.st_fstype, "proc") == 0)
			break;
		sleep_shutdown(5);
	}

	/* Open zone's /proc and walk entries; retry until it succeeds. */
	while (!shutting_down) {
		if ((pdir = opendir(zoneproc)) != NULL)
			break;
		sleep_shutdown(5);
	}

	while (!shutting_down) {
		struct dirent *dirent;

		/* Wait until we've gone over the cap. */
		excess = check_suspend();

		debug("starting to scan, excess %lldk\n", (long long)excess);

		/*
		 * If the admin configured an over-capacity command, run it
		 * first and re-sample RSS; the command may have already
		 * brought the zone back under its cap.
		 */
		if (over_cmd[0] != '\0') {
			uint64_t zone_rss;	/* total RSS(KB) */

			debug("run phys_mcap_cmd: %s\n", over_cmd);
			run_over_cmd();

			zone_rss = get_mem_info();
			excess = zone_rss - zone_rss_cap;
			debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
			    zone_rss, zone_rss_cap, excess);
			if (excess <= 0)
				continue;
		}

		/*
		 * Walk the zone's processes, paging each out until the
		 * excess is gone or there are no more entries to try.
		 */
		while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
			pid_t pid;

			if (strcmp(".", dirent->d_name) == 0 ||
			    strcmp("..", dirent->d_name) == 0)
				continue;

			pid = atoi(dirent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			if (skip_pageout)
				(void) sleep_shutdown(2);
			else
				excess = pageout_process(pid, excess);

			if (excess <= 0) {
				debug("apparently under; excess %lld\n",
				    (long long)excess);
				/* Double check the current excess */
				excess = check_suspend();
			}
		}

		debug("process pass done; excess %lld\n", (long long)excess);
		rewinddir(pdir);

		/* If pageout is disabled, rest between passes to avoid spinning. */
		if (skip_pageout)
			(void) sleep_shutdown(120);
	}

	if (pdir != NULL)
		(void) closedir(pdir);
	debug("thread shutdown\n");
}
1151 1149
1152 1150 void
1153 1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1154 1152 {
1155 1153 int res;
1156 1154
1157 1155 shutting_down = 0;
1158 1156 zid = id;
1159 - logp = zlogp;
1160 1157
1161 1158 /* all but the lx brand currently use /proc */
1162 1159 if (strcmp(brand_name, "lx") == 0) {
1163 1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1164 1161 "%s/root/native/proc", zonepath);
1165 1162 } else {
1166 1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1167 1164 zonepath);
1168 1165 }
1169 1166
1170 1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1171 1168 zonepath);
1172 1169
1173 1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1174 1171 &mcap_tid);
1175 1172 if (res != 0) {
1176 1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1177 1174 res);
1178 1175 mcap_tid = 0;
1179 1176 }
1180 1177 }
1181 1178
1182 1179 void
1183 1180 destroy_mcap_thread()
1184 1181 {
1185 1182 if (mcap_tid != 0) {
1186 1183 shutting_down = 1;
1187 1184 (void) cond_signal(&shutdown_cv);
1188 1185 (void) thr_join(mcap_tid, NULL, NULL);
1189 1186 mcap_tid = 0;
1190 1187 }
1191 1188 }
|
↓ open down ↓ |
22 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX