1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 */
25
26 /*
27 * This file implements the code which runs a thread inside zoneadmd to cap
28 * the associated zone's physical memory. A thread to do this is started
29 * when the zone boots and is halted when the zone shuts down.
30 *
31 * Because of the way that the VM system is currently implemented, there is no
32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
34 * memory cap. Instead, we implement a soft physical memory cap which looks
35 * at the zone's overall rss and once it is over the cap, works from the top
36 * down (zone to process to page), looking at zone processes, to determine
37 * what to try to pageout to get the zone under its memory cap.
38 *
39 * The code uses the fast, cheap, but potentially very inaccurate sum of the
40 * rss values from psinfo_t to first approximate the zone's rss and will
41 * fallback to the vm_getusage syscall to determine the zone's rss if needed.
42 * It then checks the rss against the zone's zone.max-physical-memory rctl.
43 * Once the zone goes over its cap, then this thread will work through the
44 * zone's /proc process list, Pgrab-bing each process and stepping through the
45 * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
46 * to pageout pages, until the zone is again under its cap.
47 *
48 * Although zone memory capping is implemented as a soft cap by this user-level
49 * thread, the interfaces around memory caps that are exposed to the user are
50 * the standard ones; an rctl and kstats. This thread uses the rctl value
51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 * If the implementation ever moves into the kernel, these exposed interfaces
53 * do not need to change.
54 *
55 * The thread adaptively sleeps, periodically checking the state of the
56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 * more often to check the zone's status. Once the zone is over the cap,
58 * the thread will work to pageout until the zone is under the cap, as shown
59 * by updated vm_usage data.
60 *
61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 * segvn_softunlock -^
65 * On SPARC there is an additional code path which may make this data
66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 * do more with the data on active pages.
69 *
70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 * cause the thread to start logging its actions into that file (it may take
72 * a minute or two if the thread is currently sleeping). Removing that
73 * file will cause logging to stop.
74 */
75
76 #include <sys/mman.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #include <assert.h>
81 #include <errno.h>
82 #include <fcntl.h>
83 #include <libproc.h>
84 #include <limits.h>
85 #include <procfs.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <strings.h>
89 #include <time.h>
90 #include <unistd.h>
91 #include <sys/priocntl.h>
92 #include <dirent.h>
93 #include <zone.h>
94 #include <libzonecfg.h>
95 #include <thread.h>
96 #include <values.h>
97 #include <sys/vm_usage.h>
98 #include <sys/resource.h>
99 #include <sys/debug.h>
100 #include <synch.h>
101 #include <wait.h>
102 #include <libcontract.h>
103 #include <libcontract_priv.h>
104 #include <sys/contract/process.h>
105 #include "zoneadmd.h"
106
107 /* round up to next y = 2^n */
108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109
110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111
112 /*
113 * zonecfg attribute tunables for memory capping.
114 * phys-mcap-cmd
115 * type: string
116 * specifies a command that can be run when over the cap
117 * phys-mcap-no-vmusage
118 * type: boolean
119 * true disables vm_getusage and just uses zone's proc. rss sum
120 * phys-mcap-no-pageout
121 * type: boolean
122 * true disables pageout when over
123 * phys-mcap-no-pf-throttle
124 * type: boolean
125 * true disables page fault throttling when over
126 */
127 #define TUNE_CMD "phys-mcap-cmd"
128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131
132 /*
133 * The large mapping value was derived empirically by seeing that mappings
134 * much bigger than 16mb sometimes take a relatively long time to invalidate
135 * (significant fraction of a second).
136 */
137 #define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 #define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 #define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140
141 /*
142 * These are only used in get_mem_info but global. We always need scale_rss and
143 * prev_fast_rss to be persistent but we also have the other two global so we
144 * can easily see these with mdb.
145 */
146 uint64_t scale_rss = 0;
147 uint64_t prev_fast_rss = 0;
148 uint64_t fast_rss = 0;
149 uint64_t accurate_rss = 0;
150
151 static char zoneproc[MAXPATHLEN];
152 static char debug_log[MAXPATHLEN];
153 static zoneid_t zid;
154 static mutex_t shutdown_mx;
155 static cond_t shutdown_cv;
156 static int shutting_down = 0;
157 static thread_t mcap_tid;
158 static FILE *debug_log_fp = NULL;
159 static uint64_t zone_rss_cap; /* RSS cap(KB) */
160 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
161 static boolean_t skip_vmusage = B_FALSE;
162 static boolean_t skip_pageout = B_FALSE;
163 static boolean_t skip_pf_throttle = B_FALSE;
164
165 static zlog_t *logp;
166
167 static int64_t check_suspend();
168 static void get_mcap_tunables();
169
170 /*
171 * Structure to hold current state about a process address space that we're
172 * working on.
173 */
174 typedef struct {
175 int pr_curr; /* the # of the mapping we're working on */
176 int pr_nmap; /* number of mappings in address space */
177 prmap_t *pr_mapp; /* process's map array */
178 } proc_map_t;
179
/*
 * 64-bit layout of the kernel's vmusage result buffer, with explicit
 * padding so a 32-bit zoneadmd and a 64-bit kernel agree on member offsets.
 */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;
196
197 /*
198 * Output a debug log message.
199 */
200 /*PRINTFLIKE1*/
201 static void
202 debug(char *fmt, ...)
203 {
204 va_list ap;
205
206 if (debug_log_fp == NULL)
207 return;
208
209 va_start(ap, fmt);
210 (void) vfprintf(debug_log_fp, fmt, ap);
211 va_end(ap);
212 (void) fflush(debug_log_fp);
213 }
214
215 /*
216 * Like sleep(3C) but can be interupted by cond_signal which is posted when
217 * we're shutting down the mcap thread.
218 */
219 static void
220 sleep_shutdown(int secs)
221 {
222 timestruc_t to;
223
224 to.tv_sec = secs;
225 to.tv_nsec = 0;
226
227 (void) mutex_lock(&shutdown_mx);
228 if (!shutting_down)
229 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
230 (void) mutex_unlock(&shutdown_mx);
231 }
232
233 static boolean_t
234 proc_issystem(pid_t pid)
235 {
236 char pc_clname[PC_CLNMSZ];
237
238 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
239 PC_KY_NULL) != -1)
240 return (strcmp(pc_clname, "SYS") == 0);
241
242 return (B_TRUE);
243 }
244
245 /*
246 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
247 */
248 static void
249 run_over_cmd()
250 {
251 int ctfd;
252 int err;
253 pid_t childpid;
254 siginfo_t info;
255 ctid_t ct;
256
257 /*
258 * Before we enter the zone, we need to create a new process contract
259 * for the child, as required by zone_enter().
260 */
261 if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
262 return;
263 if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
264 ct_tmpl_set_informative(ctfd, 0) != 0 ||
265 ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
266 ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
267 ct_tmpl_activate(ctfd) != 0) {
268 (void) close(ctfd);
269 return;
270 }
271
272 childpid = fork();
273 switch (childpid) {
274 case -1:
275 (void) ct_tmpl_clear(ctfd);
276 (void) close(ctfd);
277 break;
278 case 0: /* Child */
279 (void) ct_tmpl_clear(ctfd);
280 (void) close(ctfd);
281 if (zone_enter(zid) == -1)
282 _exit(errno);
283 err = system(over_cmd);
284 _exit(err);
285 break;
286 default: /* Parent */
287 if (contract_latest(&ct) == -1)
288 ct = -1;
289 (void) ct_tmpl_clear(ctfd);
290 (void) close(ctfd);
291 err = waitid(P_PID, childpid, &info, WEXITED);
292 (void) contract_abandon_id(ct);
293 if (err == -1 || info.si_status != 0)
294 debug("over_cmd failed");
295 break;
296 }
297 }
298
299 /*
300 * Get the next mapping.
301 */
302 static prmap_t *
303 nextmapping(proc_map_t *pmp)
304 {
305 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
306 return (NULL);
307
308 return (&pmp->pr_mapp[pmp->pr_curr++]);
309 }
310
311 /*
312 * Initialize the proc_map_t to access the first mapping of an address space.
313 */
314 static prmap_t *
315 init_map(proc_map_t *pmp, pid_t pid)
316 {
317 int fd;
318 int res;
319 struct stat st;
320 char pathbuf[MAXPATHLEN];
321
322 bzero(pmp, sizeof (proc_map_t));
323 pmp->pr_nmap = -1;
324
325 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
326 if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
327 return (NULL);
328
329 redo:
330 errno = 0;
331 if (fstat(fd, &st) != 0)
332 goto done;
333
334 if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
335 debug("cannot malloc() %ld bytes for xmap", st.st_size);
336 goto done;
337 }
338 (void) bzero(pmp->pr_mapp, st.st_size);
339
340 errno = 0;
341 if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
342 free(pmp->pr_mapp);
343 pmp->pr_mapp = NULL;
344 if (res > 0 || errno == E2BIG) {
345 goto redo;
346 } else {
347 debug("pid %ld cannot read xmap\n", pid);
348 goto done;
349 }
350 }
351
352 pmp->pr_nmap = st.st_size / sizeof (prmap_t);
353
354 done:
355 (void) close(fd);
356 return (nextmapping(pmp));
357 }
358
359 /*
360 * Attempt to invalidate the entire mapping from within the given process's
361 * address space. May return nonzero with errno as:
362 * ESRCH - process not found
363 * ENOMEM - segment not found
364 * EINVAL - mapping exceeds a single segment
365 */
366 static int
367 pageout_mapping(pid_t pid, prmap_t *pmp)
368 {
369 int res;
370
371 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
372 return (0);
373
374 errno = 0;
375 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
376 pmp->pr_size);
377
378 return (res);
379 }
380
381 /*
382 * Work through a process paging out mappings until the whole address space was
383 * examined or the excess is < 0. Return our estimate of the updated excess.
384 */
385 static int64_t
386 pageout_process(pid_t pid, int64_t excess)
387 {
388 int psfd;
389 prmap_t *pmap;
390 proc_map_t cur;
391 int res;
392 int64_t sum_d_rss, d_rss;
393 int64_t old_rss;
394 int map_cnt;
395 psinfo_t psinfo;
396 char pathbuf[MAXPATHLEN];
397
398 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
399 pid);
400 if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
401 return (excess);
402
403 cur.pr_mapp = NULL;
404
405 if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
406 goto done;
407
408 old_rss = (int64_t)psinfo.pr_rssize;
409 map_cnt = 0;
410
411 /* If unscannable, skip it. */
412 if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
413 debug("pid %ld: system process, skipping %s\n",
414 pid, psinfo.pr_psargs);
415 goto done;
416 }
417
418 /* If tiny RSS (16KB), skip it. */
419 if (old_rss <= 16) {
420 debug("pid %ld: skipping, RSS %lldKB %s\n",
421 pid, old_rss, psinfo.pr_psargs);
422 goto done;
423 }
424
425 /* Get segment residency information. */
426 pmap = init_map(&cur, pid);
427
428 /* Skip process if it has no mappings. */
429 if (pmap == NULL) {
430 debug("pid %ld: map unreadable; ignoring\n", pid);
431 goto done;
432 }
433
434 debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
435 pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
436
437 /*
438 * Within the process's address space, attempt to page out mappings.
439 */
440 sum_d_rss = 0;
441 while (excess > 0 && pmap != NULL && !shutting_down) {
442 /* invalidate the entire mapping */
443 if ((res = pageout_mapping(pid, pmap)) < 0)
444 debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
445 pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);
446
447 map_cnt++;
448
449 /*
450 * Re-check the process rss and get the delta.
451 */
452 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
453 != sizeof (psinfo)) {
454 excess -= old_rss;
455 goto done;
456 }
457
458 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
459 old_rss = (int64_t)psinfo.pr_rssize;
460 sum_d_rss += d_rss;
461
462 /*
463 * d_rss hopefully should be negative (or 0 if nothing
464 * invalidated) but can be positive if more got paged in.
465 */
466 excess += d_rss;
467
468 if (excess <= 0) {
469 debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
470 "excess %lldKB\n", pid, map_cnt,
471 (unsigned long long)sum_d_rss, (long long)excess);
472 map_cnt = 0;
473
474 /*
475 * If we're actually under, this will suspend checking
476 * in the middle of this process's address space.
477 */
478 excess = check_suspend();
479 if (shutting_down)
480 goto done;
481
482 /*
483 * since we might have suspended, re-read process's rss
484 */
485 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
486 != sizeof (psinfo)) {
487 excess -= old_rss;
488 goto done;
489 }
490
491 old_rss = (int64_t)psinfo.pr_rssize;
492
493 debug("pid %ld: resume pageout; excess %lld\n", pid,
494 (long long)excess);
495 sum_d_rss = 0;
496 }
497
498 pmap = nextmapping(&cur);
499 }
500
501 debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
502 pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);
503
504 done:
505 if (cur.pr_mapp != NULL)
506 free(cur.pr_mapp);
507
508 (void) close(psfd);
509
510 if (shutting_down)
511 return (0);
512
513 return (excess);
514 }
515
516 /*
517 * Get the zone's RSS data.
518 */
519 static uint64_t
520 get_mem_info()
521 {
522 uint64_t n = 1;
523 zsd_vmusage64_t buf;
524 uint64_t tmp_rss;
525 DIR *pdir = NULL;
526 struct dirent *dent;
527
528 /*
529 * Start by doing the fast, cheap RSS calculation using the rss value
530 * in psinfo_t. Because that's per-process, it can lead to double
531 * counting some memory and overestimating how much is being used, but
532 * as long as that's not over the cap, then we don't need do the
533 * expensive calculation.
534 *
535 * If we have to do the expensive calculation, we remember the scaling
536 * factor so that we can try to use that on subsequent iterations for
537 * the fast rss.
538 */
539 if (shutting_down)
540 return (0);
541
542 if ((pdir = opendir(zoneproc)) == NULL)
543 return (0);
544
545 accurate_rss = 0;
546 fast_rss = 0;
547 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
548 pid_t pid;
549 int psfd;
550 int64_t rss;
551 char pathbuf[MAXPATHLEN];
552 psinfo_t psinfo;
553
554 if (strcmp(".", dent->d_name) == 0 ||
555 strcmp("..", dent->d_name) == 0)
556 continue;
557
558 pid = atoi(dent->d_name);
559 if (pid == 0 || pid == 1)
560 continue;
561
562 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
563 zoneproc, pid);
564
565 rss = 0;
566 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
567 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
568 sizeof (psinfo))
569 rss = (int64_t)psinfo.pr_rssize;
570
571 (void) close(psfd);
572 }
573
574 fast_rss += rss;
575 }
576
577 (void) closedir(pdir);
578
579 if (shutting_down)
580 return (0);
581
582 debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
583 scale_rss, prev_fast_rss);
584
585 /* see if we can get by with a scaled fast rss */
586 tmp_rss = fast_rss;
587 if (scale_rss > 1 && prev_fast_rss > 0) {
588 /*
589 * Only scale the fast value if it hasn't ballooned too much
590 * to trust.
591 */
592 if (fast_rss / prev_fast_rss < 2) {
593 fast_rss /= scale_rss;
594 debug("scaled fast rss: %lluKB\n", fast_rss);
595 }
596 }
597
598 if (fast_rss <= zone_rss_cap || skip_vmusage) {
599 uint64_t zone_rss_bytes;
600
601 zone_rss_bytes = fast_rss * 1024;
602 /* Use the zone's approx. RSS in the kernel */
603 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
604 return (fast_rss);
605 }
606
607 buf.vmu_id = zid;
608
609 /* get accurate usage (cached data may be up to 5 seconds old) */
610 if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
611 (uintptr_t)&buf, (uintptr_t)&n) != 0) {
612 debug("vmusage failed\n");
613 (void) sleep_shutdown(1);
614 return (0);
615 }
616
617 if (n > 1) {
618 /* This should never happen */
619 debug("vmusage returned more than one result\n");
620 (void) sleep_shutdown(1);
621 return (0);
622 }
623
624 if (buf.vmu_id != zid) {
625 /* This should never happen */
626 debug("vmusage returned the incorrect zone\n");
627 (void) sleep_shutdown(1);
628 return (0);
629 }
630
631 accurate_rss = buf.vmu_rss_all / 1024;
632
633 /* calculate scaling factor to use for fast_rss from now on */
634 if (accurate_rss > 0) {
635 scale_rss = fast_rss / accurate_rss;
636 debug("new scaling factor: %llu\n", scale_rss);
637 /* remember the fast rss when we had to get the accurate rss */
638 prev_fast_rss = tmp_rss;
639 }
640
641 debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
642 scale_rss, prev_fast_rss);
643 return (accurate_rss);
644 }
645
646 /*
647 * Needed to read the zones physical-memory-cap rctl.
648 */
649 static struct ps_prochandle *
650 grab_zone_proc()
651 {
652 DIR *dirp;
653 struct dirent *dentp;
654 struct ps_prochandle *ph = NULL;
655 int tmp;
656
657 if ((dirp = opendir(zoneproc)) == NULL)
658 return (NULL);
659
660 while (!shutting_down && (dentp = readdir(dirp))) {
661 int pid;
662
663 if (strcmp(".", dentp->d_name) == 0 ||
664 strcmp("..", dentp->d_name) == 0)
665 continue;
666
667 pid = atoi(dentp->d_name);
668 /* attempt to grab process */
669 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
670 if (Psetflags(ph, PR_RLC) == 0) {
671 if (Pcreate_agent(ph) == 0) {
672 (void) closedir(dirp);
673 return (ph);
674 }
675 }
676 Prelease(ph, 0);
677 }
678 }
679
680 (void) closedir(dirp);
681 return (NULL);
682 }
683
684 static uint64_t
685 get_zone_cap()
686 {
687 rctlblk_t *rblk;
688 uint64_t mcap;
689 struct ps_prochandle *ph;
690
691 if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
692 return (UINT64_MAX);
693
694 if ((ph = grab_zone_proc()) == NULL) {
695 free(rblk);
696 return (UINT64_MAX);
697 }
698
699 if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
700 RCTL_FIRST)) {
701 Pdestroy_agent(ph);
702 Prelease(ph, 0);
703 free(rblk);
704 return (UINT64_MAX);
705 }
706
707 Pdestroy_agent(ph);
708 Prelease(ph, 0);
709
710 mcap = rctlblk_get_value(rblk);
711 free(rblk);
712 return (mcap);
713 }
714
715 /*
716 * check_suspend is invoked at the beginning of every pass through the process
717 * list or after we've paged out enough so that we think the excess is under
718 * the cap. The purpose is to periodically check the zone's rss and return
719 * the excess when the zone is over the cap. The rest of the time this
720 * function will sleep, periodically waking up to check the current rss.
721 *
722 * Depending on the percentage of penetration of the zone's rss into the
723 * cap we sleep for longer or shorter amounts. This reduces the impact of this
724 * work on the system, which is important considering that each zone will be
725 * monitoring its rss.
726 */
static int64_t
check_suspend()
{
	/* Static state persists across calls from pageout_process/mcap_zone. */
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;		/* 5% of cap; extra pageout target */
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS(KB) */

		/*
		 * Check if the debug log file exists and enable or disable
		 * debug accordingly (touching/removing the file toggles it).
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			/*
			 * We allow the memory cap tunables to be changed on
			 * the fly.
			 */
			get_mcap_tunables();

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info();

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If necessary, update stats.
		 */

		/*
		 * If it looks like we did some paging out since last over the
		 * cap then update the kstat so we can approximate how much was
		 * paged out.
		 */
		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			uint64_t n = 1;

			/* Increment "nover" kstat. */
			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

			if (!skip_pf_throttle) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up. If we are persistently over the
				 * cap the delay ramps up to a max of 2000usecs.
				 * Note that for delays less than 1 tick
				 * (i.e. all of these) we busy-wait in as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
				    &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, then we want to
			 * page out a little extra instead of stopping
			 * right at the cap. To do this we add 5% to
			 * the excess so that pageout_process will work
			 * a little longer before stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage. Also, scale the acceptable age of
		 * cached results from vm_getusage. We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
		} else {
			sleep_time = 30;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
	}

	return (0);
}
906
907 static void
908 get_mcap_tunables()
909 {
910 zone_dochandle_t handle;
911 struct zone_attrtab attr;
912
913 over_cmd[0] = '\0';
914 if ((handle = zonecfg_init_handle()) == NULL)
915 return;
916
917 if (zonecfg_get_handle(zone_name, handle) != Z_OK)
918 goto done;
919
920 /* Reset to defaults in case rebooting and settings have changed */
921 over_cmd[0] = '\0';
922 skip_vmusage = B_FALSE;
923 skip_pageout = B_FALSE;
924 skip_pf_throttle = B_FALSE;
925
926 if (zonecfg_setattrent(handle) != Z_OK)
927 goto done;
928 while (zonecfg_getattrent(handle, &attr) == Z_OK) {
929 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
930 (void) strlcpy(over_cmd, attr.zone_attr_value,
931 sizeof (over_cmd));
932 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
933 if (strcmp("true", attr.zone_attr_value) == 0)
934 skip_vmusage = B_TRUE;
935 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
936 if (strcmp("true", attr.zone_attr_value) == 0)
937 skip_pageout = B_TRUE;
938 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
939 if (strcmp("true", attr.zone_attr_value) == 0)
940 skip_pf_throttle = B_TRUE;
941 }
942 }
943 (void) zonecfg_endattrent(handle);
944
945 done:
946 zonecfg_fini_handle(handle);
947 }
948
949 /* ARGSUSED */
950 static int
951 chk_proc_fs(void *data, const char *spec, const char *dir,
952 const char *fstype, const char *opt)
953 {
954 if (fstype != NULL && strcmp(fstype, "proc") == 0)
955 *((boolean_t *)data) = B_TRUE;
956
957 return (0);
958 }
959
960 static boolean_t
961 has_proc()
962 {
963 brand_handle_t bh;
964 boolean_t fnd = B_FALSE;
965
966 if ((bh = brand_open(brand_name)) != NULL) {
967 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
968 }
969
970 brand_close(bh);
971 return (fnd);
972 }
973
974 /*
975 * We run this loop for brands with no /proc to simply update the RSS, using
976 * the cheap GZ /proc data, every 5 minutes.
977 */
978 static void
979 no_procfs()
980 {
981 DIR *pdir = NULL;
982 struct dirent *dent;
983 uint64_t zone_rss_bytes;
984
985 (void) sleep_shutdown(30);
986 while (!shutting_down) {
987 /*
988 * Just do the fast, cheap RSS calculation using the rss value
989 * in psinfo_t. Because that's per-process, it can lead to
990 * double counting some memory and overestimating how much is
991 * being used. Since there is no /proc in the zone, we use the
992 * GZ /proc and check for the correct zone.
993 */
994 if ((pdir = opendir("/proc")) == NULL)
995 return;
996
997 fast_rss = 0;
998 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
999 pid_t pid;
1000 int psfd;
1001 int64_t rss;
1002 char pathbuf[MAXPATHLEN];
1003 psinfo_t psinfo;
1004
1005 if (strcmp(".", dent->d_name) == 0 ||
1006 strcmp("..", dent->d_name) == 0)
1007 continue;
1008
1009 pid = atoi(dent->d_name);
1010 if (pid == 0 || pid == 1)
1011 continue;
1012
1013 (void) snprintf(pathbuf, sizeof (pathbuf),
1014 "/proc/%d/psinfo", pid);
1015
1016 rss = 0;
1017 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
1018 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
1019 sizeof (psinfo)) {
1020 if (psinfo.pr_zoneid == zid)
1021 rss = (int64_t)psinfo.pr_rssize;
1022 }
1023
1024 (void) close(psfd);
1025 }
1026
1027 fast_rss += rss;
1028 }
1029
1030 (void) closedir(pdir);
1031
1032 if (shutting_down)
1033 return;
1034
1035 zone_rss_bytes = fast_rss * 1024;
1036 /* Use the zone's approx. RSS in the kernel */
1037 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
1038
1039 (void) sleep_shutdown(300);
1040 }
1041 }
1042
1043 /*
1044 * Thread that checks zone's memory usage and when over the cap, goes through
1045 * the zone's process list trying to pageout processes to get under the cap.
1046 */
1047 static void
1048 mcap_zone()
1049 {
1050 DIR *pdir = NULL;
1051 int64_t excess;
1052
1053 debug("thread startup\n");
1054
1055 get_mcap_tunables();
1056
1057 /*
1058 * If the zone has no /proc filesystem, we can't use the fast algorithm
1059 * to check RSS or pageout any processes. All we can do is periodically
1060 * update it's RSS kstat using the expensive sycall.
1061 */
1062 if (!has_proc()) {
1063 no_procfs();
1064 debug("thread shutdown\n");
1065 return;
1066 }
1067
1068 /*
1069 * When first starting it is likely lots of other zones are starting
1070 * too because the system is booting. Since we just started the zone
1071 * we're not worried about being over the cap right away, so we let
1072 * things settle a bit and tolerate some older data here to minimize
1073 * the load on the system.
1074 */
1075 (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
1076
1077 /* Wait until zone's /proc is mounted */
1078 while (!shutting_down) {
1079 struct stat st;
1080
1081 if (stat(zoneproc, &st) == 0 &&
1082 strcmp(st.st_fstype, "proc") == 0)
1083 break;
1084 sleep_shutdown(5);
1085 }
1086
1087 /* Open zone's /proc and walk entries. */
1088 while (!shutting_down) {
1089 if ((pdir = opendir(zoneproc)) != NULL)
1090 break;
1091 sleep_shutdown(5);
1092 }
1093
1094 while (!shutting_down) {
1095 struct dirent *dirent;
1096
1097 /* Wait until we've gone over the cap. */
1098 excess = check_suspend();
1099
1100 debug("starting to scan, excess %lldk\n", (long long)excess);
1101
1102 if (over_cmd[0] != '\0') {
1103 uint64_t zone_rss; /* total RSS(KB) */
1104
1105 debug("run phys_mcap_cmd: %s\n", over_cmd);
1106 run_over_cmd();
1107
1108 zone_rss = get_mem_info();
1109 excess = zone_rss - zone_rss_cap;
1110 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
1111 zone_rss, zone_rss_cap, excess);
1112 if (excess <= 0)
1113 continue;
1114 }
1115
1116 while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
1117 pid_t pid;
1118
1119 if (strcmp(".", dirent->d_name) == 0 ||
1120 strcmp("..", dirent->d_name) == 0)
1121 continue;
1122
1123 pid = atoi(dirent->d_name);
1124 if (pid == 0 || pid == 1)
1125 continue;
1126
1127 if (skip_pageout)
1128 (void) sleep_shutdown(2);
1129 else
1130 excess = pageout_process(pid, excess);
1131
1132 if (excess <= 0) {
1133 debug("apparently under; excess %lld\n",
1134 (long long)excess);
1135 /* Double check the current excess */
1136 excess = check_suspend();
1137 }
1138 }
1139
1140 debug("process pass done; excess %lld\n", (long long)excess);
1141 rewinddir(pdir);
1142
1143 if (skip_pageout)
1144 (void) sleep_shutdown(120);
1145 }
1146
1147 if (pdir != NULL)
1148 (void) closedir(pdir);
1149 debug("thread shutdown\n");
1150 }
1151
1152 void
1153 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1154 {
1155 int res;
1156
1157 shutting_down = 0;
1158 zid = id;
1159 logp = zlogp;
1160
1161 /* all but the lx brand currently use /proc */
1162 if (strcmp(brand_name, "lx") == 0) {
1163 (void) snprintf(zoneproc, sizeof (zoneproc),
1164 "%s/root/native/proc", zonepath);
1165 } else {
1166 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1167 zonepath);
1168 }
1169
1170 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1171 zonepath);
1172
1173 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1174 &mcap_tid);
1175 if (res != 0) {
1176 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1177 res);
1178 mcap_tid = 0;
1179 }
1180 }
1181
1182 void
1183 destroy_mcap_thread()
1184 {
1185 if (mcap_tid != 0) {
1186 shutting_down = 1;
1187 (void) cond_signal(&shutdown_cv);
1188 (void) thr_join(mcap_tid, NULL, NULL);
1189 mcap_tid = 0;
1190 }
1191 }