1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 */
25
26 /*
27 * This file implements the code which runs a thread inside zoneadmd to cap
28 * the associated zone's physical memory. A thread to do this is started
29 * when the zone boots and is halted when the zone shuts down.
30 *
31 * Because of the way that the VM system is currently implemented, there is no
32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
 * memory cap. Instead, we implement a soft physical memory cap which looks
 * at the zone's overall rss and, once it is over the cap, works from the top
 * down (zone to process to page), walking the zone's processes to determine
 * what to try to page out to bring the zone back under its memory cap.
38 *
 * The code uses the fast, cheap, but potentially very inaccurate sum of the
 * rss values from psinfo_t to first approximate the zone's rss, and will
 * fall back to the vm_getusage syscall to determine the zone's rss if needed.
 * It then checks the rss against the zone's zone.max-physical-memory rctl.
 * Once the zone goes over its cap, this thread will work through the zone's
 * /proc process list, reading each process's address space mappings and
 * attempting to invalidate them (via the private _RUSAGESYS_INVALMAP
 * operation of the rusagesys syscall) to page out pages, until the zone is
 * again under its cap.
47 *
48 * Although zone memory capping is implemented as a soft cap by this user-level
49 * thread, the interfaces around memory caps that are exposed to the user are
 * the standard ones: an rctl and kstats. This thread uses the rctl value
51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 * If the implementation ever moves into the kernel, these exposed interfaces
53 * do not need to change.
54 *
55 * The thread adaptively sleeps, periodically checking the state of the
56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 * more often to check the zone's status. Once the zone is over the cap,
58 * the thread will work to pageout until the zone is under the cap, as shown
59 * by updated vm_usage data.
60 *
61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 * segvn_softunlock -^
65 * On SPARC there is an additional code path which may make this data
66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 * do more with the data on active pages.
69 *
70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 * cause the thread to start logging its actions into that file (it may take
72 * a minute or two if the thread is currently sleeping). Removing that
73 * file will cause logging to stop.
74 */
75
76 #include <sys/mman.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #include <assert.h>
81 #include <errno.h>
82 #include <fcntl.h>
83 #include <libproc.h>
84 #include <limits.h>
85 #include <procfs.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <strings.h>
89 #include <time.h>
90 #include <unistd.h>
91 #include <sys/priocntl.h>
92 #include <dirent.h>
93 #include <zone.h>
94 #include <libzonecfg.h>
95 #include <thread.h>
96 #include <values.h>
97 #include <sys/vm_usage.h>
98 #include <sys/resource.h>
99 #include <sys/debug.h>
100 #include <synch.h>
101 #include <wait.h>
102 #include <libcontract.h>
103 #include <libcontract_priv.h>
104 #include <sys/contract/process.h>
105 #include "zoneadmd.h"
106
107 /* round up to next y = 2^n */
108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109
110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111
112 /*
113 * zonecfg attribute tunables for memory capping.
114 * phys-mcap-cmd
115 * type: string
116 * specifies a command that can be run when over the cap
117 * phys-mcap-no-vmusage
118 * type: boolean
119 * true disables vm_getusage and just uses zone's proc. rss sum
120 * phys-mcap-no-pageout
121 * type: boolean
122 * true disables pageout when over
123 * phys-mcap-no-pf-throttle
124 * type: boolean
125 * true disables page fault throttling when over
126 */
127 #define TUNE_CMD "phys-mcap-cmd"
128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131
132 /*
133 * The large mapping value was derived empirically by seeing that mappings
134 * much bigger than 16mb sometimes take a relatively long time to invalidate
135 * (significant fraction of a second).
136 */
137 #define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 #define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 #define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140
141 /*
 * These are only used in get_mem_info() but are kept global: scale_rss and
 * prev_fast_rss must persist across calls, and the other two are global as
 * well so that all of them can easily be inspected with mdb.
145 */
146 uint64_t scale_rss = 0;
147 uint64_t prev_fast_rss = 0;
148 uint64_t fast_rss = 0;
149 uint64_t accurate_rss = 0;
150
151 static char zoneproc[MAXPATHLEN];
152 static char debug_log[MAXPATHLEN];
153 static zoneid_t zid;
154 static mutex_t shutdown_mx;
155 static cond_t shutdown_cv;
156 static int shutting_down = 0;
157 static thread_t mcap_tid;
158 static FILE *debug_log_fp = NULL;
159 static uint64_t zone_rss_cap; /* RSS cap(KB) */
160 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
161 static boolean_t skip_vmusage = B_FALSE;
162 static boolean_t skip_pageout = B_FALSE;
163 static boolean_t skip_pf_throttle = B_FALSE;
164
165 static int64_t check_suspend();
166 static void get_mcap_tunables();
167
168 /*
169 * Structure to hold current state about a process address space that we're
170 * working on.
171 */
172 typedef struct {
173 int pr_curr; /* the # of the mapping we're working on */
174 int pr_nmap; /* number of mappings in address space */
175 prmap_t *pr_mapp; /* process's map array */
176 } proc_map_t;
177
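/*
 * Local copy of the vmusage result structure, with explicit padding so that a
 * 32-bit zoneadmd process lays out the uint64_t members the same way the
 * amd64 kernel does (see the note inside the struct).
 */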
178 typedef struct zsd_vmusage64 {
179 id_t vmu_zoneid;
180 uint_t vmu_type;
181 id_t vmu_id;
182 /*
183 * An amd64 kernel will align the following uint64_t members, but a
 * 32-bit i386 process will not without help.
185 */
186 int vmu_align_next_members_on_8_bytes;
187 uint64_t vmu_rss_all;
188 uint64_t vmu_rss_private;
189 uint64_t vmu_rss_shared;
190 uint64_t vmu_swap_all;
191 uint64_t vmu_swap_private;
192 uint64_t vmu_swap_shared;
193 } zsd_vmusage64_t;
194
195 /*
196 * Output a debug log message.
197 */
198 /*PRINTFLIKE1*/
199 static void
200 debug(char *fmt, ...)
201 {
202 va_list ap;
203
204 if (debug_log_fp == NULL)
205 return;
206
207 va_start(ap, fmt);
208 (void) vfprintf(debug_log_fp, fmt, ap);
209 va_end(ap);
210 (void) fflush(debug_log_fp);
211 }
212
213 /*
 * Like sleep(3C) but can be interrupted by cond_signal, which is posted when
215 * we're shutting down the mcap thread.
216 */
217 static void
218 sleep_shutdown(int secs)
219 {
220 timestruc_t to;
221
222 to.tv_sec = secs;
223 to.tv_nsec = 0;
224
225 (void) mutex_lock(&shutdown_mx);
226 if (!shutting_down)
227 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
228 (void) mutex_unlock(&shutdown_mx);
229 }
230
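/*
 * Return B_TRUE if the process is in the SYS scheduling class (or if we
 * cannot tell); such processes are skipped by the pageout pass.
 */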
231 static boolean_t
232 proc_issystem(pid_t pid)
233 {
234 char pc_clname[PC_CLNMSZ];
235
236 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
237 PC_KY_NULL) != -1)
238 return (strcmp(pc_clname, "SYS") == 0);
239
240 return (B_TRUE);
241 }
242
243 /*
244 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
245 */
246 static void
247 run_over_cmd()
248 {
249 int ctfd;
250 int err;
251 pid_t childpid;
252 siginfo_t info;
253 ctid_t ct;
254
255 /*
256 * Before we enter the zone, we need to create a new process contract
257 * for the child, as required by zone_enter().
258 */
259 if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
260 return;
261 if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
262 ct_tmpl_set_informative(ctfd, 0) != 0 ||
263 ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
264 ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
265 ct_tmpl_activate(ctfd) != 0) {
266 (void) close(ctfd);
267 return;
268 }
269
270 childpid = fork();
271 switch (childpid) {
272 case -1:
273 (void) ct_tmpl_clear(ctfd);
274 (void) close(ctfd);
275 break;
276 case 0: /* Child */
277 (void) ct_tmpl_clear(ctfd);
278 (void) close(ctfd);
279 if (zone_enter(zid) == -1)
280 _exit(errno);
281 err = system(over_cmd);
282 _exit(err);
283 break;
284 default: /* Parent */
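		/*
		 * Note the child's contract, clear the template, wait for the
		 * child, and then abandon the contract so it does not linger.
		 */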
285 if (contract_latest(&ct) == -1)
286 ct = -1;
287 (void) ct_tmpl_clear(ctfd);
288 (void) close(ctfd);
289 err = waitid(P_PID, childpid, &info, WEXITED);
290 (void) contract_abandon_id(ct);
291 if (err == -1 || info.si_status != 0)
292 debug("over_cmd failed");
293 break;
294 }
295 }
296
297 /*
298 * Get the next mapping.
299 */
300 static prmap_t *
301 nextmapping(proc_map_t *pmp)
302 {
303 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
304 return (NULL);
305
306 return (&pmp->pr_mapp[pmp->pr_curr++]);
307 }
308
309 /*
310 * Initialize the proc_map_t to access the first mapping of an address space.
311 */
312 static prmap_t *
313 init_map(proc_map_t *pmp, pid_t pid)
314 {
315 int fd;
316 int res;
317 struct stat st;
318 char pathbuf[MAXPATHLEN];
319
320 bzero(pmp, sizeof (proc_map_t));
321 pmp->pr_nmap = -1;
322
323 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
324 if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
325 return (NULL);
326
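	/*
	 * The process's address space can change between the fstat() and the
	 * pread() below; if we read a different amount than expected, go back
	 * and try again with the new size.
	 */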
327 redo:
328 errno = 0;
329 if (fstat(fd, &st) != 0)
330 goto done;
331
332 if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap\n", st.st_size);
334 goto done;
335 }
336 (void) bzero(pmp->pr_mapp, st.st_size);
337
338 errno = 0;
339 if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
340 free(pmp->pr_mapp);
341 pmp->pr_mapp = NULL;
342 if (res > 0 || errno == E2BIG) {
343 goto redo;
344 } else {
345 debug("pid %ld cannot read xmap\n", pid);
346 goto done;
347 }
348 }
349
350 pmp->pr_nmap = st.st_size / sizeof (prmap_t);
351
352 done:
353 (void) close(fd);
354 return (nextmapping(pmp));
355 }
356
357 /*
358 * Attempt to invalidate the entire mapping from within the given process's
359 * address space. May return nonzero with errno as:
360 * ESRCH - process not found
361 * ENOMEM - segment not found
362 * EINVAL - mapping exceeds a single segment
363 */
364 static int
365 pageout_mapping(pid_t pid, prmap_t *pmp)
366 {
367 int res;
368
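	/*
	 * Skip shared memory (ISM/SHM) mappings; those pages are shared with
	 * other processes, so invalidating them here would not be productive.
	 */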
369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
370 return (0);
371
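	/*
	 * Use the private _RUSAGESYS_INVALMAP operation to ask the kernel to
	 * invalidate (and thus allow reclaiming of) the resident pages
	 * backing this mapping.
	 */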
372 errno = 0;
373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
374 pmp->pr_size);
375
376 return (res);
377 }
378
379 /*
 * Work through a process, paging out mappings, until the whole address space
 * has been examined or the excess is no longer positive. Return our estimate
 * of the updated excess.
382 */
383 static int64_t
384 pageout_process(pid_t pid, int64_t excess)
385 {
386 int psfd;
387 prmap_t *pmap;
388 proc_map_t cur;
389 int64_t sum_d_rss, d_rss;
390 int64_t old_rss;
391 int map_cnt;
392 psinfo_t psinfo;
393 char pathbuf[MAXPATHLEN];
394
395 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
396 pid);
397 if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
398 return (excess);
399
400 cur.pr_mapp = NULL;
401
402 if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
403 goto done;
404
405 old_rss = (int64_t)psinfo.pr_rssize;
406 map_cnt = 0;
407
408 /* If unscannable, skip it. */
409 if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
410 debug("pid %ld: system process, skipping %s\n",
411 pid, psinfo.pr_psargs);
412 goto done;
413 }
414
415 /* If tiny RSS (16KB), skip it. */
416 if (old_rss <= 16) {
417 debug("pid %ld: skipping, RSS %lldKB %s\n",
418 pid, old_rss, psinfo.pr_psargs);
419 goto done;
420 }
421
422 /* Get segment residency information. */
423 pmap = init_map(&cur, pid);
424
425 /* Skip process if it has no mappings. */
426 if (pmap == NULL) {
427 debug("pid %ld: map unreadable; ignoring\n", pid);
428 goto done;
429 }
430
431 debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
432 pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
433
434 /*
435 * Within the process's address space, attempt to page out mappings.
436 */
437 sum_d_rss = 0;
438 while (excess > 0 && pmap != NULL && !shutting_down) {
439 /* invalidate the entire mapping */
440 if (pageout_mapping(pid, pmap) < 0)
441 debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
442 pid, (void *)pmap->pr_vaddr,
443 (long)pmap->pr_size / 1024L, errno);
444
445 map_cnt++;
446
447 /*
448 * Re-check the process rss and get the delta.
449 */
450 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
451 != sizeof (psinfo)) {
452 excess -= old_rss;
453 goto done;
454 }
455
456 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
457 old_rss = (int64_t)psinfo.pr_rssize;
458 sum_d_rss += d_rss;
459
460 /*
461 * d_rss hopefully should be negative (or 0 if nothing
462 * invalidated) but can be positive if more got paged in.
463 */
464 excess += d_rss;
465
466 if (excess <= 0) {
467 debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
468 "excess %lldKB\n", pid, map_cnt,
			    (long long)sum_d_rss, (long long)excess);
470 map_cnt = 0;
471
472 /*
473 * If we're actually under, this will suspend checking
474 * in the middle of this process's address space.
475 */
476 excess = check_suspend();
477 if (shutting_down)
478 goto done;
479
480 /*
481 * since we might have suspended, re-read process's rss
482 */
483 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
484 != sizeof (psinfo)) {
485 excess -= old_rss;
486 goto done;
487 }
488
489 old_rss = (int64_t)psinfo.pr_rssize;
490
491 debug("pid %ld: resume pageout; excess %lld\n", pid,
492 (long long)excess);
493 sum_d_rss = 0;
494 }
495
496 pmap = nextmapping(&cur);
497 }
498
499 debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
	    pid, map_cnt, (long long)sum_d_rss, (long long)excess);
501
502 done:
503 if (cur.pr_mapp != NULL)
504 free(cur.pr_mapp);
505
506 (void) close(psfd);
507
508 if (shutting_down)
509 return (0);
510
511 return (excess);
512 }
513
514 /*
515 * Get the zone's RSS data.
516 */
517 static uint64_t
518 get_mem_info()
519 {
520 uint64_t n = 1;
521 zsd_vmusage64_t buf;
522 uint64_t tmp_rss;
523 DIR *pdir = NULL;
524 struct dirent *dent;
525
526 /*
527 * Start by doing the fast, cheap RSS calculation using the rss value
528 * in psinfo_t. Because that's per-process, it can lead to double
529 * counting some memory and overestimating how much is being used, but
 * as long as that's not over the cap, then we don't need to do the
531 * expensive calculation.
532 *
533 * If we have to do the expensive calculation, we remember the scaling
534 * factor so that we can try to use that on subsequent iterations for
535 * the fast rss.
536 */
537 if (shutting_down)
538 return (0);
539
540 if ((pdir = opendir(zoneproc)) == NULL)
541 return (0);
542
543 accurate_rss = 0;
544 fast_rss = 0;
545 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
546 pid_t pid;
547 int psfd;
548 int64_t rss;
549 char pathbuf[MAXPATHLEN];
550 psinfo_t psinfo;
551
552 if (strcmp(".", dent->d_name) == 0 ||
553 strcmp("..", dent->d_name) == 0)
554 continue;
555
556 pid = atoi(dent->d_name);
557 if (pid == 0 || pid == 1)
558 continue;
559
560 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
561 zoneproc, pid);
562
563 rss = 0;
564 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
565 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
566 sizeof (psinfo))
567 rss = (int64_t)psinfo.pr_rssize;
568
569 (void) close(psfd);
570 }
571
572 fast_rss += rss;
573 }
574
575 (void) closedir(pdir);
576
577 if (shutting_down)
578 return (0);
579
580 debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
581 scale_rss, prev_fast_rss);
582
583 /* see if we can get by with a scaled fast rss */
584 tmp_rss = fast_rss;
585 if (scale_rss > 1 && prev_fast_rss > 0) {
586 /*
587 * Only scale the fast value if it hasn't ballooned too much
588 * to trust.
589 */
590 if (fast_rss / prev_fast_rss < 2) {
591 fast_rss /= scale_rss;
592 debug("scaled fast rss: %lluKB\n", fast_rss);
593 }
594 }
595
596 if (fast_rss <= zone_rss_cap || skip_vmusage) {
597 uint64_t zone_rss_bytes;
598
599 zone_rss_bytes = fast_rss * 1024;
		/* Tell the kernel the zone's approximate RSS. */
601 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
602 return (fast_rss);
603 }
604
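	/*
	 * The fast estimate is over the cap (and vm_getusage is not
	 * disabled), so fall back to the accurate, but more expensive,
	 * per-zone usage data from the kernel.
	 */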
605 buf.vmu_id = zid;
606
607 /* get accurate usage (cached data may be up to 5 seconds old) */
608 if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
609 (uintptr_t)&buf, (uintptr_t)&n) != 0) {
610 debug("vmusage failed\n");
611 (void) sleep_shutdown(1);
612 return (0);
613 }
614
615 if (n > 1) {
616 /* This should never happen */
617 debug("vmusage returned more than one result\n");
618 (void) sleep_shutdown(1);
619 return (0);
620 }
621
622 if (buf.vmu_id != zid) {
623 /* This should never happen */
624 debug("vmusage returned the incorrect zone\n");
625 (void) sleep_shutdown(1);
626 return (0);
627 }
628
629 accurate_rss = buf.vmu_rss_all / 1024;
630
631 /* calculate scaling factor to use for fast_rss from now on */
632 if (accurate_rss > 0) {
633 scale_rss = fast_rss / accurate_rss;
634 debug("new scaling factor: %llu\n", scale_rss);
635 /* remember the fast rss when we had to get the accurate rss */
636 prev_fast_rss = tmp_rss;
637 }
638
639 debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
640 scale_rss, prev_fast_rss);
641 return (accurate_rss);
642 }
643
644 /*
 * Needed to read the zone's physical-memory-cap rctl.
646 */
647 static struct ps_prochandle *
648 grab_zone_proc()
649 {
650 DIR *dirp;
651 struct dirent *dentp;
652 struct ps_prochandle *ph = NULL;
653 int tmp;
654
655 if ((dirp = opendir(zoneproc)) == NULL)
656 return (NULL);
657
658 while (!shutting_down && (dentp = readdir(dirp))) {
659 int pid;
660
661 if (strcmp(".", dentp->d_name) == 0 ||
662 strcmp("..", dentp->d_name) == 0)
663 continue;
664
665 pid = atoi(dentp->d_name);
666 /* attempt to grab process */
667 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
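			/*
			 * Set run-on-last-close so the victim resumes when we
			 * release it, then create an agent LWP so we can
			 * issue pr_getrctl() in the process's context.
			 */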
668 if (Psetflags(ph, PR_RLC) == 0) {
669 if (Pcreate_agent(ph) == 0) {
670 (void) closedir(dirp);
671 return (ph);
672 }
673 }
674 Prelease(ph, 0);
675 }
676 }
677
678 (void) closedir(dirp);
679 return (NULL);
680 }
681
682 static uint64_t
683 get_zone_cap()
684 {
685 rctlblk_t *rblk;
686 uint64_t mcap;
687 struct ps_prochandle *ph;
688
689 if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
690 return (UINT64_MAX);
691
692 if ((ph = grab_zone_proc()) == NULL) {
693 free(rblk);
694 return (UINT64_MAX);
695 }
696
697 if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
698 RCTL_FIRST)) {
699 Pdestroy_agent(ph);
700 Prelease(ph, 0);
701 free(rblk);
702 return (UINT64_MAX);
703 }
704
705 Pdestroy_agent(ph);
706 Prelease(ph, 0);
707
708 mcap = rctlblk_get_value(rblk);
709 free(rblk);
710 return (mcap);
711 }
712
713 /*
 * check_suspend is invoked at the beginning of every pass through the process
 * list, or after we think we've paged out enough to bring the zone back under
 * its cap. The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap. The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * The closer the zone's rss gets to the cap, the shorter we sleep between
 * checks. This reduces the impact of this work on the system, which is
 * important considering that each capped zone will be monitoring its rss.
724 */
725 static int64_t
726 check_suspend()
727 {
728 static hrtime_t last_cap_read = 0;
729 static uint64_t addon;
730 static uint64_t lo_thresh; /* Thresholds for how long to sleep */
731 static uint64_t hi_thresh; /* when under the cap (80% & 90%). */
732 static uint64_t prev_zone_rss = 0;
733 static uint32_t pfdelay = 0; /* usec page fault delay when over */
734
735 /* Wait a second to give the async pageout a chance to catch up. */
736 (void) sleep_shutdown(1);
737
738 while (!shutting_down) {
739 int64_t new_excess;
740 int sleep_time;
741 hrtime_t now;
742 struct stat st;
743 uint64_t zone_rss; /* total RSS(KB) */
744
745 /*
		 * Check whether the debug log file exists and enable or
		 * disable debug logging accordingly.
748 */
749 if (debug_log_fp == NULL) {
750 if (stat(debug_log, &st) == 0)
751 debug_log_fp = fopen(debug_log, "w");
752 } else {
753 if (stat(debug_log, &st) == -1) {
754 (void) fclose(debug_log_fp);
755 debug_log_fp = NULL;
756 }
757 }
758
759 /*
760 * If the CAP_REFRESH interval has passed, re-get the current
761 * cap in case it has been dynamically updated.
762 */
763 now = gethrtime();
764 if (now - last_cap_read > CAP_REFRESH) {
765 uint64_t mcap;
766
767 last_cap_read = now;
768
769 mcap = get_zone_cap();
770 if (mcap != 0 && mcap != UINT64_MAX)
771 zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
772 else
773 zone_rss_cap = UINT64_MAX;
774
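			/*
			 * The lo/hi thresholds (80% and 90% of the cap) drive
			 * the adaptive sleep below; addon is the extra 5% we
			 * try to page out once we do go over the cap.
			 */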
775 lo_thresh = (uint64_t)(zone_rss_cap * .8);
776 hi_thresh = (uint64_t)(zone_rss_cap * .9);
777 addon = (uint64_t)(zone_rss_cap * 0.05);
778
779 /*
780 * We allow the memory cap tunables to be changed on
781 * the fly.
782 */
783 get_mcap_tunables();
784
785 debug("%s: %s\n", TUNE_CMD, over_cmd);
786 debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
787 debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
788 debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
789 debug("current cap %lluKB lo %lluKB hi %lluKB\n",
790 zone_rss_cap, lo_thresh, hi_thresh);
791 }
792
793 /* No cap, nothing to do. */
794 if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
795 debug("no cap, sleep 120 seconds\n");
796 (void) sleep_shutdown(120);
797 continue;
798 }
799
800 zone_rss = get_mem_info();
801
802 /* calculate excess */
803 new_excess = zone_rss - zone_rss_cap;
804
805 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
806 zone_rss, zone_rss_cap, new_excess);
807
808 /*
		 * If necessary, update stats.
810 */
811
812 /*
		 * If it looks like we did some paging out since we were last
		 * over the cap, then update the kstat so we can approximate
		 * how much was paged out.
816 */
817 if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
818 uint64_t diff;
819
820 /* assume diff is num bytes we paged out */
821 diff = (prev_zone_rss - zone_rss) * 1024;
822
823 (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
824 &diff, 0);
825 }
826 prev_zone_rss = zone_rss;
827
828 if (new_excess > 0) {
829 uint64_t n = 1;
830
831 /* Increment "nover" kstat. */
832 (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
833
834 if (!skip_pf_throttle) {
835 /*
836 * Tell the kernel to start throttling page
837 * faults by some number of usecs to help us
838 * catch up. If we are persistently over the
839 * cap the delay ramps up to a max of 2000usecs.
840 * Note that for delays less than 1 tick
841 * (i.e. all of these) we busy-wait in as_fault.
842 * delay faults/sec
843 * 125 8000
844 * 250 4000
845 * 500 2000
846 * 1000 1000
847 * 2000 500
848 */
849 if (pfdelay == 0)
850 pfdelay = 125;
851 else if (pfdelay < 2000)
852 pfdelay *= 2;
853
854 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
855 &pfdelay, 0);
856 }
857
858 /*
			 * Once we go over the cap, we want to page out
			 * a little extra instead of stopping right at
			 * the cap. To do this we add 5% to the excess
			 * so that pageout_process will work a little
			 * longer before stopping.
864 */
865 return ((int64_t)(new_excess + addon));
866 }
867
868 /*
869 * At this point we are under the cap.
870 *
871 * Tell the kernel to stop throttling page faults.
872 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage based on how far the rss has penetrated
		 * into the cap.
877 */
878 if (pfdelay > 0) {
879 pfdelay = 0;
880 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
881 &pfdelay, 0);
882 }
883
884 if (zone_rss <= lo_thresh) {
885 sleep_time = 120;
886 } else if (zone_rss <= hi_thresh) {
887 sleep_time = 60;
888 } else {
889 sleep_time = 30;
890 }
891
892 debug("sleep %d seconds\n", sleep_time);
893 (void) sleep_shutdown(sleep_time);
894 }
895
896 /* Shutting down, tell the kernel so it doesn't throttle */
897 if (pfdelay > 0) {
898 pfdelay = 0;
899 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
900 }
901
902 return (0);
903 }
904
905 static void
906 get_mcap_tunables()
907 {
908 zone_dochandle_t handle;
909 struct zone_attrtab attr;
910
911 over_cmd[0] = '\0';
912 if ((handle = zonecfg_init_handle()) == NULL)
913 return;
914
915 if (zonecfg_get_handle(zone_name, handle) != Z_OK)
916 goto done;
917
	/* Reset to defaults in case we are rebooting and settings changed. */
919 over_cmd[0] = '\0';
920 skip_vmusage = B_FALSE;
921 skip_pageout = B_FALSE;
922 skip_pf_throttle = B_FALSE;
923
924 if (zonecfg_setattrent(handle) != Z_OK)
925 goto done;
926 while (zonecfg_getattrent(handle, &attr) == Z_OK) {
927 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
928 (void) strlcpy(over_cmd, attr.zone_attr_value,
929 sizeof (over_cmd));
930 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
931 if (strcmp("true", attr.zone_attr_value) == 0)
932 skip_vmusage = B_TRUE;
933 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
934 if (strcmp("true", attr.zone_attr_value) == 0)
935 skip_pageout = B_TRUE;
936 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
937 if (strcmp("true", attr.zone_attr_value) == 0)
938 skip_pf_throttle = B_TRUE;
939 }
940 }
941 (void) zonecfg_endattrent(handle);
942
943 done:
944 zonecfg_fini_handle(handle);
945 }
946
947 /* ARGSUSED */
948 static int
949 chk_proc_fs(void *data, const char *spec, const char *dir,
950 const char *fstype, const char *opt)
951 {
952 if (fstype != NULL && strcmp(fstype, "proc") == 0)
953 *((boolean_t *)data) = B_TRUE;
954
955 return (0);
956 }
957
958 static boolean_t
959 has_proc()
960 {
961 brand_handle_t bh;
962 boolean_t fnd = B_FALSE;
963
964 if ((bh = brand_open(brand_name)) != NULL) {
965 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
966 }
967
968 brand_close(bh);
969 return (fnd);
970 }
971
972 /*
 * For brands with no /proc mounted in the zone, we run this loop simply to
 * update the zone's RSS every 5 minutes, using the cheap global zone /proc
 * data.
975 */
976 static void
977 no_procfs()
978 {
979 DIR *pdir = NULL;
980 struct dirent *dent;
981 uint64_t zone_rss_bytes;
982
983 (void) sleep_shutdown(30);
984 while (!shutting_down) {
985 /*
986 * Just do the fast, cheap RSS calculation using the rss value
987 * in psinfo_t. Because that's per-process, it can lead to
988 * double counting some memory and overestimating how much is
989 * being used. Since there is no /proc in the zone, we use the
990 * GZ /proc and check for the correct zone.
991 */
992 if ((pdir = opendir("/proc")) == NULL)
993 return;
994
995 fast_rss = 0;
996 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
997 pid_t pid;
998 int psfd;
999 int64_t rss;
1000 char pathbuf[MAXPATHLEN];
1001 psinfo_t psinfo;
1002
1003 if (strcmp(".", dent->d_name) == 0 ||
1004 strcmp("..", dent->d_name) == 0)
1005 continue;
1006
1007 pid = atoi(dent->d_name);
1008 if (pid == 0 || pid == 1)
1009 continue;
1010
1011 (void) snprintf(pathbuf, sizeof (pathbuf),
1012 "/proc/%d/psinfo", pid);
1013
1014 rss = 0;
1015 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
1016 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
1017 sizeof (psinfo)) {
1018 if (psinfo.pr_zoneid == zid)
1019 rss = (int64_t)psinfo.pr_rssize;
1020 }
1021
1022 (void) close(psfd);
1023 }
1024
1025 fast_rss += rss;
1026 }
1027
1028 (void) closedir(pdir);
1029
1030 if (shutting_down)
1031 return;
1032
1033 zone_rss_bytes = fast_rss * 1024;
		/* Tell the kernel the zone's approximate RSS. */
1035 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
1036
1037 (void) sleep_shutdown(300);
1038 }
1039 }
1040
1041 /*
 * Thread that checks the zone's memory usage and, when over the cap, goes
 * through the zone's process list trying to pageout processes to get back
 * under the cap.
1044 */
1045 static void
1046 mcap_zone()
1047 {
1048 DIR *pdir = NULL;
1049 int64_t excess;
1050
1051 debug("thread startup\n");
1052
1053 get_mcap_tunables();
1054
1055 /*
1056 * If the zone has no /proc filesystem, we can't use the fast algorithm
1057 * to check RSS or pageout any processes. All we can do is periodically
	 * update its RSS kstat using the global zone's /proc data.
1059 */
1060 if (!has_proc()) {
1061 no_procfs();
1062 debug("thread shutdown\n");
1063 return;
1064 }
1065
1066 /*
1067 * When first starting it is likely lots of other zones are starting
1068 * too because the system is booting. Since we just started the zone
1069 * we're not worried about being over the cap right away, so we let
1070 * things settle a bit and tolerate some older data here to minimize
1071 * the load on the system.
1072 */
1073 (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
1074
1075 /* Wait until zone's /proc is mounted */
1076 while (!shutting_down) {
1077 struct stat st;
1078
1079 if (stat(zoneproc, &st) == 0 &&
1080 strcmp(st.st_fstype, "proc") == 0)
1081 break;
1082 sleep_shutdown(5);
1083 }
1084
1085 /* Open zone's /proc and walk entries. */
1086 while (!shutting_down) {
1087 if ((pdir = opendir(zoneproc)) != NULL)
1088 break;
1089 sleep_shutdown(5);
1090 }
1091
1092 while (!shutting_down) {
1093 struct dirent *dirent;
1094
1095 /* Wait until we've gone over the cap. */
1096 excess = check_suspend();
1097
1098 debug("starting to scan, excess %lldk\n", (long long)excess);
1099
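		/*
		 * If a phys-mcap-cmd was configured, run it first and then
		 * re-check the zone's rss; the command may have freed enough
		 * memory that we don't need to page anything out ourselves.
		 */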
1100 if (over_cmd[0] != '\0') {
1101 uint64_t zone_rss; /* total RSS(KB) */
1102
			debug("run phys-mcap-cmd: %s\n", over_cmd);
1104 run_over_cmd();
1105
1106 zone_rss = get_mem_info();
1107 excess = zone_rss - zone_rss_cap;
1108 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
1109 zone_rss, zone_rss_cap, excess);
1110 if (excess <= 0)
1111 continue;
1112 }
1113
1114 while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
1115 pid_t pid;
1116
1117 if (strcmp(".", dirent->d_name) == 0 ||
1118 strcmp("..", dirent->d_name) == 0)
1119 continue;
1120
1121 pid = atoi(dirent->d_name);
1122 if (pid == 0 || pid == 1)
1123 continue;
1124
1125 if (skip_pageout)
1126 (void) sleep_shutdown(2);
1127 else
1128 excess = pageout_process(pid, excess);
1129
1130 if (excess <= 0) {
1131 debug("apparently under; excess %lld\n",
1132 (long long)excess);
1133 /* Double check the current excess */
1134 excess = check_suspend();
1135 }
1136 }
1137
1138 debug("process pass done; excess %lld\n", (long long)excess);
1139 rewinddir(pdir);
1140
1141 if (skip_pageout)
1142 (void) sleep_shutdown(120);
1143 }
1144
1145 if (pdir != NULL)
1146 (void) closedir(pdir);
1147 debug("thread shutdown\n");
1148 }
1149
1150 void
1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1152 {
1153 int res;
1154
1155 shutting_down = 0;
1156 zid = id;
1157
	/*
	 * All brands other than lx mount the native /proc at <root>/proc;
	 * the lx brand keeps it at <root>/native/proc.
	 */
1159 if (strcmp(brand_name, "lx") == 0) {
1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1161 "%s/root/native/proc", zonepath);
1162 } else {
1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1164 zonepath);
1165 }
1166
1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1168 zonepath);
1169
1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1171 &mcap_tid);
1172 if (res != 0) {
1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1174 res);
1175 mcap_tid = 0;
1176 }
1177 }
1178
1179 void
1180 destroy_mcap_thread()
1181 {
1182 if (mcap_tid != 0) {
1183 shutting_down = 1;
1184 (void) cond_signal(&shutdown_cv);
1185 (void) thr_join(mcap_tid, NULL, NULL);
1186 mcap_tid = 0;
1187 }
1188 }