1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 */
25
26 /*
27 * This file implements the code which runs a thread inside zoneadmd to cap
28 * the associated zone's physical memory. A thread to do this is started
29 * when the zone boots and is halted when the zone shuts down.
30 *
31 * Because of the way that the VM system is currently implemented, there is no
32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
34 * memory cap. Instead, we implement a soft physical memory cap which looks
35 * at the zone's overall rss and once it is over the cap, works from the top
36 * down (zone to process to page), looking at zone processes, to determine
37 * what to try to pageout to get the zone under its memory cap.
38 *
39 * The code uses the fast, cheap, but potentially very inaccurate sum of the
40 * rss values from psinfo_t to first approximate the zone's rss and will
41 * fallback to the vm_getusage syscall to determine the zone's rss if needed.
42 * It then checks the rss against the zone's zone.max-physical-memory rctl.
43 * Once the zone goes over its cap, then this thread will work through the
44 * zone's /proc process list, Pgrab-bing each process and stepping through the
45 * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
46 * to pageout pages, until the zone is again under its cap.
47 *
48 * Although zone memory capping is implemented as a soft cap by this user-level
49 * thread, the interfaces around memory caps that are exposed to the user are
50 * the standard ones; an rctl and kstats. This thread uses the rctl value
51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 * If the implementation ever moves into the kernel, these exposed interfaces
53 * do not need to change.
54 *
55 * The thread adaptively sleeps, periodically checking the state of the
56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 * more often to check the zone's status. Once the zone is over the cap,
58 * the thread will work to pageout until the zone is under the cap, as shown
59 * by updated vm_usage data.
60 *
61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 * segvn_softunlock -^
65 * On SPARC there is an additional code path which may make this data
66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 * do more with the data on active pages.
69 *
70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 * cause the thread to start logging its actions into that file (it may take
72 * a minute or two if the thread is currently sleeping). Removing that
73 * file will cause logging to stop.
74 */
75
76 #include <sys/mman.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #include <assert.h>
81 #include <errno.h>
82 #include <fcntl.h>
83 #include <libproc.h>
84 #include <limits.h>
85 #include <procfs.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <strings.h>
89 #include <time.h>
90 #include <unistd.h>
91 #include <sys/priocntl.h>
92 #include <dirent.h>
93 #include <zone.h>
94 #include <libzonecfg.h>
95 #include <thread.h>
96 #include <values.h>
97 #include <sys/vm_usage.h>
98 #include <sys/resource.h>
99 #include <sys/debug.h>
100 #include <synch.h>
101 #include <wait.h>
102 #include <libcontract.h>
103 #include <libcontract_priv.h>
104 #include <sys/contract/process.h>
105 #include "zoneadmd.h"
106
107 /* round up to next y = 2^n */
108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109
110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111
112 /*
113 * zonecfg attribute tunables for memory capping.
114 * phys-mcap-cmd
115 * type: string
116 * specifies a command that can be run when over the cap
117 * phys-mcap-no-vmusage
118 * type: boolean
119 * true disables vm_getusage and just uses zone's proc. rss sum
120 * phys-mcap-no-pageout
121 * type: boolean
122 * true disables pageout when over
123 * phys-mcap-no-pf-throttle
124 * type: boolean
125 * true disables page fault throttling when over
126 */
127 #define TUNE_CMD "phys-mcap-cmd"
128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131
132 /*
133 * These are only used in get_mem_info but global. We always need scale_rss and
134 * prev_fast_rss to be persistent but we also have the other two global so we
135 * can easily see these with mdb.
136 */
137 uint64_t scale_rss = 0;
138 uint64_t prev_fast_rss = 0;
139 uint64_t fast_rss = 0;
140 uint64_t accurate_rss = 0;
141
142 static char zoneproc[MAXPATHLEN];
143 static char debug_log[MAXPATHLEN];
144 static zoneid_t zid;
145 static mutex_t shutdown_mx;
146 static cond_t shutdown_cv;
147 static int shutting_down = 0;
148 static thread_t mcap_tid;
149 static FILE *debug_log_fp = NULL;
150 static uint64_t zone_rss_cap; /* RSS cap(KB) */
151 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
152 static boolean_t skip_vmusage = B_FALSE;
153 static boolean_t skip_pageout = B_FALSE;
154 static boolean_t skip_pf_throttle = B_FALSE;
155
156 static zlog_t *logp;
157
158 static int64_t check_suspend();
159 static void get_mcap_tunables();
160
161 /*
162 * Structure to hold current state about a process address space that we're
163 * working on.
164 */
165 typedef struct {
166 int pr_curr; /* the # of the mapping we're working on */
167 int pr_nmap; /* number of mappings in address space */
168 prmap_t *pr_mapp; /* process's map array */
169 } proc_map_t;
170
171 typedef struct zsd_vmusage64 {
172 id_t vmu_zoneid;
173 uint_t vmu_type;
174 id_t vmu_id;
175 /*
176 * An amd64 kernel will align the following uint64_t members, but a
177 * 32bit i386 process will not without help.
178 */
179 int vmu_align_next_members_on_8_bytes;
180 uint64_t vmu_rss_all;
181 uint64_t vmu_rss_private;
182 uint64_t vmu_rss_shared;
183 uint64_t vmu_swap_all;
184 uint64_t vmu_swap_private;
185 uint64_t vmu_swap_shared;
186 } zsd_vmusage64_t;
187
188 /*
189 * Output a debug log message.
190 */
191 /*PRINTFLIKE1*/
192 static void
193 debug(char *fmt, ...)
194 {
195 va_list ap;
196
197 if (debug_log_fp == NULL)
198 return;
199
200 va_start(ap, fmt);
201 (void) vfprintf(debug_log_fp, fmt, ap);
202 va_end(ap);
203 (void) fflush(debug_log_fp);
204 }
205
206 /*
207 * Like sleep(3C) but can be interupted by cond_signal which is posted when
208 * we're shutting down the mcap thread.
209 */
210 static void
211 sleep_shutdown(int secs)
212 {
213 timestruc_t to;
214
215 to.tv_sec = secs;
216 to.tv_nsec = 0;
217
218 (void) mutex_lock(&shutdown_mx);
219 if (!shutting_down)
220 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
221 (void) mutex_unlock(&shutdown_mx);
222 }
223
224 static boolean_t
225 proc_issystem(pid_t pid)
226 {
227 char pc_clname[PC_CLNMSZ];
228
229 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
230 PC_KY_NULL) != -1)
231 return (strcmp(pc_clname, "SYS") == 0);
232
233 return (B_TRUE);
234 }
235
236 /*
237 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
238 */
static void
run_over_cmd()
{
	int ctfd;
	int err;
	pid_t childpid;
	siginfo_t info;
	ctid_t ct;

	/*
	 * Before we enter the zone, we need to create a new process contract
	 * for the child, as required by zone_enter().
	 */
	if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
		return;
	/*
	 * Configure the template so the child's contract generates no
	 * critical/informative events and dies with its process group on
	 * hardware errors, then activate it for the upcoming fork().
	 */
	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
	    ct_tmpl_activate(ctfd) != 0) {
		(void) close(ctfd);
		return;
	}

	childpid = fork();
	switch (childpid) {
	case -1:	/* fork failed; clean up the template and give up */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		break;
	case 0:	/* Child */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		/* Enter the zone, then run the configured command there. */
		if (zone_enter(zid) == -1)
			_exit(errno);
		err = system(over_cmd);
		_exit(err);
		break;
	default:	/* Parent */
		/* Latch the child's contract id so we can abandon it below. */
		if (contract_latest(&ct) == -1)
			ct = -1;
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		/* Reap the child and abandon its contract. */
		err = waitid(P_PID, childpid, &info, WEXITED);
		(void) contract_abandon_id(ct);
		if (err == -1 || info.si_status != 0)
			debug("over_cmd failed");
		break;
	}
}
289
290 /*
291 * Get the next mapping.
292 */
293 static prmap_t *
294 nextmapping(proc_map_t *pmp)
295 {
296 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
297 return (NULL);
298
299 return (&pmp->pr_mapp[pmp->pr_curr++]);
300 }
301
302 /*
303 * Initialize the proc_map_t to access the first mapping of an address space.
304 */
static prmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
	int fd;
	int res;
	struct stat st;
	char pathbuf[MAXPATHLEN];

	bzero(pmp, sizeof (proc_map_t));
	pmp->pr_nmap = -1;	/* stays -1 if we never read a full map */

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
		return (NULL);

redo:
	errno = 0;
	if (fstat(fd, &st) != 0)
		goto done;

	if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		goto done;
	}
	(void) bzero(pmp->pr_mapp, st.st_size);

	errno = 0;
	if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
		free(pmp->pr_mapp);
		pmp->pr_mapp = NULL;
		/*
		 * A partial read (or E2BIG) means the address space grew
		 * between the fstat() and the pread(); re-stat and retry
		 * with a freshly sized buffer.  Anything else is fatal for
		 * this process and we give up on it.
		 */
		if (res > 0 || errno == E2BIG) {
			goto redo;
		} else {
			debug("pid %ld cannot read xmap\n", pid);
			goto done;
		}
	}

	pmp->pr_nmap = st.st_size / sizeof (prmap_t);

done:
	(void) close(fd);
	/* On failure pr_mapp is NULL, so nextmapping() returns NULL. */
	return (nextmapping(pmp));
}
349
350 /*
351 * Attempt to invalidate the entire mapping from within the given process's
352 * address space. May return nonzero with errno as:
353 * ESRCH - process not found
354 * ENOMEM - segment not found
355 * EINVAL - mapping exceeds a single segment
356 */
357 static int
358 pageout_mapping(pid_t pid, prmap_t *pmp)
359 {
360 int res;
361
362 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
363 return (0);
364
365 errno = 0;
366 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
367 pmp->pr_size);
368
369 return (res);
370 }
371
372 /*
373 * Work through a process paging out mappings until the whole address space was
374 * examined or the excess is < 0. Return our estimate of the updated excess.
375 */
376 static int64_t
377 pageout_process(pid_t pid, int64_t excess)
378 {
379 int psfd;
380 prmap_t *pmap;
381 proc_map_t cur;
382 int res;
383 int64_t sum_d_rss, d_rss;
384 int64_t old_rss;
385 int map_cnt;
386 psinfo_t psinfo;
387 char pathbuf[MAXPATHLEN];
388
389 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
390 pid);
391 if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
392 return (excess);
393
394 cur.pr_mapp = NULL;
395
396 if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
397 goto done;
398
399 old_rss = (int64_t)psinfo.pr_rssize;
400 map_cnt = 0;
401
402 /* If unscannable, skip it. */
403 if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
404 debug("pid %ld: system process, skipping %s\n",
405 pid, psinfo.pr_psargs);
406 goto done;
407 }
408
409 /* If tiny RSS (16KB), skip it. */
410 if (old_rss <= 16) {
411 debug("pid %ld: skipping, RSS %lldKB %s\n",
412 pid, old_rss, psinfo.pr_psargs);
413 goto done;
414 }
415
416 /* Get segment residency information. */
417 pmap = init_map(&cur, pid);
418
419 /* Skip process if it has no mappings. */
420 if (pmap == NULL) {
421 debug("pid %ld: map unreadable; ignoring\n", pid);
422 goto done;
423 }
424
425 debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
426 pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
427
428 /*
429 * Within the process's address space, attempt to page out mappings.
430 */
431 sum_d_rss = 0;
432 while (excess > 0 && pmap != NULL && !shutting_down) {
433 /* invalidate the entire mapping */
434 if ((res = pageout_mapping(pid, pmap)) < 0)
435 debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
436 pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);
437
438 map_cnt++;
439
440 /*
441 * Re-check the process rss and get the delta.
442 */
443 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
444 != sizeof (psinfo)) {
445 excess -= old_rss;
446 goto done;
447 }
448
449 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
450 old_rss = (int64_t)psinfo.pr_rssize;
451 sum_d_rss += d_rss;
452
453 /*
454 * d_rss hopefully should be negative (or 0 if nothing
455 * invalidated) but can be positive if more got paged in.
456 */
457 excess += d_rss;
458
459 if (excess <= 0) {
460 debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
461 "excess %lldKB\n", pid, map_cnt,
462 (unsigned long long)sum_d_rss, (long long)excess);
463 map_cnt = 0;
464
465 /*
466 * If we're actually under, this will suspend checking
467 * in the middle of this process's address space.
468 */
469 excess = check_suspend();
470 if (shutting_down)
471 goto done;
472
473 /*
474 * since we might have suspended, re-read process's rss
475 */
476 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
477 != sizeof (psinfo)) {
478 excess -= old_rss;
479 goto done;
480 }
481
482 old_rss = (int64_t)psinfo.pr_rssize;
483
484 debug("pid %ld: resume pageout; excess %lld\n", pid,
485 (long long)excess);
486 sum_d_rss = 0;
487 }
488
489 pmap = nextmapping(&cur);
490 }
491
492 debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
493 pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);
494
495 done:
496 if (cur.pr_mapp != NULL)
497 free(cur.pr_mapp);
498
499 (void) close(psfd);
500
501 if (shutting_down)
502 return (0);
503
504 return (excess);
505 }
506
507 /*
508 * Get the zone's RSS data.
509 */
static uint64_t
get_mem_info()
{
	uint64_t n = 1;		/* in/out: number of vmusage results */
	zsd_vmusage64_t buf;
	uint64_t tmp_rss;
	DIR *pdir = NULL;
	struct dirent *dent;

	/*
	 * Start by doing the fast, cheap RSS calculation using the rss value
	 * in psinfo_t. Because that's per-process, it can lead to double
	 * counting some memory and overestimating how much is being used, but
	 * as long as that's not over the cap, then we don't need do the
	 * expensive calculation.
	 *
	 * If we have to do the expensive calculation, we remember the scaling
	 * factor so that we can try to use that on subsequent iterations for
	 * the fast rss.
	 */
	if (shutting_down)
		return (0);

	if ((pdir = opendir(zoneproc)) == NULL)
		return (0);

	/* Sum the per-process RSS over the zone's /proc entries. */
	accurate_rss = 0;
	fast_rss = 0;
	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
		pid_t pid;
		int psfd;
		int64_t rss;
		char pathbuf[MAXPATHLEN];
		psinfo_t psinfo;

		if (strcmp(".", dent->d_name) == 0 ||
		    strcmp("..", dent->d_name) == 0)
			continue;

		pid = atoi(dent->d_name);
		if (pid == 0 || pid == 1)
			continue;

		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
		    zoneproc, pid);

		/* A process that vanished mid-scan just contributes 0. */
		rss = 0;
		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
			    sizeof (psinfo))
				rss = (int64_t)psinfo.pr_rssize;

			(void) close(psfd);
		}

		fast_rss += rss;
	}

	(void) closedir(pdir);

	if (shutting_down)
		return (0);

	debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
	    scale_rss, prev_fast_rss);

	/* see if we can get by with a scaled fast rss */
	tmp_rss = fast_rss;
	if (scale_rss > 1 && prev_fast_rss > 0) {
		/*
		 * Only scale the fast value if it hasn't ballooned too much
		 * to trust.
		 */
		if (fast_rss / prev_fast_rss < 2) {
			fast_rss /= scale_rss;
			debug("scaled fast rss: %lluKB\n", fast_rss);
		}
	}

	/*
	 * If even the (possibly scaled) overestimate is under the cap, or
	 * vm_getusage is disabled by the tunable, we're done; just publish
	 * the approximation to the kernel's RSS kstat.
	 */
	if (fast_rss <= zone_rss_cap || skip_vmusage) {
		uint64_t zone_rss_bytes;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
		return (fast_rss);
	}

	buf.vmu_id = zid;

	/* get accurate usage (cached data may be up to 5 seconds old) */
	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
	    (uintptr_t)&buf, (uintptr_t)&n) != 0) {
		debug("vmusage failed\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (n > 1) {
		/* This should never happen */
		debug("vmusage returned more than one result\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (buf.vmu_id != zid) {
		/* This should never happen */
		debug("vmusage returned the incorrect zone\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	accurate_rss = buf.vmu_rss_all / 1024;	/* bytes -> KB */

	/* calculate scaling factor to use for fast_rss from now on */
	if (accurate_rss > 0) {
		scale_rss = fast_rss / accurate_rss;
		debug("new scaling factor: %llu\n", scale_rss);
		/* remember the fast rss when we had to get the accurate rss */
		prev_fast_rss = tmp_rss;
	}

	debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
	    scale_rss, prev_fast_rss);
	return (accurate_rss);
}
636
637 /*
638 * Needed to read the zones physical-memory-cap rctl.
639 */
640 static struct ps_prochandle *
641 grab_zone_proc()
642 {
643 DIR *dirp;
644 struct dirent *dentp;
645 struct ps_prochandle *ph = NULL;
646 int tmp;
647
648 if ((dirp = opendir(zoneproc)) == NULL)
649 return (NULL);
650
651 while (!shutting_down && (dentp = readdir(dirp))) {
652 int pid;
653
654 if (strcmp(".", dentp->d_name) == 0 ||
655 strcmp("..", dentp->d_name) == 0)
656 continue;
657
658 pid = atoi(dentp->d_name);
659 /* attempt to grab process */
660 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
661 if (Psetflags(ph, PR_RLC) == 0) {
662 if (Pcreate_agent(ph) == 0) {
663 (void) closedir(dirp);
664 return (ph);
665 }
666 }
667 Prelease(ph, 0);
668 }
669 }
670
671 (void) closedir(dirp);
672 return (NULL);
673 }
674
675 static uint64_t
676 get_zone_cap()
677 {
678 rctlblk_t *rblk;
679 uint64_t mcap;
680 struct ps_prochandle *ph;
681
682 if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
683 return (UINT64_MAX);
684
685 if ((ph = grab_zone_proc()) == NULL) {
686 free(rblk);
687 return (UINT64_MAX);
688 }
689
690 if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
691 RCTL_FIRST)) {
692 Pdestroy_agent(ph);
693 Prelease(ph, 0);
694 free(rblk);
695 return (UINT64_MAX);
696 }
697
698 Pdestroy_agent(ph);
699 Prelease(ph, 0);
700
701 mcap = rctlblk_get_value(rblk);
702 free(rblk);
703 return (mcap);
704 }
705
706 /*
707 * check_suspend is invoked at the beginning of every pass through the process
708 * list or after we've paged out enough so that we think the excess is under
709 * the cap. The purpose is to periodically check the zone's rss and return
710 * the excess when the zone is over the cap. The rest of the time this
711 * function will sleep, periodically waking up to check the current rss.
712 *
713 * Depending on the percentage of penetration of the zone's rss into the
714 * cap we sleep for longer or shorter amounts. This reduces the impact of this
715 * work on the system, which is important considering that each zone will be
716 * monitoring its rss.
717 */
static int64_t
check_suspend()
{
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;		/* extra 5% to page out when over */
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS(KB) */

		/*
		 * Check if the debug log file exists and enable or disable
		 * debug accordingly (touching/removing the file toggles it).
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			/* Convert the rctl value (bytes) to KB. */
			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			/*
			 * We allow the memory cap tunables to be changed on
			 * the fly.
			 */
			get_mcap_tunables();

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info();

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If necessary, update stats.
		 */

		/*
		 * If it looks like we did some paging out since last over the
		 * cap then update the kstat so we can approximate how much was
		 * paged out.
		 */
		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			uint64_t n = 1;

			/* Increment "nover" kstat. */
			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

			if (!skip_pf_throttle) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up. If we are persistently over the
				 * cap the delay ramps up to a max of 2000usecs.
				 * Note that for delays less than 1 tick
				 * (i.e. all of these) we busy-wait in as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
				    &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, then we want to
			 * page out a little extra instead of stopping
			 * right at the cap. To do this we add 5% to
			 * the excess so that pageout_process will work
			 * a little longer before stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage. Also, scale the acceptable age of
		 * cached results from vm_getusage. We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		/* Sleep longer the further we are under the cap. */
		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
		} else {
			sleep_time = 30;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
	}

	return (0);
}
897
898 static void
899 get_mcap_tunables()
900 {
901 zone_dochandle_t handle;
902 struct zone_attrtab attr;
903
904 over_cmd[0] = '\0';
905 if ((handle = zonecfg_init_handle()) == NULL)
906 return;
907
908 if (zonecfg_get_handle(zone_name, handle) != Z_OK)
909 goto done;
910
911 /* Reset to defaults in case rebooting and settings have changed */
912 over_cmd[0] = '\0';
913 skip_vmusage = B_FALSE;
914 skip_pageout = B_FALSE;
915 skip_pf_throttle = B_FALSE;
916
917 if (zonecfg_setattrent(handle) != Z_OK)
918 goto done;
919 while (zonecfg_getattrent(handle, &attr) == Z_OK) {
920 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
921 (void) strlcpy(over_cmd, attr.zone_attr_value,
922 sizeof (over_cmd));
923 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
924 if (strcmp("true", attr.zone_attr_value) == 0)
925 skip_vmusage = B_TRUE;
926 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
927 if (strcmp("true", attr.zone_attr_value) == 0)
928 skip_pageout = B_TRUE;
929 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
930 if (strcmp("true", attr.zone_attr_value) == 0)
931 skip_pf_throttle = B_TRUE;
932 }
933 }
934 (void) zonecfg_endattrent(handle);
935
936 done:
937 zonecfg_fini_handle(handle);
938 }
939
940 /* ARGSUSED */
941 static int
942 chk_proc_fs(void *data, const char *spec, const char *dir,
943 const char *fstype, const char *opt)
944 {
945 if (fstype != NULL && strcmp(fstype, "proc") == 0)
946 *((boolean_t *)data) = B_TRUE;
947
948 return (0);
949 }
950
951 static boolean_t
952 has_proc()
953 {
954 brand_handle_t bh;
955 boolean_t fnd = B_FALSE;
956
957 if ((bh = brand_open(brand_name)) != NULL) {
958 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
959 }
960
961 brand_close(bh);
962 return (fnd);
963 }
964
965 /*
966 * We run this loop for brands with no /proc to simply update the RSS, using
967 * the cheap GZ /proc data, every 5 minutes.
968 */
static void
no_procfs()
{
	DIR *pdir = NULL;
	struct dirent *dent;
	uint64_t zone_rss_bytes;

	/* Let the zone finish booting before the first scan. */
	(void) sleep_shutdown(30);
	while (!shutting_down) {
		/*
		 * Just do the fast, cheap RSS calculation using the rss value
		 * in psinfo_t. Because that's per-process, it can lead to
		 * double counting some memory and overestimating how much is
		 * being used. Since there is no /proc in the zone, we use the
		 * GZ /proc and check for the correct zone.
		 */
		if ((pdir = opendir("/proc")) == NULL)
			return;

		fast_rss = 0;
		while (!shutting_down && (dent = readdir(pdir)) != NULL) {
			pid_t pid;
			int psfd;
			int64_t rss;
			char pathbuf[MAXPATHLEN];
			psinfo_t psinfo;

			if (strcmp(".", dent->d_name) == 0 ||
			    strcmp("..", dent->d_name) == 0)
				continue;

			pid = atoi(dent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			(void) snprintf(pathbuf, sizeof (pathbuf),
			    "/proc/%d/psinfo", pid);

			/* A process that vanished mid-scan contributes 0. */
			rss = 0;
			if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
				if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
				    sizeof (psinfo)) {
					/* Only count our zone's processes. */
					if (psinfo.pr_zoneid == zid)
						rss = (int64_t)psinfo.pr_rssize;
				}

				(void) close(psfd);
			}

			fast_rss += rss;
		}

		(void) closedir(pdir);

		if (shutting_down)
			return;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);

		/* Refresh every 5 minutes. */
		(void) sleep_shutdown(300);
	}
}
1033
1034 /*
1035 * Thread that checks zone's memory usage and when over the cap, goes through
1036 * the zone's process list trying to pageout processes to get under the cap.
1037 */
static void
mcap_zone()
{
	DIR *pdir = NULL;
	int64_t excess;		/* KB over the cap we still need to page out */

	debug("thread startup\n");

	get_mcap_tunables();

	/*
	 * If the zone has no /proc filesystem, we can't use the fast algorithm
	 * to check RSS or pageout any processes. All we can do is periodically
	 * update its RSS kstat using the expensive syscall.
	 */
	if (!has_proc()) {
		no_procfs();
		debug("thread shutdown\n");
		return;
	}

	/*
	 * When first starting it is likely lots of other zones are starting
	 * too because the system is booting. Since we just started the zone
	 * we're not worried about being over the cap right away, so we let
	 * things settle a bit and tolerate some older data here to minimize
	 * the load on the system.
	 */
	(void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */

	/* Wait until zone's /proc is mounted */
	while (!shutting_down) {
		struct stat st;

		if (stat(zoneproc, &st) == 0 &&
		    strcmp(st.st_fstype, "proc") == 0)
			break;
		sleep_shutdown(5);
	}

	/* Open zone's /proc and walk entries. */
	while (!shutting_down) {
		if ((pdir = opendir(zoneproc)) != NULL)
			break;
		sleep_shutdown(5);
	}

	/*
	 * Main loop: block in check_suspend() until over the cap, then make
	 * a pass over the zone's processes paging them out until the excess
	 * is gone.
	 */
	while (!shutting_down) {
		struct dirent *dirent;

		/* Wait until we've gone over the cap. */
		excess = check_suspend();

		debug("starting to scan, excess %lldk\n", (long long)excess);

		/*
		 * If a phys-mcap-cmd is configured, run it first; it may
		 * free enough memory that no pageout pass is needed.
		 */
		if (over_cmd[0] != '\0') {
			uint64_t zone_rss;	/* total RSS(KB) */

			debug("run phys_mcap_cmd: %s\n", over_cmd);
			run_over_cmd();

			zone_rss = get_mem_info();
			excess = zone_rss - zone_rss_cap;
			debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
			    zone_rss, zone_rss_cap, excess);
			if (excess <= 0)
				continue;
		}

		while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
			pid_t pid;

			if (strcmp(".", dirent->d_name) == 0 ||
			    strcmp("..", dirent->d_name) == 0)
				continue;

			pid = atoi(dirent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			/* With pageout disabled we just idle through. */
			if (skip_pageout)
				(void) sleep_shutdown(2);
			else
				excess = pageout_process(pid, excess);

			if (excess <= 0) {
				debug("apparently under; excess %lld\n",
				    (long long)excess);
				/* Double check the current excess */
				excess = check_suspend();
			}
		}

		debug("process pass done; excess %lld\n", (long long)excess);
		rewinddir(pdir);

		if (skip_pageout)
			(void) sleep_shutdown(120);
	}

	if (pdir != NULL)
		(void) closedir(pdir);
	debug("thread shutdown\n");
}
1142
1143 void
1144 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1145 {
1146 int res;
1147
1148 shutting_down = 0;
1149 zid = id;
1150 logp = zlogp;
1151
1152 /* all but the lx brand currently use /proc */
1153 if (strcmp(brand_name, "lx") == 0) {
1154 (void) snprintf(zoneproc, sizeof (zoneproc),
1155 "%s/root/native/proc", zonepath);
1156 } else {
1157 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1158 zonepath);
1159 }
1160
1161 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1162 zonepath);
1163
1164 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1165 &mcap_tid);
1166 if (res != 0) {
1167 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1168 res);
1169 mcap_tid = 0;
1170 }
1171 }
1172
1173 void
1174 destroy_mcap_thread()
1175 {
1176 if (mcap_tid != 0) {
1177 shutting_down = 1;
1178 (void) cond_signal(&shutdown_cv);
1179 (void) thr_join(mcap_tid, NULL, NULL);
1180 mcap_tid = 0;
1181 }
1182 }