io-lx-public-vs-joyent Old usr/src/cmd/zoneadmd/mcap.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  23  * Copyright 2014, Joyent, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  * This file implements the code which runs a thread inside zoneadmd to cap
  28  * the associated zone's physical memory.  A thread to do this is started
  29  * when the zone boots and is halted when the zone shuts down.
  30  *
  31  * Because of the way that the VM system is currently implemented, there is no
  32  * way to go from the bottom up (page to process to zone).  Thus, there is no
  33  * obvious way to hook an rctl into the kernel's paging code to enforce a hard
  34  * memory cap.  Instead, we implement a soft physical memory cap which looks
  35  * at the zone's overall rss and once it is over the cap, works from the top
  36  * down (zone to process to page), looking at zone processes, to determine
  37  * what to try to pageout to get the zone under its memory cap.
  38  *
  39  * The code uses the fast, cheap, but potentially very inaccurate sum of the
  40  * rss values from psinfo_t to first approximate the zone's rss and will
  41  * fall back to the vm_getusage syscall to determine the zone's rss if needed.
  42  * It then checks the rss against the zone's zone.max-physical-memory rctl.
  43  * Once the zone goes over its cap, this thread will work through the
  44  * zone's /proc process list, reading each process's map file and asking
  45  * the kernel (rusagesys _RUSAGESYS_INVALMAP) to invalidate each mapping so
  46  * that its pages get paged out, until the zone is again under its cap.
  47  *
  48  * Although zone memory capping is implemented as a soft cap by this user-level
  49  * thread, the interfaces around memory caps that are exposed to the user are
  50  * the standard ones; an rctl and kstats.  This thread uses the rctl value
  51  * to obtain the cap and works with the zone kernel code to update the kstats.
  52  * If the implementation ever moves into the kernel, these exposed interfaces
  53  * do not need to change.
  54  *
  55  * The thread adaptively sleeps, periodically checking the state of the
  56  * zone.  As the zone's rss gets closer to the cap, the thread will wake up
  57  * more often to check the zone's status.  Once the zone is over the cap,
  58  * the thread will work to pageout until the zone is under the cap, as shown
  59  * by updated vm_usage data.
  60  *
  61  * NOTE: The pagedata page maps (at least on x86) are not useful.  Those flags
  62  * are set by hrm_setbits() and on x86 that code path is only executed by
  63  *     segvn_pagelock -> hat_setstat -> hrm_setbits
  64  *     segvn_softunlock -^
  65  * On SPARC there is an additional code path which may make this data
  66  * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
  67  * maps.  If we ever fix this issue, then we could generalize this mcap code to
  68  * do more with the data on active pages.
  69  *
  70  * For debugging, touch the file {zonepath}/mcap_debug.log.  This will
  71  * cause the thread to start logging its actions into that file (it may take
  72  * a minute or two if the thread is currently sleeping).  Removing that
  73  * file will cause logging to stop.
  74  */
  75 
  76 #include <sys/mman.h>
  77 #include <sys/param.h>
  78 #include <sys/stat.h>
  79 #include <sys/types.h>
  80 #include <assert.h>
  81 #include <errno.h>
  82 #include <fcntl.h>
  83 #include <libproc.h>
  84 #include <limits.h>
  85 #include <procfs.h>
  86 #include <stdio.h>
  87 #include <stdlib.h>
  88 #include <strings.h>
  89 #include <time.h>
  90 #include <unistd.h>
  91 #include <sys/priocntl.h>
  92 #include <dirent.h>
  93 #include <zone.h>
  94 #include <libzonecfg.h>
  95 #include <thread.h>
  96 #include <values.h>
  97 #include <sys/vm_usage.h>
  98 #include <sys/resource.h>
  99 #include <sys/debug.h>
 100 #include <synch.h>
 101 #include <wait.h>
 102 #include <libcontract.h>
 103 #include <libcontract_priv.h>
 104 #include <sys/contract/process.h>
 105 #include "zoneadmd.h"
 106 
 107 /* Round x up to the next multiple of y; y must be a power of two (2^n). */
 108 #define ROUNDUP(x, y)   (((x) + ((y) - 1)) & ~((y) - 1))
 109 
 110 #define CAP_REFRESH     ((uint64_t)300 * NANOSEC) /* cap re-read interval in ns: every 5 minutes */
 111 
 112 /*
 113  * zonecfg attribute tunables for memory capping.
 114  *    phys-mcap-cmd
 115  *      type: string
 116  *      specifies a command that can be run (inside the zone) when over the cap
 117  *    phys-mcap-no-vmusage
 118  *      type: boolean
 119  *      true disables vm_getusage; only the sum of per-process rss is used
 120  *    phys-mcap-no-pageout
 121  *      type: boolean
 122  *      true disables pageout when over the cap
 123  *    phys-mcap-no-pf-throttle
 124  *      type: boolean
 125  *      true disables page fault throttling when over the cap
 126  */
 127 #define TUNE_CMD        "phys-mcap-cmd"
 128 #define TUNE_NVMU       "phys-mcap-no-vmusage"
 129 #define TUNE_NPAGE      "phys-mcap-no-pageout"
 130 #define TUNE_NPFTHROT   "phys-mcap-no-pf-throttle"
 131 
 132 /*
 133  * RSS bookkeeping for get_mem_info().  scale_rss and prev_fast_rss must
 134  * persist between calls; fast_rss and accurate_rss are recomputed on every
 135  * call and are global only so that they are easy to inspect with mdb.
 136  */
 137 uint64_t        scale_rss = 0;          /* ratio of fast rss to accurate rss */
 138 uint64_t        prev_fast_rss = 0;      /* unscaled fast rss (KB) when last scaled */
 139 uint64_t        fast_rss = 0;           /* sum of per-process pr_rssize (KB) */
 140 uint64_t        accurate_rss = 0;       /* rss reported by vmusage (KB) */
 141 
 142 static char     zoneproc[MAXPATHLEN];   /* path of the zone's /proc directory */
 143 static char     debug_log[MAXPATHLEN];  /* debug log path; see check_suspend() */
 144 static zoneid_t zid;                    /* zone passed to zone_enter/zone_setattr */
 145 static mutex_t  shutdown_mx;            /* held around waits on shutdown_cv */
 146 static cond_t   shutdown_cv;            /* wakes sleep_shutdown() early */
 147 static int      shutting_down = 0;      /* nonzero makes the loops bail out */
 148 static thread_t mcap_tid;               /* presumably the mcap thread's id - confirm */
 149 static FILE     *debug_log_fp = NULL;   /* NULL while debug logging is off */
 150 static uint64_t zone_rss_cap;           /* RSS cap(KB) */
 151 static char     over_cmd[2 * BUFSIZ];   /* same size as zone_attr_value */
 152 static boolean_t skip_vmusage = B_FALSE;        /* see TUNE_NVMU */
 153 static boolean_t skip_pageout = B_FALSE;        /* see TUNE_NPAGE */
 154 static boolean_t skip_pf_throttle = B_FALSE;    /* see TUNE_NPFTHROT */
 155 
 156 static zlog_t   *logp;
 157 
 158 static int64_t check_suspend();         /* called from pageout_process() */
 159 static void get_mcap_tunables();        /* called from check_suspend() */
 160 
 161 /*
 162  * Cursor over one process's address space mappings: the prmap_t array read
 163  * from the process's map file plus the index of the next entry to hand out.
 164  */
 165 typedef struct {
 166         int pr_curr;            /* index of the next mapping nextmapping() returns */
 167         int pr_nmap;            /* number of mappings; -1 until the map is read */
 168         prmap_t *pr_mapp;       /* malloc'd map array, or NULL if none */
 169 } proc_map_t;
 170 
 171 typedef struct zsd_vmusage64 {  /* result buffer for _RUSAGESYS_GETVMUSAGE */
 172         id_t vmu_zoneid;
 173         uint_t vmu_type;
 174         id_t vmu_id;            /* get_mem_info() expects this to equal zid */
 175         /*
 176          * An amd64 kernel will align the following uint64_t members, but a
 177          * 32bit i386 process will not without help.
 178          */
 179         int vmu_align_next_members_on_8_bytes;  /* explicit padding only */
 180         uint64_t vmu_rss_all;   /* get_mem_info() divides this by 1024 to get KB */
 181         uint64_t vmu_rss_private;
 182         uint64_t vmu_rss_shared;
 183         uint64_t vmu_swap_all;
 184         uint64_t vmu_swap_private;
 185         uint64_t vmu_swap_shared;
 186 } zsd_vmusage64_t;
 187 
 188 /*
 189  * Output a debug log message.
 190  */
 191 /*PRINTFLIKE1*/
 192 static void
 193 debug(char *fmt, ...)
 194 {
 195         va_list ap;
 196 
 197         if (debug_log_fp == NULL)
 198                 return;
 199 
 200         va_start(ap, fmt);
 201         (void) vfprintf(debug_log_fp, fmt, ap);
 202         va_end(ap);
 203         (void) fflush(debug_log_fp);
 204 }
 205 
 206 /*
 207  * Like sleep(3C) but can be interupted by cond_signal which is posted when
 208  * we're shutting down the mcap thread.
 209  */
 210 static void
 211 sleep_shutdown(int secs)
 212 {
 213         timestruc_t to;
 214 
 215         to.tv_sec = secs;
 216         to.tv_nsec = 0;
 217 
 218         (void) mutex_lock(&shutdown_mx);
 219         if (!shutting_down)
 220                 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
 221         (void) mutex_unlock(&shutdown_mx);
 222 }
 223 
 224 static boolean_t
 225 proc_issystem(pid_t pid)
 226 {
 227         char pc_clname[PC_CLNMSZ];
 228 
 229         if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
 230             PC_KY_NULL) != -1)
 231                 return (strcmp(pc_clname, "SYS") == 0);
 232 
 233         return (B_TRUE);
 234 }
 235 
 236 /*
 237  * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 238  */
 239 static void
 240 run_over_cmd()
 241 {
 242         int             ctfd;
 243         int             err;
 244         pid_t           childpid;
 245         siginfo_t       info;
 246         ctid_t          ct;
 247 
 248         /*
 249          * Before we enter the zone, we need to create a new process contract
 250          * for the child, as required by zone_enter().
 251          */
 252         if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
 253                 return;
 254         if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
 255             ct_tmpl_set_informative(ctfd, 0) != 0 ||
 256             ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
 257             ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
 258             ct_tmpl_activate(ctfd) != 0) {
 259                 (void) close(ctfd);
 260                 return;
 261         }
 262 
 263         childpid = fork();
 264         switch (childpid) {
 265         case -1:
 266                 (void) ct_tmpl_clear(ctfd);
 267                 (void) close(ctfd);
 268                 break;
 269         case 0: /* Child */
 270                 (void) ct_tmpl_clear(ctfd);
 271                 (void) close(ctfd);
 272                 if (zone_enter(zid) == -1)
 273                         _exit(errno);
 274                 err = system(over_cmd);
 275                 _exit(err);
 276                 break;
 277         default:        /* Parent */
 278                 if (contract_latest(&ct) == -1)
 279                         ct = -1;
 280                 (void) ct_tmpl_clear(ctfd);
 281                 (void) close(ctfd);
 282                 err = waitid(P_PID, childpid, &info, WEXITED);
 283                 (void) contract_abandon_id(ct);
 284                 if (err == -1 || info.si_status != 0)
 285                         debug("over_cmd failed");
 286                 break;
 287         }
 288 }
 289 
 290 /*
 291  * Get the next mapping.
 292  */
 293 static prmap_t *
 294 nextmapping(proc_map_t *pmp)
 295 {
 296         if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
 297                 return (NULL);
 298 
 299         return (&pmp->pr_mapp[pmp->pr_curr++]);
 300 }
 301 
 302 /*
 303  * Initialize the proc_map_t to access the first mapping of an address space.
 304  */
 305 static prmap_t *
 306 init_map(proc_map_t *pmp, pid_t pid)
 307 {
 308         int fd;
 309         int res;
 310         struct stat st;
 311         char pathbuf[MAXPATHLEN];
 312 
 313         bzero(pmp, sizeof (proc_map_t));
 314         pmp->pr_nmap = -1;
 315 
 316         (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
 317         if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
 318                 return (NULL);
 319 
 320 redo:
 321         errno = 0;
 322         if (fstat(fd, &st) != 0)
 323                 goto done;
 324 
 325         if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
 326                 debug("cannot malloc() %ld bytes for xmap", st.st_size);
 327                 goto done;
 328         }
 329         (void) bzero(pmp->pr_mapp, st.st_size);
 330 
 331         errno = 0;
 332         if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
 333                 free(pmp->pr_mapp);
 334                 pmp->pr_mapp = NULL;
 335                 if (res > 0 || errno == E2BIG) {
 336                         goto redo;
 337                 } else {
 338                         debug("pid %ld cannot read xmap\n", pid);
 339                         goto done;
 340                 }
 341         }
 342 
 343         pmp->pr_nmap = st.st_size / sizeof (prmap_t);
 344 
 345 done:
 346         (void) close(fd);
 347         return (nextmapping(pmp));
 348 }
 349 
 350 /*
 351  * Attempt to invalidate the entire mapping from within the given process's
 352  * address space. May return nonzero with errno as:
 353  *    ESRCH  - process not found
 354  *    ENOMEM - segment not found
 355  *    EINVAL - mapping exceeds a single segment
 356  */
 357 static int
 358 pageout_mapping(pid_t pid, prmap_t *pmp)
 359 {
 360         int res;
 361 
 362         if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
 363                 return (0);
 364 
 365         errno = 0;
 366         res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
 367             pmp->pr_size);
 368 
 369         return (res);
 370 }
 371 
 372 /*
 373  * Work through a process paging out mappings until the whole address space was
 374  * examined or the excess is < 0.  Return our estimate of the updated excess.
 375  */
 376 static int64_t
 377 pageout_process(pid_t pid, int64_t excess)
 378 {
 379         int                     psfd;
 380         prmap_t                 *pmap;
 381         proc_map_t              cur;
 382         int                     res;
 383         int64_t                 sum_d_rss, d_rss;
 384         int64_t                 old_rss;
 385         int                     map_cnt;
 386         psinfo_t                psinfo;
 387         char                    pathbuf[MAXPATHLEN];
 388 
 389         (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
 390             pid);
 391         if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
 392                 return (excess);
 393 
 394         cur.pr_mapp = NULL;
 395 
 396         if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
 397                 goto done;
 398 
 399         old_rss = (int64_t)psinfo.pr_rssize;
 400         map_cnt = 0;
 401 
 402         /* If unscannable, skip it. */
 403         if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
 404                 debug("pid %ld: system process, skipping %s\n",
 405                     pid, psinfo.pr_psargs);
 406                 goto done;
 407         }
 408 
 409         /* If tiny RSS (16KB), skip it. */
 410         if (old_rss <= 16) {
 411                 debug("pid %ld: skipping, RSS %lldKB %s\n",
 412                     pid, old_rss, psinfo.pr_psargs);
 413                 goto done;
 414         }
 415 
 416         /* Get segment residency information. */
 417         pmap = init_map(&cur, pid);
 418 
 419         /* Skip process if it has no mappings. */
 420         if (pmap == NULL) {
 421                 debug("pid %ld: map unreadable; ignoring\n", pid);
 422                 goto done;
 423         }
 424 
 425         debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
 426             pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
 427 
 428         /*
 429          * Within the process's address space, attempt to page out mappings.
 430          */
 431         sum_d_rss = 0;
 432         while (excess > 0 && pmap != NULL && !shutting_down) {
 433                 /* invalidate the entire mapping */
 434                 if ((res = pageout_mapping(pid, pmap)) < 0)
 435                         debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
 436                             pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);
 437 
 438                 map_cnt++;
 439 
 440                 /*
 441                  * Re-check the process rss and get the delta.
 442                  */
 443                 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
 444                     != sizeof (psinfo)) {
 445                         excess -= old_rss;
 446                         goto done;
 447                 }
 448 
 449                 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
 450                 old_rss = (int64_t)psinfo.pr_rssize;
 451                 sum_d_rss += d_rss;
 452 
 453                 /*
 454                  * d_rss hopefully should be negative (or 0 if nothing
 455                  * invalidated) but can be positive if more got paged in.
 456                  */
 457                 excess += d_rss;
 458 
 459                 if (excess <= 0) {
 460                         debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
 461                             "excess %lldKB\n", pid, map_cnt,
 462                             (unsigned long long)sum_d_rss, (long long)excess);
 463                         map_cnt = 0;
 464 
 465                         /*
 466                          * If we're actually under, this will suspend checking
 467                          * in the middle of this process's address space.
 468                          */
 469                         excess = check_suspend();
 470                         if (shutting_down)
 471                                 goto done;
 472 
 473                         /*
 474                          * since we might have suspended, re-read process's rss
 475                          */
 476                         if (pread(psfd, &psinfo, sizeof (psinfo), 0)
 477                             != sizeof (psinfo)) {
 478                                 excess -= old_rss;
 479                                 goto done;
 480                         }
 481 
 482                         old_rss = (int64_t)psinfo.pr_rssize;
 483 
 484                         debug("pid %ld: resume pageout; excess %lld\n", pid,
 485                             (long long)excess);
 486                         sum_d_rss = 0;
 487                 }
 488 
 489                 pmap = nextmapping(&cur);
 490         }
 491 
 492         debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
 493             pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);
 494 
 495 done:
 496         if (cur.pr_mapp != NULL)
 497                 free(cur.pr_mapp);
 498 
 499         (void) close(psfd);
 500 
 501         if (shutting_down)
 502                 return (0);
 503 
 504         return (excess);
 505 }
 506 
 507 /*
 508  * Get the zone's RSS data.
 509  */
 510 static uint64_t
 511 get_mem_info()
 512 {
 513         uint64_t                n = 1;
 514         zsd_vmusage64_t         buf;
 515         uint64_t                tmp_rss;
 516         DIR                     *pdir = NULL;
 517         struct dirent           *dent;
 518 
 519         /*
 520          * Start by doing the fast, cheap RSS calculation using the rss value
 521          * in psinfo_t.  Because that's per-process, it can lead to double
 522          * counting some memory and overestimating how much is being used, but
 523          * as long as that's not over the cap, then we don't need do the
 524          * expensive calculation.
 525          *
 526          * If we have to do the expensive calculation, we remember the scaling
 527          * factor so that we can try to use that on subsequent iterations for
 528          * the fast rss.
 529          */
 530         if (shutting_down)
 531                 return (0);
 532 
 533         if ((pdir = opendir(zoneproc)) == NULL)
 534                 return (0);
 535 
 536         accurate_rss = 0;
 537         fast_rss = 0;
 538         while (!shutting_down && (dent = readdir(pdir)) != NULL) {
 539                 pid_t           pid;
 540                 int             psfd;
 541                 int64_t         rss;
 542                 char            pathbuf[MAXPATHLEN];
 543                 psinfo_t        psinfo;
 544 
 545                 if (strcmp(".", dent->d_name) == 0 ||
 546                     strcmp("..", dent->d_name) == 0)
 547                         continue;
 548 
 549                 pid = atoi(dent->d_name);
 550                 if (pid == 0 || pid == 1)
 551                         continue;
 552 
 553                 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
 554                     zoneproc, pid);
 555 
 556                 rss = 0;
 557                 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
 558                         if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
 559                             sizeof (psinfo))
 560                                 rss = (int64_t)psinfo.pr_rssize;
 561 
 562                         (void) close(psfd);
 563                 }
 564 
 565                 fast_rss += rss;
 566         }
 567 
 568         (void) closedir(pdir);
 569 
 570         if (shutting_down)
 571                 return (0);
 572 
 573         debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
 574             scale_rss, prev_fast_rss);
 575 
 576         /* see if we can get by with a scaled fast rss */
 577         tmp_rss = fast_rss;
 578         if (scale_rss > 1 && prev_fast_rss > 0) {
 579                 /*
 580                  * Only scale the fast value if it hasn't ballooned too much
 581                  * to trust.
 582                  */
 583                 if (fast_rss / prev_fast_rss < 2) {
 584                         fast_rss /= scale_rss;
 585                         debug("scaled fast rss: %lluKB\n", fast_rss);
 586                 }
 587         }
 588 
 589         if (fast_rss <= zone_rss_cap || skip_vmusage) {
 590                 uint64_t zone_rss_bytes;
 591 
 592                 zone_rss_bytes = fast_rss * 1024;
 593                 /* Use the zone's approx. RSS in the kernel */
 594                 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
 595                 return (fast_rss);
 596         }
 597 
 598         buf.vmu_id = zid;
 599 
 600         /* get accurate usage (cached data may be up to 5 seconds old) */
 601         if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
 602             (uintptr_t)&buf, (uintptr_t)&n) != 0) {
 603                 debug("vmusage failed\n");
 604                 (void) sleep_shutdown(1);
 605                 return (0);
 606         }
 607 
 608         if (n > 1) {
 609                 /* This should never happen */
 610                 debug("vmusage returned more than one result\n");
 611                 (void) sleep_shutdown(1);
 612                 return (0);
 613         }
 614 
 615         if (buf.vmu_id != zid) {
 616                 /* This should never happen */
 617                 debug("vmusage returned the incorrect zone\n");
 618                 (void) sleep_shutdown(1);
 619                 return (0);
 620         }
 621 
 622         accurate_rss = buf.vmu_rss_all / 1024;
 623 
 624         /* calculate scaling factor to use for fast_rss from now on */
 625         if (accurate_rss > 0) {
 626                 scale_rss = fast_rss / accurate_rss;
 627                 debug("new scaling factor: %llu\n", scale_rss);
 628                 /* remember the fast rss when we had to get the accurate rss */
 629                 prev_fast_rss = tmp_rss;
 630         }
 631 
 632         debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
 633             scale_rss, prev_fast_rss);
 634         return (accurate_rss);
 635 }
 636 
 637 /*
 638  * Needed to read the zones physical-memory-cap rctl.
 639  */
 640 static struct ps_prochandle *
 641 grab_zone_proc()
 642 {
 643         DIR *dirp;
 644         struct dirent *dentp;
 645         struct ps_prochandle *ph = NULL;
 646         int tmp;
 647 
 648         if ((dirp = opendir(zoneproc)) == NULL)
 649                 return (NULL);
 650 
 651         while (!shutting_down && (dentp = readdir(dirp))) {
 652                 int pid;
 653 
 654                 if (strcmp(".", dentp->d_name) == 0 ||
 655                     strcmp("..", dentp->d_name) == 0)
 656                         continue;
 657 
 658                 pid = atoi(dentp->d_name);
 659                 /* attempt to grab process */
 660                 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
 661                         if (Psetflags(ph, PR_RLC) == 0) {
 662                                 if (Pcreate_agent(ph) == 0) {
 663                                         (void) closedir(dirp);
 664                                         return (ph);
 665                                 }
 666                         }
 667                         Prelease(ph, 0);
 668                 }
 669         }
 670 
 671         (void) closedir(dirp);
 672         return (NULL);
 673 }
 674 
 675 static uint64_t
 676 get_zone_cap()
 677 {
 678         rctlblk_t *rblk;
 679         uint64_t mcap;
 680         struct ps_prochandle *ph;
 681 
 682         if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
 683                 return (UINT64_MAX);
 684 
 685         if ((ph = grab_zone_proc()) == NULL) {
 686                 free(rblk);
 687                 return (UINT64_MAX);
 688         }
 689 
 690         if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
 691             RCTL_FIRST)) {
 692                 Pdestroy_agent(ph);
 693                 Prelease(ph, 0);
 694                 free(rblk);
 695                 return (UINT64_MAX);
 696         }
 697 
 698         Pdestroy_agent(ph);
 699         Prelease(ph, 0);
 700 
 701         mcap = rctlblk_get_value(rblk);
 702         free(rblk);
 703         return (mcap);
 704 }
 705 
 706 /*
 707  * check_suspend is invoked at the beginning of every pass through the process
 708  * list or after we've paged out enough so that we think the excess is under
 709  * the cap.  The purpose is to periodically check the zone's rss and return
 710  * the excess when the zone is over the cap.  The rest of the time this
 711  * function will sleep, periodically waking up to check the current rss.
 712  *
 713  * Depending on the percentage of penetration of the zone's rss into the
 714  * cap we sleep for longer or shorter amounts. This reduces the impact of this
 715  * work on the system, which is important considering that each zone will be
 716  * monitoring its rss.
 717  */
 718 static int64_t
 719 check_suspend()
 720 {
 721         static hrtime_t last_cap_read = 0;
 722         static uint64_t addon;
 723         static uint64_t lo_thresh;      /* Thresholds for how long to  sleep */
 724         static uint64_t hi_thresh;      /* when under the cap (80% & 90%). */
 725         static uint64_t prev_zone_rss = 0;
 726         static uint32_t pfdelay = 0;    /* usec page fault delay when over */
 727 
 728         /* Wait a second to give the async pageout a chance to catch up. */
 729         (void) sleep_shutdown(1);
 730 
 731         while (!shutting_down) {
 732                 int64_t new_excess;
 733                 int sleep_time;
 734                 hrtime_t now;
 735                 struct stat st;
 736                 uint64_t zone_rss;              /* total RSS(KB) */
 737 
 738                 /*
 739                  * Check if the debug log files exists and enable or disable
 740                  * debug.
 741                  */
 742                 if (debug_log_fp == NULL) {
 743                         if (stat(debug_log, &st) == 0)
 744                                 debug_log_fp = fopen(debug_log, "w");
 745                 } else {
 746                         if (stat(debug_log, &st) == -1) {
 747                                 (void) fclose(debug_log_fp);
 748                                 debug_log_fp = NULL;
 749                         }
 750                 }
 751 
 752                 /*
 753                  * If the CAP_REFRESH interval has passed, re-get the current
 754                  * cap in case it has been dynamically updated.
 755                  */
 756                 now = gethrtime();
 757                 if (now - last_cap_read > CAP_REFRESH) {
 758                         uint64_t mcap;
 759 
 760                         last_cap_read = now;
 761 
 762                         mcap = get_zone_cap();
 763                         if (mcap != 0 && mcap != UINT64_MAX)
 764                                 zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
 765                         else
 766                                 zone_rss_cap = UINT64_MAX;
 767 
 768                         lo_thresh = (uint64_t)(zone_rss_cap * .8);
 769                         hi_thresh = (uint64_t)(zone_rss_cap * .9);
 770                         addon = (uint64_t)(zone_rss_cap * 0.05);
 771 
 772                         /*
 773                          * We allow the memory cap tunables to be changed on
 774                          * the fly.
 775                          */
 776                         get_mcap_tunables();
 777 
 778                         debug("%s: %s\n", TUNE_CMD, over_cmd);
 779                         debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
 780                         debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
 781                         debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
 782                         debug("current cap %lluKB lo %lluKB hi %lluKB\n",
 783                             zone_rss_cap, lo_thresh, hi_thresh);
 784                 }
 785 
 786                 /* No cap, nothing to do. */
 787                 if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
 788                         debug("no cap, sleep 120 seconds\n");
 789                         (void) sleep_shutdown(120);
 790                         continue;
 791                 }
 792 
 793                 zone_rss = get_mem_info();
 794 
 795                 /* calculate excess */
 796                 new_excess = zone_rss - zone_rss_cap;
 797 
 798                 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
 799                     zone_rss, zone_rss_cap, new_excess);
 800 
 801                 /*
 802                  * If necessary, updates stats.
 803                  */
 804 
 805                 /*
 806                  * If it looks like we did some paging out since last over the
 807                  * cap then update the kstat so we can approximate how much was
 808                  * paged out.
 809                  */
 810                 if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
 811                         uint64_t diff;
 812 
 813                         /* assume diff is num bytes we paged out */
 814                         diff = (prev_zone_rss - zone_rss) * 1024;
 815 
 816                         (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
 817                             &diff, 0);
 818                 }
 819                 prev_zone_rss = zone_rss;
 820 
 821                 if (new_excess > 0) {
 822                         uint64_t n = 1;
 823 
 824                         /* Increment "nover" kstat. */
 825                         (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
 826 
 827                         if (!skip_pf_throttle) {
 828                                 /*
 829                                  * Tell the kernel to start throttling page
 830                                  * faults by some number of usecs to help us
 831                                  * catch up. If we are persistently over the
 832                                  * cap the delay ramps up to a max of 2000usecs.
 833                                  * Note that for delays less than 1 tick
 834                                  * (i.e. all of these) we busy-wait in as_fault.
 835                                  *      delay   faults/sec
 836                                  *       125    8000
 837                                  *       250    4000
 838                                  *       500    2000
 839                                  *      1000    1000
 840                                  *      2000     500
 841                                  */
 842                                 if (pfdelay == 0)
 843                                         pfdelay = 125;
 844                                 else if (pfdelay < 2000)
 845                                         pfdelay *= 2;
 846 
 847                                 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
 848                                     &pfdelay, 0);
 849                         }
 850 
 851                         /*
 852                          * Once we go over the cap, then we want to
 853                          * page out a little extra instead of stopping
 854                          * right at the cap. To do this we add 5% to
 855                          * the excess so that pageout_proces will work
 856                          * a little longer before stopping.
 857                          */
 858                         return ((int64_t)(new_excess + addon));
 859                 }
 860 
 861                 /*
 862                  * At this point we are under the cap.
 863                  *
 864                  * Tell the kernel to stop throttling page faults.
 865                  *
 866                  * Scale the amount of time we sleep before rechecking the
 867                  * zone's memory usage.  Also, scale the accpetable age of
 868                  * cached results from vm_getusage.  We do this based on the
 869                  * penetration into the capped limit.
 870                  */
 871                 if (pfdelay > 0) {
 872                         pfdelay = 0;
 873                         (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
 874                             &pfdelay, 0);
 875                 }
 876 
 877                 if (zone_rss <= lo_thresh) {
 878                         sleep_time = 120;
 879                 } else if (zone_rss <= hi_thresh) {
 880                         sleep_time = 60;
 881                 } else {
 882                         sleep_time = 30;
 883                 }
 884 
 885                 debug("sleep %d seconds\n", sleep_time);
 886                 (void) sleep_shutdown(sleep_time);
 887         }
 888 
 889         /* Shutting down, tell the kernel so it doesn't throttle */
 890         if (pfdelay > 0) {
 891                 pfdelay = 0;
 892                 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
 893         }
 894 
 895         return (0);
 896 }
 897 
 898 static void
 899 get_mcap_tunables()
 900 {
 901         zone_dochandle_t handle;
 902         struct zone_attrtab attr;
 903 
 904         over_cmd[0] = '\0';
 905         if ((handle = zonecfg_init_handle()) == NULL)
 906                 return;
 907 
 908         if (zonecfg_get_handle(zone_name, handle) != Z_OK)
 909                 goto done;
 910 
 911         /* Reset to defaults in case rebooting and settings have changed */
 912         over_cmd[0] = '\0';
 913         skip_vmusage = B_FALSE;
 914         skip_pageout = B_FALSE;
 915         skip_pf_throttle = B_FALSE;
 916 
 917         if (zonecfg_setattrent(handle) != Z_OK)
 918                 goto done;
 919         while (zonecfg_getattrent(handle, &attr) == Z_OK) {
 920                 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
 921                         (void) strlcpy(over_cmd, attr.zone_attr_value,
 922                             sizeof (over_cmd));
 923                 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
 924                         if (strcmp("true", attr.zone_attr_value) == 0)
 925                                 skip_vmusage = B_TRUE;
 926                 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
 927                         if (strcmp("true", attr.zone_attr_value) == 0)
 928                                 skip_pageout = B_TRUE;
 929                 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
 930                         if (strcmp("true", attr.zone_attr_value) == 0)
 931                                 skip_pf_throttle = B_TRUE;
 932                 }
 933         }
 934         (void) zonecfg_endattrent(handle);
 935 
 936 done:
 937         zonecfg_fini_handle(handle);
 938 }
 939 
 940 /* ARGSUSED */
 941 static int
 942 chk_proc_fs(void *data, const char *spec, const char *dir,
 943     const char *fstype, const char *opt)
 944 {
 945         if (fstype != NULL && strcmp(fstype, "proc") == 0)
 946                 *((boolean_t *)data) = B_TRUE;
 947 
 948         return (0);
 949 }
 950 
 951 static boolean_t
 952 has_proc()
 953 {
 954         brand_handle_t bh;
 955         boolean_t fnd = B_FALSE;
 956 
 957         if ((bh = brand_open(brand_name)) != NULL) {
 958                 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
 959         }
 960 
 961         brand_close(bh);
 962         return (fnd);
 963 }
 964 
 965 /*
 966  * We run this loop for brands with no /proc to simply update the RSS, using
 967  * the cheap GZ /proc data, every 5 minutes.
 968  */
 969 static void
 970 no_procfs()
 971 {
 972         DIR                     *pdir = NULL;
 973         struct dirent           *dent;
 974         uint64_t                zone_rss_bytes;
 975 
 976         (void) sleep_shutdown(30);
 977         while (!shutting_down) {
 978                 /*
 979                  * Just do the fast, cheap RSS calculation using the rss value
 980                  * in psinfo_t.  Because that's per-process, it can lead to
 981                  * double counting some memory and overestimating how much is
 982                  * being used. Since there is no /proc in the zone, we use the
 983                  * GZ /proc and check for the correct zone.
 984                  */
 985                 if ((pdir = opendir("/proc")) == NULL)
 986                         return;
 987 
 988                 fast_rss = 0;
 989                 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
 990                         pid_t           pid;
 991                         int             psfd;
 992                         int64_t         rss;
 993                         char            pathbuf[MAXPATHLEN];
 994                         psinfo_t        psinfo;
 995 
 996                         if (strcmp(".", dent->d_name) == 0 ||
 997                             strcmp("..", dent->d_name) == 0)
 998                                 continue;
 999 
1000                         pid = atoi(dent->d_name);
1001                         if (pid == 0 || pid == 1)
1002                                 continue;
1003 
1004                         (void) snprintf(pathbuf, sizeof (pathbuf),
1005                             "/proc/%d/psinfo", pid);
1006 
1007                         rss = 0;
1008                         if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
1009                                 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
1010                                     sizeof (psinfo)) {
1011                                         if (psinfo.pr_zoneid == zid)
1012                                                 rss = (int64_t)psinfo.pr_rssize;
1013                                 }
1014 
1015                                 (void) close(psfd);
1016                         }
1017 
1018                         fast_rss += rss;
1019                 }
1020 
1021                 (void) closedir(pdir);
1022 
1023                 if (shutting_down)
1024                         return;
1025 
1026                 zone_rss_bytes = fast_rss * 1024;
1027                 /* Use the zone's approx. RSS in the kernel */
1028                 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
1029 
1030                 (void) sleep_shutdown(300);
1031         }
1032 }
1033 
1034 /*
1035  * Thread that checks zone's memory usage and when over the cap, goes through
1036  * the zone's process list trying to pageout processes to get under the cap.
1037  */
1038 static void
1039 mcap_zone()
1040 {
1041         DIR *pdir = NULL;
1042         int64_t excess;
1043 
1044         debug("thread startup\n");
1045 
1046         get_mcap_tunables();
1047 
1048         /*
1049          * If the zone has no /proc filesystem, we can't use the fast algorithm
1050          * to check RSS or pageout any processes. All we can do is periodically
1051          * update it's RSS kstat using the expensive sycall.
1052          */
1053         if (!has_proc()) {
1054                 no_procfs();
1055                 debug("thread shutdown\n");
1056                 return;
1057         }
1058 
1059         /*
1060          * When first starting it is likely lots of other zones are starting
1061          * too because the system is booting.  Since we just started the zone
1062          * we're not worried about being over the cap right away, so we let
1063          * things settle a bit and tolerate some older data here to minimize
1064          * the load on the system.
1065          */
1066         (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
1067 
1068         /* Wait until zone's /proc is mounted */
1069         while (!shutting_down) {
1070                 struct stat st;
1071 
1072                 if (stat(zoneproc, &st) == 0 &&
1073                     strcmp(st.st_fstype, "proc") == 0)
1074                         break;
1075                 sleep_shutdown(5);
1076         }
1077 
1078         /* Open zone's /proc and walk entries. */
1079         while (!shutting_down) {
1080                 if ((pdir = opendir(zoneproc)) != NULL)
1081                         break;
1082                 sleep_shutdown(5);
1083         }
1084 
1085         while (!shutting_down) {
1086                 struct dirent *dirent;
1087 
1088                 /* Wait until we've gone over the cap. */
1089                 excess = check_suspend();
1090 
1091                 debug("starting to scan, excess %lldk\n", (long long)excess);
1092 
1093                 if (over_cmd[0] != '\0') {
1094                         uint64_t zone_rss;      /* total RSS(KB) */
1095 
1096                         debug("run phys_mcap_cmd: %s\n", over_cmd);
1097                         run_over_cmd();
1098 
1099                         zone_rss = get_mem_info();
1100                         excess = zone_rss - zone_rss_cap;
1101                         debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
1102                             zone_rss, zone_rss_cap, excess);
1103                         if (excess <= 0)
1104                                 continue;
1105                 }
1106 
1107                 while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
1108                         pid_t pid;
1109 
1110                         if (strcmp(".", dirent->d_name) == 0 ||
1111                             strcmp("..", dirent->d_name) == 0)
1112                                 continue;
1113 
1114                         pid = atoi(dirent->d_name);
1115                         if (pid == 0 || pid == 1)
1116                                 continue;
1117 
1118                         if (skip_pageout)
1119                                 (void) sleep_shutdown(2);
1120                         else
1121                                 excess = pageout_process(pid, excess);
1122 
1123                         if (excess <= 0) {
1124                                 debug("apparently under; excess %lld\n",
1125                                     (long long)excess);
1126                                 /* Double check the current excess */
1127                                 excess = check_suspend();
1128                         }
1129                 }
1130 
1131                 debug("process pass done; excess %lld\n", (long long)excess);
1132                 rewinddir(pdir);
1133 
1134                 if (skip_pageout)
1135                         (void) sleep_shutdown(120);
1136         }
1137 
1138         if (pdir != NULL)
1139                 (void) closedir(pdir);
1140         debug("thread shutdown\n");
1141 }
1142 
1143 void
1144 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1145 {
1146         int             res;
1147 
1148         shutting_down = 0;
1149         zid = id;
1150         logp = zlogp;
1151 
1152         /* all but the lx brand currently use /proc */
1153         if (strcmp(brand_name, "lx") == 0) {
1154                 (void) snprintf(zoneproc, sizeof (zoneproc),
1155                     "%s/root/native/proc", zonepath);
1156         } else {
1157                 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1158                     zonepath);
1159         }
1160 
1161         (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1162             zonepath);
1163 
1164         res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1165             &mcap_tid);
1166         if (res != 0) {
1167                 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1168                     res);
1169                 mcap_tid = 0;
1170         }
1171 }
1172 
1173 void
1174 destroy_mcap_thread()
1175 {
1176         if (mcap_tid != 0) {
1177                 shutting_down = 1;
1178                 (void) cond_signal(&shutdown_cv);
1179                 (void) thr_join(mcap_tid, NULL, NULL);
1180                 mcap_tid = 0;
1181         }
1182 }