1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 */
25
26 /*
27 * This file implements the code which runs a thread inside zoneadmd to cap
28 * the associated zone's physical memory. A thread to do this is started
29 * when the zone boots and is halted when the zone shuts down.
30 *
31 * Because of the way that the VM system is currently implemented, there is no
32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
 * memory cap. Instead, we implement a soft physical memory cap which looks
 * at the zone's overall rss and, once it is over the cap, works from the top
 * down (zone to process to page), walking the zone's processes to determine
 * what to try to page out to bring the zone back under its memory cap.
38 *
 * The code uses the fast, cheap, but potentially very inaccurate sum of the
 * rss values from psinfo_t to first approximate the zone's rss, and will
 * fall back to the vm_getusage syscall to determine the zone's rss if needed.
 * It then checks the rss against the zone's zone.max-physical-memory rctl.
 * Once the zone goes over its cap, this thread will work through the zone's
 * /proc process list, reading each process's address space mappings and
 * attempting to invalidate them (via the private _RUSAGESYS_INVALMAP
 * operation of the rusagesys syscall) to page out pages, until the zone is
 * again under its cap.
47 *
48 * Although zone memory capping is implemented as a soft cap by this user-level
49 * thread, the interfaces around memory caps that are exposed to the user are
 * the standard ones: an rctl and kstats. This thread uses the rctl value
51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 * If the implementation ever moves into the kernel, these exposed interfaces
53 * do not need to change.
54 *
55 * The thread adaptively sleeps, periodically checking the state of the
56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 * more often to check the zone's status. Once the zone is over the cap,
58 * the thread will work to pageout until the zone is under the cap, as shown
59 * by updated vm_usage data.
60 *
61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 * segvn_softunlock -^
65 * On SPARC there is an additional code path which may make this data
66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 * do more with the data on active pages.
69 *
70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 * cause the thread to start logging its actions into that file (it may take
72 * a minute or two if the thread is currently sleeping). Removing that
73 * file will cause logging to stop.
74 */
75
76 #include <sys/mman.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #include <assert.h>
81 #include <errno.h>
82 #include <fcntl.h>
83 #include <libproc.h>
84 #include <limits.h>
85 #include <procfs.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <strings.h>
89 #include <time.h>
90 #include <unistd.h>
91 #include <sys/priocntl.h>
92 #include <dirent.h>
93 #include <zone.h>
94 #include <libzonecfg.h>
95 #include <thread.h>
96 #include <values.h>
97 #include <sys/vm_usage.h>
98 #include <sys/resource.h>
99 #include <sys/debug.h>
100 #include <synch.h>
101 #include <wait.h>
102 #include <libcontract.h>
103 #include <libcontract_priv.h>
104 #include <sys/contract/process.h>
105 #include "zoneadmd.h"
106
107 /* round up to next y = 2^n */
108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109
110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111
112 /*
113 * zonecfg attribute tunables for memory capping.
114 * phys-mcap-cmd
115 * type: string
116 * specifies a command that can be run when over the cap
117 * phys-mcap-no-vmusage
118 * type: boolean
119 * true disables vm_getusage and just uses zone's proc. rss sum
120 * phys-mcap-no-pageout
121 * type: boolean
122 * true disables pageout when over
123 * phys-mcap-no-pf-throttle
124 * type: boolean
125 * true disables page fault throttling when over
126 */
127 #define TUNE_CMD "phys-mcap-cmd"
128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131
132 /*
133 * The large mapping value was derived empirically by seeing that mappings
134 * much bigger than 16mb sometimes take a relatively long time to invalidate
135 * (significant fraction of a second).
136 */
137 #define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 #define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 #define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140
141 /*
 * These are only used in get_mem_info() but are kept global: scale_rss and
 * prev_fast_rss must persist across calls, and the other two are global as
 * well so that all of them can easily be inspected with mdb.
145 */
146 uint64_t scale_rss = 0;
147 uint64_t prev_fast_rss = 0;
148 uint64_t fast_rss = 0;
149 uint64_t accurate_rss = 0;
150
151 static char zoneproc[MAXPATHLEN];
152 static char debug_log[MAXPATHLEN];
153 static zoneid_t zid;
154 static mutex_t shutdown_mx;
155 static cond_t shutdown_cv;
156 static int shutting_down = 0;
157 static thread_t mcap_tid;
158 static FILE *debug_log_fp = NULL;
159 static uint64_t zone_rss_cap; /* RSS cap(KB) */
160 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
161 static boolean_t skip_vmusage = B_FALSE;
162 static boolean_t skip_pageout = B_FALSE;
163 static boolean_t skip_pf_throttle = B_FALSE;
164
165 static int64_t check_suspend();
166 static void get_mcap_tunables();
167
168 /*
169 * Structure to hold current state about a process address space that we're
170 * working on.
171 */
172 typedef struct {
173 int pr_curr; /* the # of the mapping we're working on */
174 int pr_nmap; /* number of mappings in address space */
175 prmap_t *pr_mapp; /* process's map array */
176 } proc_map_t;
177
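/*
 * Local copy of the vmusage result structure, with explicit padding so that a
 * 32-bit zoneadmd process lays out the uint64_t members the same way the
 * amd64 kernel does (see the note inside the struct).
 */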
178 typedef struct zsd_vmusage64 {
179 id_t vmu_zoneid;
180 uint_t vmu_type;
181 id_t vmu_id;
182 /*
183 * An amd64 kernel will align the following uint64_t members, but a
 * 32-bit i386 process will not without help.
185 */
186 int vmu_align_next_members_on_8_bytes;
187 uint64_t vmu_rss_all;
188 uint64_t vmu_rss_private;
189 uint64_t vmu_rss_shared;
190 uint64_t vmu_swap_all;
191 uint64_t vmu_swap_private;
192 uint64_t vmu_swap_shared;
193 } zsd_vmusage64_t;
194
195 /*
196 * Output a debug log message.
197 */
198 /*PRINTFLIKE1*/
199 static void
200 debug(char *fmt, ...)
201 {
202 va_list ap;
203
204 if (debug_log_fp == NULL)
205 return;
206
207 va_start(ap, fmt);
208 (void) vfprintf(debug_log_fp, fmt, ap);
209 va_end(ap);
210 (void) fflush(debug_log_fp);
211 }
212
213 /*
 * Like sleep(3C) but can be interrupted by cond_signal, which is posted when
215 * we're shutting down the mcap thread.
216 */
217 static void
218 sleep_shutdown(int secs)
219 {
220 timestruc_t to;
221
222 to.tv_sec = secs;
223 to.tv_nsec = 0;
224
225 (void) mutex_lock(&shutdown_mx);
226 if (!shutting_down)
227 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
228 (void) mutex_unlock(&shutdown_mx);
229 }
230
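/*
 * Return B_TRUE if the process is in the SYS scheduling class (or if we
 * cannot tell); such processes are skipped by the pageout pass.
 */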
231 static boolean_t
232 proc_issystem(pid_t pid)
233 {
234 char pc_clname[PC_CLNMSZ];
235
236 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
237 PC_KY_NULL) != -1)
238 return (strcmp(pc_clname, "SYS") == 0);
239
240 return (B_TRUE);
241 }
242
243 /*
244 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
245 */
246 static void
247 run_over_cmd()
248 {
249 int ctfd;
250 int err;
251 pid_t childpid;
252 siginfo_t info;
253 ctid_t ct;
254
255 /*
256 * Before we enter the zone, we need to create a new process contract
257 * for the child, as required by zone_enter().
258 */
259 if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
260 return;
261 if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
262 ct_tmpl_set_informative(ctfd, 0) != 0 ||
263 ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
264 ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
265 ct_tmpl_activate(ctfd) != 0) {
266 (void) close(ctfd);
267 return;
268 }
269
270 childpid = fork();
271 switch (childpid) {
272 case -1:
273 (void) ct_tmpl_clear(ctfd);
274 (void) close(ctfd);
275 break;
276 case 0: /* Child */
277 (void) ct_tmpl_clear(ctfd);
278 (void) close(ctfd);
279 if (zone_enter(zid) == -1)
280 _exit(errno);
281 err = system(over_cmd);
282 _exit(err);
283 break;
284 default: /* Parent */
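		/*
		 * Note the child's contract, clear the template, wait for the
		 * child, and then abandon the contract so it does not linger.
		 */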
285 if (contract_latest(&ct) == -1)
286 ct = -1;
287 (void) ct_tmpl_clear(ctfd);
288 (void) close(ctfd);
289 err = waitid(P_PID, childpid, &info, WEXITED);
290 (void) contract_abandon_id(ct);
291 if (err == -1 || info.si_status != 0)
292 debug("over_cmd failed");
293 break;
294 }
295 }
296
297 /*
298 * Get the next mapping.
299 */
300 static prmap_t *
301 nextmapping(proc_map_t *pmp)
302 {
303 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
304 return (NULL);
305
306 return (&pmp->pr_mapp[pmp->pr_curr++]);
307 }
308
309 /*
310 * Initialize the proc_map_t to access the first mapping of an address space.
311 */
312 static prmap_t *
313 init_map(proc_map_t *pmp, pid_t pid)
314 {
315 int fd;
316 int res;
317 struct stat st;
318 char pathbuf[MAXPATHLEN];
319
320 bzero(pmp, sizeof (proc_map_t));
321 pmp->pr_nmap = -1;
322
323 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
324 if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
325 return (NULL);
326
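	/*
	 * The process's address space can change between the fstat() and the
	 * pread() below; if we read a different amount than expected, go back
	 * and try again with the new size.
	 */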
327 redo:
328 errno = 0;
329 if (fstat(fd, &st) != 0)
330 goto done;
331
332 if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap\n", st.st_size);
334 goto done;
335 }
336 (void) bzero(pmp->pr_mapp, st.st_size);
337
338 errno = 0;
339 if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
340 free(pmp->pr_mapp);
341 pmp->pr_mapp = NULL;
342 if (res > 0 || errno == E2BIG) {
343 goto redo;
344 } else {
345 debug("pid %ld cannot read xmap\n", pid);
346 goto done;
347 }
348 }
349
350 pmp->pr_nmap = st.st_size / sizeof (prmap_t);
351
352 done:
353 (void) close(fd);
354 return (nextmapping(pmp));
355 }
356
357 /*
358 * Attempt to invalidate the entire mapping from within the given process's
359 * address space. May return nonzero with errno as:
360 * ESRCH - process not found
361 * ENOMEM - segment not found
362 * EINVAL - mapping exceeds a single segment
363 */
364 static int
365 pageout_mapping(pid_t pid, prmap_t *pmp)
366 {
367 int res;
368
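	/*
	 * Skip shared memory (ISM/SHM) mappings; those pages are shared with
	 * other processes, so invalidating them here would not be productive.
	 */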
369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
370 return (0);
371
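	/*
	 * Use the private _RUSAGESYS_INVALMAP operation to ask the kernel to
	 * invalidate (and thus allow reclaiming of) the resident pages
	 * backing this mapping.
	 */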
372 errno = 0;
373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
374 pmp->pr_size);
375
376 return (res);
377 }
378
379 /*
 * Work through a process, paging out mappings, until the whole address space
 * has been examined or the excess is no longer positive. Return our estimate
 * of the updated excess.
382 */
383 static int64_t
384 pageout_process(pid_t pid, int64_t excess)
385 {
386 int psfd;
387 prmap_t *pmap;
388 proc_map_t cur;
389 int64_t sum_d_rss, d_rss;
390 int64_t old_rss;
391 int map_cnt;
392 psinfo_t psinfo;
393 char pathbuf[MAXPATHLEN];
394
395 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
396 pid);
397 if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
398 return (excess);
399
400 cur.pr_mapp = NULL;
401
402 if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
403 goto done;
404
405 old_rss = (int64_t)psinfo.pr_rssize;
406 map_cnt = 0;
407
408 /* If unscannable, skip it. */
409 if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
410 debug("pid %ld: system process, skipping %s\n",
411 pid, psinfo.pr_psargs);
412 goto done;
413 }
414
415 /* If tiny RSS (16KB), skip it. */
416 if (old_rss <= 16) {
417 debug("pid %ld: skipping, RSS %lldKB %s\n",
418 pid, old_rss, psinfo.pr_psargs);
419 goto done;
420 }
421
422 /* Get segment residency information. */
423 pmap = init_map(&cur, pid);
424
425 /* Skip process if it has no mappings. */
426 if (pmap == NULL) {
427 debug("pid %ld: map unreadable; ignoring\n", pid);
428 goto done;
429 }
430
431 debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
432 pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);
433
434 /*
435 * Within the process's address space, attempt to page out mappings.
436 */
437 sum_d_rss = 0;
438 while (excess > 0 && pmap != NULL && !shutting_down) {
439 /* invalidate the entire mapping */
440 if (pageout_mapping(pid, pmap) < 0)
441 debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
442 pid, (void *)pmap->pr_vaddr,
443 (long)pmap->pr_size / 1024L, errno);
444
445 map_cnt++;
446
447 /*
448 * Re-check the process rss and get the delta.
449 */
450 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
451 != sizeof (psinfo)) {
452 excess -= old_rss;
453 goto done;
454 }
455
456 d_rss = (int64_t)psinfo.pr_rssize - old_rss;
457 old_rss = (int64_t)psinfo.pr_rssize;
458 sum_d_rss += d_rss;
459
460 /*
461 * d_rss hopefully should be negative (or 0 if nothing
462 * invalidated) but can be positive if more got paged in.
463 */
464 excess += d_rss;
465
466 if (excess <= 0) {
467 debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
468 "excess %lldKB\n", pid, map_cnt,
			    (long long)sum_d_rss, (long long)excess);
470 map_cnt = 0;
471
472 /*
473 * If we're actually under, this will suspend checking
474 * in the middle of this process's address space.
475 */
476 excess = check_suspend();
477 if (shutting_down)
478 goto done;
479
480 /*
481 * since we might have suspended, re-read process's rss
482 */
483 if (pread(psfd, &psinfo, sizeof (psinfo), 0)
484 != sizeof (psinfo)) {
485 excess -= old_rss;
486 goto done;
487 }
488
489 old_rss = (int64_t)psinfo.pr_rssize;
490
491 debug("pid %ld: resume pageout; excess %lld\n", pid,
492 (long long)excess);
493 sum_d_rss = 0;
494 }
495
496 pmap = nextmapping(&cur);
497 }
498
499 debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
	    pid, map_cnt, (long long)sum_d_rss, (long long)excess);
501
502 done:
503 if (cur.pr_mapp != NULL)
504 free(cur.pr_mapp);
505
506 (void) close(psfd);
507
508 if (shutting_down)
509 return (0);
510
511 return (excess);
512 }
513
514 /*
515 * Get the zone's RSS data.
516 */
517 static uint64_t
518 get_mem_info()
519 {
520 uint64_t n = 1;
521 zsd_vmusage64_t buf;
522 uint64_t tmp_rss;
523 DIR *pdir = NULL;
524 struct dirent *dent;
525
526 /*
527 * Start by doing the fast, cheap RSS calculation using the rss value
528 * in psinfo_t. Because that's per-process, it can lead to double
529 * counting some memory and overestimating how much is being used, but
 * as long as that's not over the cap, then we don't need to do the
531 * expensive calculation.
532 *
533 * If we have to do the expensive calculation, we remember the scaling
534 * factor so that we can try to use that on subsequent iterations for
535 * the fast rss.
536 */
537 if (shutting_down)
538 return (0);
539
540 if ((pdir = opendir(zoneproc)) == NULL)
541 return (0);
542
543 accurate_rss = 0;
544 fast_rss = 0;
545 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
546 pid_t pid;
547 int psfd;
548 int64_t rss;
549 char pathbuf[MAXPATHLEN];
550 psinfo_t psinfo;
551
552 if (strcmp(".", dent->d_name) == 0 ||
553 strcmp("..", dent->d_name) == 0)
554 continue;
555
556 pid = atoi(dent->d_name);
557 if (pid == 0 || pid == 1)
558 continue;
559
560 (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
561 zoneproc, pid);
562
563 rss = 0;
564 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
565 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
566 sizeof (psinfo))
567 rss = (int64_t)psinfo.pr_rssize;
568
569 (void) close(psfd);
570 }
571
572 fast_rss += rss;
573 }
574
575 (void) closedir(pdir);
576
577 if (shutting_down)
578 return (0);
579
580 debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
581 scale_rss, prev_fast_rss);
582
583 /* see if we can get by with a scaled fast rss */
584 tmp_rss = fast_rss;
585 if (scale_rss > 1 && prev_fast_rss > 0) {
586 /*
587 * Only scale the fast value if it hasn't ballooned too much
588 * to trust.
589 */
590 if (fast_rss / prev_fast_rss < 2) {
591 fast_rss /= scale_rss;
592 debug("scaled fast rss: %lluKB\n", fast_rss);
593 }
594 }
595
596 if (fast_rss <= zone_rss_cap || skip_vmusage) {
597 uint64_t zone_rss_bytes;
598
599 zone_rss_bytes = fast_rss * 1024;
		/* Tell the kernel the zone's approximate RSS. */
601 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
602 return (fast_rss);
603 }
604
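	/*
	 * The fast estimate is over the cap (and vm_getusage is not
	 * disabled), so fall back to the accurate, but more expensive,
	 * per-zone usage data from the kernel.
	 */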
605 buf.vmu_id = zid;
606
607 /* get accurate usage (cached data may be up to 5 seconds old) */
608 if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
609 (uintptr_t)&buf, (uintptr_t)&n) != 0) {
610 debug("vmusage failed\n");
611 (void) sleep_shutdown(1);
612 return (0);
613 }
614
615 if (n > 1) {
616 /* This should never happen */
617 debug("vmusage returned more than one result\n");
618 (void) sleep_shutdown(1);
619 return (0);
620 }
621
622 if (buf.vmu_id != zid) {
623 /* This should never happen */
624 debug("vmusage returned the incorrect zone\n");
625 (void) sleep_shutdown(1);
626 return (0);
627 }
628
629 accurate_rss = buf.vmu_rss_all / 1024;
630
631 /* calculate scaling factor to use for fast_rss from now on */
632 if (accurate_rss > 0) {
633 scale_rss = fast_rss / accurate_rss;
634 debug("new scaling factor: %llu\n", scale_rss);
635 /* remember the fast rss when we had to get the accurate rss */
636 prev_fast_rss = tmp_rss;
637 }
638
639 debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
640 scale_rss, prev_fast_rss);
641 return (accurate_rss);
642 }
643
644 /*
 * Needed to read the zone's physical-memory-cap rctl.
646 */
647 static struct ps_prochandle *
648 grab_zone_proc()
649 {
650 DIR *dirp;
651 struct dirent *dentp;
652 struct ps_prochandle *ph = NULL;
653 int tmp;
654
655 if ((dirp = opendir(zoneproc)) == NULL)
656 return (NULL);
657
658 while (!shutting_down && (dentp = readdir(dirp))) {
659 int pid;
660
661 if (strcmp(".", dentp->d_name) == 0 ||
662 strcmp("..", dentp->d_name) == 0)
663 continue;
664
665 pid = atoi(dentp->d_name);
666 /* attempt to grab process */
667 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
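			/*
			 * Set run-on-last-close so the victim resumes when we
			 * release it, then create an agent LWP so we can
			 * issue pr_getrctl() in the process's context.
			 */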
668 if (Psetflags(ph, PR_RLC) == 0) {
669 if (Pcreate_agent(ph) == 0) {
670 (void) closedir(dirp);
671 return (ph);
672 }
673 }
674 Prelease(ph, 0);
675 }
676 }
677
678 (void) closedir(dirp);
679 return (NULL);
680 }
681
682 static uint64_t
683 get_zone_cap()
684 {
685 rctlblk_t *rblk;
686 uint64_t mcap;
687 struct ps_prochandle *ph;
688
689 if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
690 return (UINT64_MAX);
691
692 if ((ph = grab_zone_proc()) == NULL) {
693 free(rblk);
694 return (UINT64_MAX);
695 }
696
697 if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
698 RCTL_FIRST)) {
699 Pdestroy_agent(ph);
700 Prelease(ph, 0);
701 free(rblk);
702 return (UINT64_MAX);
703 }
704
705 Pdestroy_agent(ph);
706 Prelease(ph, 0);
707
708 mcap = rctlblk_get_value(rblk);
709 free(rblk);
710 return (mcap);
711 }
712
713 /*
 * check_suspend is invoked at the beginning of every pass through the process
 * list, or after we think we've paged out enough to bring the zone back under
 * its cap. The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap. The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * The closer the zone's rss gets to the cap, the shorter we sleep between
 * checks. This reduces the impact of this work on the system, which is
 * important considering that each capped zone will be monitoring its rss.
724 */
725 static int64_t
726 check_suspend()
727 {
728 static hrtime_t last_cap_read = 0;
729 static uint64_t addon;
730 static uint64_t lo_thresh; /* Thresholds for how long to sleep */
731 static uint64_t hi_thresh; /* when under the cap (80% & 90%). */
732 static uint64_t prev_zone_rss = 0;
733 static uint32_t pfdelay = 0; /* usec page fault delay when over */
734
735 /* Wait a second to give the async pageout a chance to catch up. */
736 (void) sleep_shutdown(1);
737
738 while (!shutting_down) {
739 int64_t new_excess;
740 int sleep_time;
741 hrtime_t now;
742 struct stat st;
743 uint64_t zone_rss; /* total RSS(KB) */
744
745 /*
		 * Check whether the debug log file exists and enable or
		 * disable debug logging accordingly.
748 */
749 if (debug_log_fp == NULL) {
750 if (stat(debug_log, &st) == 0)
751 debug_log_fp = fopen(debug_log, "w");
752 } else {
753 if (stat(debug_log, &st) == -1) {
754 (void) fclose(debug_log_fp);
755 debug_log_fp = NULL;
756 }
757 }
758
759 /*
760 * If the CAP_REFRESH interval has passed, re-get the current
761 * cap in case it has been dynamically updated.
762 */
763 now = gethrtime();
764 if (now - last_cap_read > CAP_REFRESH) {
765 uint64_t mcap;
766
767 last_cap_read = now;
768
769 mcap = get_zone_cap();
770 if (mcap != 0 && mcap != UINT64_MAX)
771 zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
772 else
773 zone_rss_cap = UINT64_MAX;
774
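			/*
			 * The lo/hi thresholds (80% and 90% of the cap) drive
			 * the adaptive sleep below; addon is the extra 5% we
			 * try to page out once we do go over the cap.
			 */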
775 lo_thresh = (uint64_t)(zone_rss_cap * .8);
776 hi_thresh = (uint64_t)(zone_rss_cap * .9);
777 addon = (uint64_t)(zone_rss_cap * 0.05);
778
779 /*
780 * We allow the memory cap tunables to be changed on
781 * the fly.
782 */
783 get_mcap_tunables();
784
785 debug("%s: %s\n", TUNE_CMD, over_cmd);
786 debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
787 debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
788 debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
789 debug("current cap %lluKB lo %lluKB hi %lluKB\n",
790 zone_rss_cap, lo_thresh, hi_thresh);
791 }
792
793 /* No cap, nothing to do. */
794 if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
795 debug("no cap, sleep 120 seconds\n");
796 (void) sleep_shutdown(120);
797 continue;
798 }
799
800 zone_rss = get_mem_info();
801
802 /* calculate excess */
803 new_excess = zone_rss - zone_rss_cap;
804
805 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
806 zone_rss, zone_rss_cap, new_excess);
807
808 /*
		 * If necessary, update stats.
810 */
811
812 /*
		 * If it looks like we did some paging out since we were last
		 * over the cap, then update the kstat so we can approximate
		 * how much was paged out.
816 */
817 if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
818 uint64_t diff;
819
820 /* assume diff is num bytes we paged out */
821 diff = (prev_zone_rss - zone_rss) * 1024;
822
823 (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
824 &diff, 0);
825 }
826 prev_zone_rss = zone_rss;
827
828 if (new_excess > 0) {
829 uint64_t n = 1;
830
831 /* Increment "nover" kstat. */
832 (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);
833
834 if (!skip_pf_throttle) {
835 /*
836 * Tell the kernel to start throttling page
837 * faults by some number of usecs to help us
838 * catch up. If we are persistently over the
839 * cap the delay ramps up to a max of 2000usecs.
840 * Note that for delays less than 1 tick
841 * (i.e. all of these) we busy-wait in as_fault.
842 * delay faults/sec
843 * 125 8000
844 * 250 4000
845 * 500 2000
846 * 1000 1000
847 * 2000 500
848 */
849 if (pfdelay == 0)
850 pfdelay = 125;
851 else if (pfdelay < 2000)
852 pfdelay *= 2;
853
854 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
855 &pfdelay, 0);
856 }
857
858 /*
			 * Once we go over the cap, we want to page out
			 * a little extra instead of stopping right at
			 * the cap. To do this we add 5% to the excess
			 * so that pageout_process will work a little
			 * longer before stopping.
864 */
865 return ((int64_t)(new_excess + addon));
866 }
867
868 /*
869 * At this point we are under the cap.
870 *
871 * Tell the kernel to stop throttling page faults.
872 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage based on how far the rss has penetrated
		 * into the cap.
877 */
878 if (pfdelay > 0) {
879 pfdelay = 0;
880 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
881 &pfdelay, 0);
882 }
883
884 if (zone_rss <= lo_thresh) {
885 sleep_time = 120;
886 } else if (zone_rss <= hi_thresh) {
887 sleep_time = 60;
888 } else {
889 sleep_time = 30;
890 }
891
892 debug("sleep %d seconds\n", sleep_time);
893 (void) sleep_shutdown(sleep_time);
894 }
895
896 /* Shutting down, tell the kernel so it doesn't throttle */
897 if (pfdelay > 0) {
898 pfdelay = 0;
899 (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
900 }
901
902 return (0);
903 }
904
905 static void
906 get_mcap_tunables()
907 {
908 zone_dochandle_t handle;
909 struct zone_attrtab attr;
910
911 over_cmd[0] = '\0';
912 if ((handle = zonecfg_init_handle()) == NULL)
913 return;
914
915 if (zonecfg_get_handle(zone_name, handle) != Z_OK)
916 goto done;
917
	/* Reset to defaults in case we are rebooting and settings changed. */
919 over_cmd[0] = '\0';
920 skip_vmusage = B_FALSE;
921 skip_pageout = B_FALSE;
922 skip_pf_throttle = B_FALSE;
923
924 if (zonecfg_setattrent(handle) != Z_OK)
925 goto done;
926 while (zonecfg_getattrent(handle, &attr) == Z_OK) {
927 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
928 (void) strlcpy(over_cmd, attr.zone_attr_value,
929 sizeof (over_cmd));
930 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
931 if (strcmp("true", attr.zone_attr_value) == 0)
932 skip_vmusage = B_TRUE;
933 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
934 if (strcmp("true", attr.zone_attr_value) == 0)
935 skip_pageout = B_TRUE;
936 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
937 if (strcmp("true", attr.zone_attr_value) == 0)
938 skip_pf_throttle = B_TRUE;
939 }
940 }
941 (void) zonecfg_endattrent(handle);
942
943 done:
944 zonecfg_fini_handle(handle);
945 }
946
947 /* ARGSUSED */
948 static int
949 chk_proc_fs(void *data, const char *spec, const char *dir,
950 const char *fstype, const char *opt)
951 {
952 if (fstype != NULL && strcmp(fstype, "proc") == 0)
953 *((boolean_t *)data) = B_TRUE;
954
955 return (0);
956 }
957
958 static boolean_t
959 has_proc()
960 {
961 brand_handle_t bh;
962 boolean_t fnd = B_FALSE;
963
964 if ((bh = brand_open(brand_name)) != NULL) {
965 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
966 }
967
968 brand_close(bh);
969 return (fnd);
970 }
971
972 /*
 * For brands with no /proc mounted in the zone, we run this loop simply to
 * update the zone's RSS every 5 minutes, using the cheap global zone /proc
 * data.
975 */
976 static void
977 no_procfs()
978 {
979 DIR *pdir = NULL;
980 struct dirent *dent;
981 uint64_t zone_rss_bytes;
982
983 (void) sleep_shutdown(30);
984 while (!shutting_down) {
985 /*
986 * Just do the fast, cheap RSS calculation using the rss value
987 * in psinfo_t. Because that's per-process, it can lead to
988 * double counting some memory and overestimating how much is
989 * being used. Since there is no /proc in the zone, we use the
990 * GZ /proc and check for the correct zone.
991 */
992 if ((pdir = opendir("/proc")) == NULL)
993 return;
994
995 fast_rss = 0;
996 while (!shutting_down && (dent = readdir(pdir)) != NULL) {
997 pid_t pid;
998 int psfd;
999 int64_t rss;
1000 char pathbuf[MAXPATHLEN];
1001 psinfo_t psinfo;
1002
1003 if (strcmp(".", dent->d_name) == 0 ||
1004 strcmp("..", dent->d_name) == 0)
1005 continue;
1006
1007 pid = atoi(dent->d_name);
1008 if (pid == 0 || pid == 1)
1009 continue;
1010
1011 (void) snprintf(pathbuf, sizeof (pathbuf),
1012 "/proc/%d/psinfo", pid);
1013
1014 rss = 0;
1015 if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
1016 if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
1017 sizeof (psinfo)) {
1018 if (psinfo.pr_zoneid == zid)
1019 rss = (int64_t)psinfo.pr_rssize;
1020 }
1021
1022 (void) close(psfd);
1023 }
1024
1025 fast_rss += rss;
1026 }
1027
1028 (void) closedir(pdir);
1029
1030 if (shutting_down)
1031 return;
1032
1033 zone_rss_bytes = fast_rss * 1024;
		/* Tell the kernel the zone's approximate RSS. */
1035 (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
1036
1037 (void) sleep_shutdown(300);
1038 }
1039 }
1040
1041 /*
 * Thread that checks the zone's memory usage and, when over the cap, goes
 * through the zone's process list trying to pageout processes to get back
 * under the cap.
1044 */
1045 static void
1046 mcap_zone()
1047 {
1048 DIR *pdir = NULL;
1049 int64_t excess;
1050
1051 debug("thread startup\n");
1052
1053 get_mcap_tunables();
1054
1055 /*
1056 * If the zone has no /proc filesystem, we can't use the fast algorithm
1057 * to check RSS or pageout any processes. All we can do is periodically
	 * update its RSS kstat using the global zone's /proc data.
1059 */
1060 if (!has_proc()) {
1061 no_procfs();
1062 debug("thread shutdown\n");
1063 return;
1064 }
1065
1066 /*
1067 * When first starting it is likely lots of other zones are starting
1068 * too because the system is booting. Since we just started the zone
1069 * we're not worried about being over the cap right away, so we let
1070 * things settle a bit and tolerate some older data here to minimize
1071 * the load on the system.
1072 */
1073 (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
1074
1075 /* Wait until zone's /proc is mounted */
1076 while (!shutting_down) {
1077 struct stat st;
1078
1079 if (stat(zoneproc, &st) == 0 &&
1080 strcmp(st.st_fstype, "proc") == 0)
1081 break;
1082 sleep_shutdown(5);
1083 }
1084
1085 /* Open zone's /proc and walk entries. */
1086 while (!shutting_down) {
1087 if ((pdir = opendir(zoneproc)) != NULL)
1088 break;
1089 sleep_shutdown(5);
1090 }
1091
1092 while (!shutting_down) {
1093 struct dirent *dirent;
1094
1095 /* Wait until we've gone over the cap. */
1096 excess = check_suspend();
1097
1098 debug("starting to scan, excess %lldk\n", (long long)excess);
1099
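		/*
		 * If a phys-mcap-cmd was configured, run it first and then
		 * re-check the zone's rss; the command may have freed enough
		 * memory that we don't need to page anything out ourselves.
		 */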
1100 if (over_cmd[0] != '\0') {
1101 uint64_t zone_rss; /* total RSS(KB) */
1102
			debug("run phys-mcap-cmd: %s\n", over_cmd);
1104 run_over_cmd();
1105
1106 zone_rss = get_mem_info();
1107 excess = zone_rss - zone_rss_cap;
1108 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
1109 zone_rss, zone_rss_cap, excess);
1110 if (excess <= 0)
1111 continue;
1112 }
1113
1114 while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
1115 pid_t pid;
1116
1117 if (strcmp(".", dirent->d_name) == 0 ||
1118 strcmp("..", dirent->d_name) == 0)
1119 continue;
1120
1121 pid = atoi(dirent->d_name);
1122 if (pid == 0 || pid == 1)
1123 continue;
1124
1125 if (skip_pageout)
1126 (void) sleep_shutdown(2);
1127 else
1128 excess = pageout_process(pid, excess);
1129
1130 if (excess <= 0) {
1131 debug("apparently under; excess %lld\n",
1132 (long long)excess);
1133 /* Double check the current excess */
1134 excess = check_suspend();
1135 }
1136 }
1137
1138 debug("process pass done; excess %lld\n", (long long)excess);
1139 rewinddir(pdir);
1140
1141 if (skip_pageout)
1142 (void) sleep_shutdown(120);
1143 }
1144
1145 if (pdir != NULL)
1146 (void) closedir(pdir);
1147 debug("thread shutdown\n");
1148 }
1149
1150 void
1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1152 {
1153 int res;
1154
1155 shutting_down = 0;
1156 zid = id;
1157
	/*
	 * All brands other than lx mount the native /proc at <root>/proc;
	 * the lx brand keeps it at <root>/native/proc.
	 */
1159 if (strcmp(brand_name, "lx") == 0) {
1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1161 "%s/root/native/proc", zonepath);
1162 } else {
1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1164 zonepath);
1165 }
1166
1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1168 zonepath);
1169
1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1171 &mcap_tid);
1172 if (res != 0) {
1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1174 res);
1175 mcap_tid = 0;
1176 }
1177 }
1178
1179 void
1180 destroy_mcap_thread()
1181 {
1182 if (mcap_tid != 0) {
1183 shutting_down = 1;
1184 (void) cond_signal(&shutdown_cv);
1185 (void) thr_join(mcap_tid, NULL, NULL);
1186 mcap_tid = 0;
1187 }
1188 }