/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Copyright 2014, Joyent, Inc.  All rights reserved.
 */

/*
 * This file implements the code which runs a thread inside zoneadmd to cap
 * the associated zone's physical memory.  A thread to do this is started
 * when the zone boots and is halted when the zone shuts down.
 *
 * Because of the way that the VM system is currently implemented, there is no
 * way to go from the bottom up (page to process to zone).  Thus, there is no
 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
 * memory cap.  Instead, we implement a soft physical memory cap which looks
 * at the zone's overall rss and once it is over the cap, works from the top
 * down (zone to process to page), looking at zone processes, to determine
 * what to try to pageout to get the zone under its memory cap.
 *
 * The code uses the fast, cheap, but potentially very inaccurate sum of the
 * rss values from psinfo_t to first approximate the zone's rss and will
 * fall back to the vm_getusage syscall to determine the zone's rss if needed.
 * It then checks the rss against the zone's zone.max-physical-memory rctl.
 * Once the zone goes over its cap, this thread will work through the zone's
 * /proc process list, stepping through each process's address space segments
 * and attempting to invalidate their pages (via the _RUSAGESYS_INVALMAP
 * syscall) until the zone is again under its cap.
 *
 * Although zone memory capping is implemented as a soft cap by this user-level
 * thread, the interfaces around memory caps that are exposed to the user are
 * the standard ones; an rctl and kstats.  This thread uses the rctl value
 * to obtain the cap and works with the zone kernel code to update the kstats.
 * If the implementation ever moves into the kernel, these exposed interfaces
 * do not need to change.
 *
 * The thread adaptively sleeps, periodically checking the state of the
 * zone.  As the zone's rss gets closer to the cap, the thread will wake up
 * more often to check the zone's status.  Once the zone is over the cap,
 * the thread will work to pageout until the zone is under the cap, as shown
 * by updated vm_usage data.
 *
 * NOTE: The pagedata page maps (at least on x86) are not useful.  Those flags
 * are set by hrm_setbits() and on x86 that code path is only executed by
 *     segvn_pagelock -> hat_setstat -> hrm_setbits
 *     segvn_softunlock -^
 * On SPARC there is an additional code path which may make this data
 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
 * maps.  If we ever fix this issue, then we could generalize this mcap code to
 * do more with the data on active pages.
 *
 * For debugging, touch the file {zonepath}/mcap_debug.log.  This will
 * cause the thread to start logging its actions into that file (it may take
 * a minute or two if the thread is currently sleeping).  Removing that
 * file will cause logging to stop.
 */

#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libproc.h>
#include <limits.h>
#include <procfs.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include <sys/priocntl.h>
#include <dirent.h>
#include <zone.h>
#include <libzonecfg.h>
#include <thread.h>
#include <values.h>
#include <sys/vm_usage.h>
#include <sys/resource.h>
#include <sys/debug.h>
#include <synch.h>
#include <wait.h>
#include <libcontract.h>
#include <libcontract_priv.h>
#include <sys/contract/process.h>
#include "zoneadmd.h"

                                        /* round up to next y = 2^n */
#define ROUNDUP(x, y)   (((x) + ((y) - 1)) & ~((y) - 1))

#define CAP_REFRESH     ((uint64_t)300 * NANOSEC) /* every 5 minutes */

/*
 * zonecfg attribute tunables for memory capping.
 *    phys-mcap-cmd
 *      type: string
 *      specifies a command that can be run when over the cap
 *    phys-mcap-no-vmusage
 *      type: boolean
 *      true disables vm_getusage and just uses zone's proc. rss sum
 *    phys-mcap-no-pageout
 *      type: boolean
 *      true disables pageout when over
 *    phys-mcap-no-pf-throttle
 *      type: boolean
 *      true disables page fault throttling when over
 */
#define TUNE_CMD        "phys-mcap-cmd"
#define TUNE_NVMU       "phys-mcap-no-vmusage"
#define TUNE_NPAGE      "phys-mcap-no-pageout"
#define TUNE_NPFTHROT   "phys-mcap-no-pf-throttle"

/*
 * The large mapping value was derived empirically by seeing that mappings
 * much bigger than 16MB sometimes take a relatively long time to invalidate
 * (significant fraction of a second).
 */
#define SEC_INTERIM     4       /* num secs to pause after stopped too long */
#define MSEC_TOO_LONG   100     /* release proc. after stopped for 100ms */
#define LARGE_MAPPING   16384   /* >= 16MB in KB - pageout in chunks */

/*
 * These are only used in get_mem_info() but are global.  scale_rss and
 * prev_fast_rss need to persist across calls, and we keep the other two
 * global as well so that all four can easily be inspected with mdb.
 */
uint64_t        scale_rss = 0;
uint64_t        prev_fast_rss = 0;
uint64_t        fast_rss = 0;
uint64_t        accurate_rss = 0;

static char     zoneproc[MAXPATHLEN];
static char     debug_log[MAXPATHLEN];
static zoneid_t zid;
static mutex_t  shutdown_mx;
static cond_t   shutdown_cv;
static int      shutting_down = 0;
static thread_t mcap_tid;
static FILE     *debug_log_fp = NULL;
static uint64_t zone_rss_cap;           /* RSS cap(KB) */
static char     over_cmd[2 * BUFSIZ];   /* same size as zone_attr_value */
static boolean_t skip_vmusage = B_FALSE;
static boolean_t skip_pageout = B_FALSE;
static boolean_t skip_pf_throttle = B_FALSE;

static zlog_t   *logp;

static int64_t check_suspend();
static void get_mcap_tunables();

/*
 * Structure to hold current state about a process address space that we're
 * working on.
 */
typedef struct {
        int pr_curr;            /* the # of the mapping we're working on */
        int pr_nmap;            /* number of mappings in address space */
        prmap_t *pr_mapp;       /* process's map array */
} proc_map_t;

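/*
 * Mirror of the kernel's vmusage_t with an explicit 64-bit layout; this is
 * what the _RUSAGESYS_GETVMUSAGE syscall in get_mem_info() fills in.
 */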
typedef struct zsd_vmusage64 {
        id_t vmu_zoneid;
        uint_t vmu_type;
        id_t vmu_id;
        /*
         * An amd64 kernel will align the following uint64_t members, but a
         * 32bit i386 process will not without help.
         */
        int vmu_align_next_members_on_8_bytes;
        uint64_t vmu_rss_all;
        uint64_t vmu_rss_private;
        uint64_t vmu_rss_shared;
        uint64_t vmu_swap_all;
        uint64_t vmu_swap_private;
        uint64_t vmu_swap_shared;
} zsd_vmusage64_t;

/*
 * Output a debug log message.
 */
/*PRINTFLIKE1*/
static void
debug(char *fmt, ...)
{
        va_list ap;

        if (debug_log_fp == NULL)
                return;

        va_start(ap, fmt);
        (void) vfprintf(debug_log_fp, fmt, ap);
        va_end(ap);
        (void) fflush(debug_log_fp);
}

/*
 * Like sleep(3C) but can be interrupted by cond_signal which is posted when
 * we're shutting down the mcap thread.
 */
static void
sleep_shutdown(int secs)
{
        timestruc_t to;

        to.tv_sec = secs;
        to.tv_nsec = 0;

        (void) mutex_lock(&shutdown_mx);
        if (!shutting_down)
                (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
        (void) mutex_unlock(&shutdown_mx);
}

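/*
 * Return B_TRUE if the process is in the SYS scheduling class (or if we
 * cannot tell), so that callers never try to page out system processes.
 */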
static boolean_t
proc_issystem(pid_t pid)
{
        char pc_clname[PC_CLNMSZ];

        if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
            PC_KY_NULL) != -1)
                return (strcmp(pc_clname, "SYS") == 0);

        return (B_TRUE);
}

/*
 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 */
static void
run_over_cmd()
{
        int             ctfd;
        int             err;
        pid_t           childpid;
        siginfo_t       info;
        ctid_t          ct;

        /*
         * Before we enter the zone, we need to create a new process contract
         * for the child, as required by zone_enter().
         */
        if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
                return;
        if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
            ct_tmpl_set_informative(ctfd, 0) != 0 ||
            ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
            ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
            ct_tmpl_activate(ctfd) != 0) {
                (void) close(ctfd);
                return;
        }

        childpid = fork();
        switch (childpid) {
        case -1:
                (void) ct_tmpl_clear(ctfd);
                (void) close(ctfd);
                break;
        case 0: /* Child */
                (void) ct_tmpl_clear(ctfd);
                (void) close(ctfd);
                if (zone_enter(zid) == -1)
                        _exit(errno);
                err = system(over_cmd);
                _exit(err);
                break;
        default:        /* Parent */
                if (contract_latest(&ct) == -1)
                        ct = -1;
                (void) ct_tmpl_clear(ctfd);
                (void) close(ctfd);
                err = waitid(P_PID, childpid, &info, WEXITED);
                (void) contract_abandon_id(ct);
                if (err == -1 || info.si_status != 0)
                        debug("over_cmd failed");
                break;
        }
}

/*
 * Get the next mapping.
 */
static prmap_t *
nextmapping(proc_map_t *pmp)
{
        if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
                return (NULL);

        return (&pmp->pr_mapp[pmp->pr_curr++]);
}

/*
 * Initialize the proc_map_t to access the first mapping of an address space.
 */
static prmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
        int fd;
        int res;
        struct stat st;
        char pathbuf[MAXPATHLEN];

        bzero(pmp, sizeof (proc_map_t));
        pmp->pr_nmap = -1;

        (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
        if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
                return (NULL);

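        /*
         * Size the buffer from fstat() and read the whole map file.  If the
         * address space changes between the fstat() and the pread() (short
         * read or E2BIG), start over with a freshly sized buffer.
         */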
redo:
        errno = 0;
        if (fstat(fd, &st) != 0)
                goto done;

        if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
                debug("cannot malloc() %ld bytes for xmap", st.st_size);
                goto done;
        }
        (void) bzero(pmp->pr_mapp, st.st_size);

        errno = 0;
        if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
                free(pmp->pr_mapp);
                pmp->pr_mapp = NULL;
                if (res > 0 || errno == E2BIG) {
                        goto redo;
                } else {
                        debug("pid %ld cannot read xmap\n", pid);
                        goto done;
                }
        }

        pmp->pr_nmap = st.st_size / sizeof (prmap_t);

done:
        (void) close(fd);
        return (nextmapping(pmp));
}

/*
 * Attempt to invalidate the entire mapping from within the given process's
 * address space. May return nonzero with errno as:
 *    ESRCH  - process not found
 *    ENOMEM - segment not found
 *    EINVAL - mapping exceeds a single segment
 */
static int
pageout_mapping(pid_t pid, prmap_t *pmp)
{
        int res;

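        /* Skip shared memory mappings (ISM and System V shm). */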
        if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
                return (0);

        errno = 0;
        res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
            pmp->pr_size);

        return (res);
}

/*
 * Work through a process, paging out mappings, until the whole address space
 * has been examined or the excess is < 0.  Return our estimate of the updated
 * excess.
 */
static int64_t
pageout_process(pid_t pid, int64_t excess)
{
        int                     psfd;
        prmap_t                 *pmap;
        proc_map_t              cur;
        int                     res;
        int64_t                 sum_d_rss, d_rss;
        int64_t                 old_rss;
        int                     map_cnt;
        psinfo_t                psinfo;
        char                    pathbuf[MAXPATHLEN];

        (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
            pid);
        if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
                return (excess);

        cur.pr_mapp = NULL;

        if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
                goto done;

        old_rss = (int64_t)psinfo.pr_rssize;
        map_cnt = 0;

        /* If unscannable, skip it. */
        if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
                debug("pid %ld: system process, skipping %s\n",
                    pid, psinfo.pr_psargs);
                goto done;
        }

        /* If tiny RSS (16KB), skip it. */
        if (old_rss <= 16) {
                debug("pid %ld: skipping, RSS %lldKB %s\n",
                    pid, old_rss, psinfo.pr_psargs);
                goto done;
        }

        /* Get segment residency information. */
        pmap = init_map(&cur, pid);

        /* Skip process if it has no mappings. */
        if (pmap == NULL) {
                debug("pid %ld: map unreadable; ignoring\n", pid);
                goto done;
        }

        debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
            pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);

        /*
         * Within the process's address space, attempt to page out mappings.
         */
        sum_d_rss = 0;
        while (excess > 0 && pmap != NULL && !shutting_down) {
                /* invalidate the entire mapping */
                if ((res = pageout_mapping(pid, pmap)) < 0)
                        debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
                            pid, pmap->pr_vaddr, pmap->pr_size / 1024, errno);

                map_cnt++;

                /*
                 * Re-check the process rss and get the delta.
                 */
                if (pread(psfd, &psinfo, sizeof (psinfo), 0)
                    != sizeof (psinfo)) {
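                        /*
                         * The process has likely exited; treat its entire
                         * RSS as having gone away.
                         */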
                        excess -= old_rss;
                        goto done;
                }

                d_rss = (int64_t)psinfo.pr_rssize - old_rss;
                old_rss = (int64_t)psinfo.pr_rssize;
                sum_d_rss += d_rss;

                /*
                 * d_rss hopefully should be negative (or 0 if nothing
                 * invalidated) but can be positive if more got paged in.
                 */
                excess += d_rss;

                if (excess <= 0) {
                        debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
                            "excess %lldKB\n", pid, map_cnt,
                            (unsigned long long)sum_d_rss, (long long)excess);
                        map_cnt = 0;

                        /*
                         * If we're actually under, this will suspend checking
                         * in the middle of this process's address space.
                         */
                        excess = check_suspend();
                        if (shutting_down)
                                goto done;

                        /*
                         * since we might have suspended, re-read process's rss
                         */
                        if (pread(psfd, &psinfo, sizeof (psinfo), 0)
                            != sizeof (psinfo)) {
                                excess -= old_rss;
                                goto done;
                        }

                        old_rss = (int64_t)psinfo.pr_rssize;

                        debug("pid %ld: resume pageout; excess %lld\n", pid,
                            (long long)excess);
                        sum_d_rss = 0;
                }

                pmap = nextmapping(&cur);
        }

        debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
            pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);

done:
        if (cur.pr_mapp != NULL)
                free(cur.pr_mapp);

        (void) close(psfd);

        if (shutting_down)
                return (0);

        return (excess);
}


/*
 * Get the zone's RSS data.
 */
static uint64_t
get_mem_info()
{
        uint64_t                n = 1;
        zsd_vmusage64_t         buf;
        uint64_t                tmp_rss;
        DIR                     *pdir = NULL;
        struct dirent           *dent;

        /*
         * Start by doing the fast, cheap RSS calculation using the rss value
         * in psinfo_t.  Because that's per-process, it can lead to double
         * counting some memory and overestimating how much is being used, but
         * as long as that's not over the cap, we don't need to do the
         * expensive calculation.
         *
         * If we have to do the expensive calculation, we remember the scaling
         * factor so that we can try to use that on subsequent iterations for
         * the fast rss.
         */
        if (shutting_down)
                return (0);

        if ((pdir = opendir(zoneproc)) == NULL)
                return (0);

        accurate_rss = 0;
        fast_rss = 0;
        while (!shutting_down && (dent = readdir(pdir)) != NULL) {
                pid_t           pid;
                int             psfd;
                int64_t         rss;
                char            pathbuf[MAXPATHLEN];
                psinfo_t        psinfo;

                if (strcmp(".", dent->d_name) == 0 ||
                    strcmp("..", dent->d_name) == 0)
                        continue;

                pid = atoi(dent->d_name);
                if (pid == 0 || pid == 1)
                        continue;

                (void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
                    zoneproc, pid);

                rss = 0;
                if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
                        if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
                            sizeof (psinfo))
                                rss = (int64_t)psinfo.pr_rssize;

                        (void) close(psfd);
                }

                fast_rss += rss;
        }

        (void) closedir(pdir);

        if (shutting_down)
                return (0);

        debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
            scale_rss, prev_fast_rss);

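        /*
         * For example, if a previous pass found the summed fast rss to be
         * twice the accurate vm_getusage result (scale_rss == 2), a 400MB
         * fast rss below is treated as roughly 200MB before being compared
         * to the cap.
         */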
        /* see if we can get by with a scaled fast rss */
        tmp_rss = fast_rss;
        if (scale_rss > 1 && prev_fast_rss > 0) {
                /*
                 * Only scale the fast value if it hasn't ballooned too much
                 * to trust.
                 */
                if (fast_rss / prev_fast_rss < 2) {
                        fast_rss /= scale_rss;
                        debug("scaled fast rss: %lluKB\n", fast_rss);
                }
        }

        if (fast_rss <= zone_rss_cap || skip_vmusage) {
                uint64_t zone_rss_bytes;

                zone_rss_bytes = fast_rss * 1024;
                /* Use the zone's approx. RSS in the kernel */
                (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
                return (fast_rss);
        }

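        /*
         * The fast estimate is over the cap, so ask the kernel for this
         * zone's aggregate RSS (VMUSAGE_A_ZONE); we expect exactly one
         * result.
         */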
        buf.vmu_id = zid;

        /* get accurate usage (cached data may be up to 5 seconds old) */
        if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
            (uintptr_t)&buf, (uintptr_t)&n) != 0) {
                debug("vmusage failed\n");
                (void) sleep_shutdown(1);
                return (0);
        }

        if (n > 1) {
                /* This should never happen */
                debug("vmusage returned more than one result\n");
                (void) sleep_shutdown(1);
                return (0);
        }

        if (buf.vmu_id != zid) {
                /* This should never happen */
                debug("vmusage returned the incorrect zone\n");
                (void) sleep_shutdown(1);
                return (0);
        }

        accurate_rss = buf.vmu_rss_all / 1024;

        /* calculate scaling factor to use for fast_rss from now on */
        if (accurate_rss > 0) {
                scale_rss = fast_rss / accurate_rss;
                debug("new scaling factor: %llu\n", scale_rss);
                /* remember the fast rss when we had to get the accurate rss */
                prev_fast_rss = tmp_rss;
        }

        debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
            scale_rss, prev_fast_rss);
        return (accurate_rss);
}

/*
 * Needed to read the zone's physical-memory-cap rctl.
 */
static struct ps_prochandle *
grab_zone_proc()
{
        DIR *dirp;
        struct dirent *dentp;
        struct ps_prochandle *ph = NULL;
        int tmp;

        if ((dirp = opendir(zoneproc)) == NULL)
                return (NULL);

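        /*
         * Grab any process in the zone, mark it run-on-last-close, and
         * create an agent LWP so that the caller can issue pr_getrctl()
         * within the zone's context.
         */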
        while (!shutting_down && (dentp = readdir(dirp))) {
                int pid;

                if (strcmp(".", dentp->d_name) == 0 ||
                    strcmp("..", dentp->d_name) == 0)
                        continue;

                pid = atoi(dentp->d_name);
                /* attempt to grab process */
                if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
                        if (Psetflags(ph, PR_RLC) == 0) {
                                if (Pcreate_agent(ph) == 0) {
                                        (void) closedir(dirp);
                                        return (ph);
                                }
                        }
                        Prelease(ph, 0);
                }
        }

        (void) closedir(dirp);
        return (NULL);
}

static uint64_t
get_zone_cap()
{
        rctlblk_t *rblk;
        uint64_t mcap;
        struct ps_prochandle *ph;

        if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
                return (UINT64_MAX);

        if ((ph = grab_zone_proc()) == NULL) {
                free(rblk);
                return (UINT64_MAX);
        }

        if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
            RCTL_FIRST)) {
                Pdestroy_agent(ph);
                Prelease(ph, 0);
                free(rblk);
                return (UINT64_MAX);
        }

        Pdestroy_agent(ph);
        Prelease(ph, 0);

        mcap = rctlblk_get_value(rblk);
        free(rblk);
        return (mcap);
}

/*
 * check_suspend is invoked at the beginning of every pass through the process
 * list or after we've paged out enough so that we think the excess is under
 * the cap.  The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap.  The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * Depending on the percentage of penetration of the zone's rss into the
 * cap we sleep for longer or shorter amounts. This reduces the impact of this
 * work on the system, which is important considering that each zone will be
 * monitoring its rss.
 */
static int64_t
check_suspend()
{
        static hrtime_t last_cap_read = 0;
        static uint64_t addon;
        static uint64_t lo_thresh;      /* Thresholds for how long to sleep */
        static uint64_t hi_thresh;      /* when under the cap (80% & 90%). */
        static uint64_t prev_zone_rss = 0;
        static uint32_t pfdelay = 0;    /* usec page fault delay when over */

        /* Wait a second to give the async pageout a chance to catch up. */
        (void) sleep_shutdown(1);

        while (!shutting_down) {
                int64_t new_excess;
                int sleep_time;
                hrtime_t now;
                struct stat st;
                uint64_t zone_rss;              /* total RSS(KB) */

                /*
                 * Check if the debug log file exists and enable or disable
                 * debug logging accordingly.
                 */
                if (debug_log_fp == NULL) {
                        if (stat(debug_log, &st) == 0)
                                debug_log_fp = fopen(debug_log, "w");
                } else {
                        if (stat(debug_log, &st) == -1) {
                                (void) fclose(debug_log_fp);
                                debug_log_fp = NULL;
                        }
                }

                /*
                 * If the CAP_REFRESH interval has passed, re-get the current
                 * cap in case it has been dynamically updated.
                 */
                now = gethrtime();
                if (now - last_cap_read > CAP_REFRESH) {
                        uint64_t mcap;

                        last_cap_read = now;

                        mcap = get_zone_cap();
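                        /*
                         * The rctl value is in bytes; convert it to KB,
                         * rounding up.  A value of 0 or UINT64_MAX means
                         * the zone is effectively uncapped.
                         */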
                        if (mcap != 0 && mcap != UINT64_MAX)
                                zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
                        else
                                zone_rss_cap = UINT64_MAX;

                        lo_thresh = (uint64_t)(zone_rss_cap * .8);
                        hi_thresh = (uint64_t)(zone_rss_cap * .9);
                        addon = (uint64_t)(zone_rss_cap * 0.05);

                        /*
                         * We allow the memory cap tunables to be changed on
                         * the fly.
                         */
                        get_mcap_tunables();

                        debug("%s: %s\n", TUNE_CMD, over_cmd);
                        debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
                        debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
                        debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
                        debug("current cap %lluKB lo %lluKB hi %lluKB\n",
                            zone_rss_cap, lo_thresh, hi_thresh);
                }

                /* No cap, nothing to do. */
                if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
                        debug("no cap, sleep 120 seconds\n");
                        (void) sleep_shutdown(120);
                        continue;
                }

                zone_rss = get_mem_info();

                /* calculate excess */
                new_excess = zone_rss - zone_rss_cap;

                debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
                    zone_rss, zone_rss_cap, new_excess);

                /*
                 * If necessary, update stats.
                 */

                /*
                 * If it looks like we did some paging out since last over the
                 * cap then update the kstat so we can approximate how much was
                 * paged out.
                 */
                if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
                        uint64_t diff;

                        /* assume diff is num bytes we paged out */
                        diff = (prev_zone_rss - zone_rss) * 1024;

                        (void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
                            &diff, 0);
                }
                prev_zone_rss = zone_rss;

                if (new_excess > 0) {
                        uint64_t n = 1;

                        /* Increment "nover" kstat. */
                        (void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

                        if (!skip_pf_throttle) {
                                /*
                                 * Tell the kernel to start throttling page
                                 * faults by some number of usecs to help us
                                 * catch up. If we are persistently over the
                                 * cap the delay ramps up to a max of 2000usecs.
                                 * Note that for delays less than 1 tick
                                 * (i.e. all of these) we busy-wait in as_fault.
                                 *      delay   faults/sec
                                 *       125    8000
                                 *       250    4000
                                 *       500    2000
                                 *      1000    1000
                                 *      2000     500
                                 */
                                if (pfdelay == 0)
                                        pfdelay = 125;
                                else if (pfdelay < 2000)
                                        pfdelay *= 2;

                                (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
                                    &pfdelay, 0);
                        }

                        /*
                         * Once we go over the cap, we want to page out a
                         * little extra instead of stopping right at the cap.
                         * To do this we add 5% to the excess so that
                         * pageout_process will work a little longer before
                         * stopping.
                         */
                        return ((int64_t)(new_excess + addon));
                }

                /*
                 * At this point we are under the cap.
                 *
                 * Tell the kernel to stop throttling page faults.
                 *
                 * Scale the amount of time we sleep before rechecking the
                 * zone's memory usage.  Also, scale the acceptable age of
                 * cached results from vm_getusage.  We do this based on the
                 * penetration into the capped limit.
                 */
                if (pfdelay > 0) {
                        pfdelay = 0;
                        (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
                            &pfdelay, 0);
                }

                if (zone_rss <= lo_thresh) {
                        sleep_time = 120;
                } else if (zone_rss <= hi_thresh) {
                        sleep_time = 60;
                } else {
                        sleep_time = 30;
                }

                debug("sleep %d seconds\n", sleep_time);
                (void) sleep_shutdown(sleep_time);
        }

        /* Shutting down, tell the kernel so it doesn't throttle */
        if (pfdelay > 0) {
                pfdelay = 0;
                (void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
        }

        return (0);
}

static void
get_mcap_tunables()
{
        zone_dochandle_t handle;
        struct zone_attrtab attr;

        over_cmd[0] = '\0';
        if ((handle = zonecfg_init_handle()) == NULL)
                return;

        if (zonecfg_get_handle(zone_name, handle) != Z_OK)
                goto done;

        /* Reset to defaults in case rebooting and settings have changed */
        over_cmd[0] = '\0';
        skip_vmusage = B_FALSE;
        skip_pageout = B_FALSE;
        skip_pf_throttle = B_FALSE;

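        /* Walk the zone's attr resources looking for the mcap tunables. */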
        if (zonecfg_setattrent(handle) != Z_OK)
                goto done;
        while (zonecfg_getattrent(handle, &attr) == Z_OK) {
                if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
                        (void) strlcpy(over_cmd, attr.zone_attr_value,
                            sizeof (over_cmd));
                } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
                        if (strcmp("true", attr.zone_attr_value) == 0)
                                skip_vmusage = B_TRUE;
                } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
                        if (strcmp("true", attr.zone_attr_value) == 0)
                                skip_pageout = B_TRUE;
                } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
                        if (strcmp("true", attr.zone_attr_value) == 0)
                                skip_pf_throttle = B_TRUE;
                }
        }
        (void) zonecfg_endattrent(handle);

done:
        zonecfg_fini_handle(handle);
}

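/*
 * brand_platform_iter_mounts() callback; note whether a proc filesystem is
 * among the brand's platform mounts.
 */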
/* ARGSUSED */
static int
chk_proc_fs(void *data, const char *spec, const char *dir,
    const char *fstype, const char *opt)
{
        if (fstype != NULL && strcmp(fstype, "proc") == 0)
                *((boolean_t *)data) = B_TRUE;

        return (0);
}

static boolean_t
has_proc()
{
        brand_handle_t bh;
        boolean_t fnd = B_FALSE;

        if ((bh = brand_open(brand_name)) != NULL) {
                (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
                brand_close(bh);
        }

        return (fnd);
}

/*
 * We run this loop for brands with no /proc to simply update the RSS, using
 * the cheap GZ /proc data, every 5 minutes.
 */
static void
no_procfs()
{
        DIR                     *pdir = NULL;
        struct dirent           *dent;
        uint64_t                zone_rss_bytes;

        (void) sleep_shutdown(30);
        while (!shutting_down) {
                /*
                 * Just do the fast, cheap RSS calculation using the rss value
                 * in psinfo_t.  Because that's per-process, it can lead to
                 * double counting some memory and overestimating how much is
                 * being used. Since there is no /proc in the zone, we use the
                 * GZ /proc and check for the correct zone.
                 */
                if ((pdir = opendir("/proc")) == NULL)
                        return;

                fast_rss = 0;
                while (!shutting_down && (dent = readdir(pdir)) != NULL) {
                        pid_t           pid;
                        int             psfd;
                        int64_t         rss;
                        char            pathbuf[MAXPATHLEN];
                        psinfo_t        psinfo;

                        if (strcmp(".", dent->d_name) == 0 ||
                            strcmp("..", dent->d_name) == 0)
                                continue;

                        pid = atoi(dent->d_name);
                        if (pid == 0 || pid == 1)
                                continue;

                        (void) snprintf(pathbuf, sizeof (pathbuf),
                            "/proc/%d/psinfo", pid);

                        rss = 0;
                        if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
                                if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
                                    sizeof (psinfo)) {
                                        if (psinfo.pr_zoneid == zid)
                                                rss = (int64_t)psinfo.pr_rssize;
                                }

                                (void) close(psfd);
                        }

                        fast_rss += rss;
                }

                (void) closedir(pdir);

                if (shutting_down)
                        return;

                zone_rss_bytes = fast_rss * 1024;
                /* Use the zone's approx. RSS in the kernel */
                (void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);

                (void) sleep_shutdown(300);
        }
}

/*
 * Thread that checks the zone's memory usage and, when over the cap, goes
 * through the zone's process list trying to pageout processes to get under
 * the cap.
 */
static void
mcap_zone()
{
        DIR *pdir = NULL;
        int64_t excess;

        debug("thread startup\n");

        get_mcap_tunables();

        /*
         * If the zone has no /proc filesystem, we can't use the fast algorithm
         * to check RSS or pageout any processes. All we can do is periodically
         * update its RSS kstat using the global zone's /proc data.
         */
        if (!has_proc()) {
                no_procfs();
                debug("thread shutdown\n");
                return;
        }

        /*
         * When first starting it is likely lots of other zones are starting
         * too because the system is booting.  Since we just started the zone
         * we're not worried about being over the cap right away, so we let
         * things settle a bit and tolerate some older data here to minimize
         * the load on the system.
         */
        (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */

        /* Wait until zone's /proc is mounted */
        while (!shutting_down) {
                struct stat st;

                if (stat(zoneproc, &st) == 0 &&
                    strcmp(st.st_fstype, "proc") == 0)
                        break;
                sleep_shutdown(5);
        }

        /* Open zone's /proc and walk entries. */
        while (!shutting_down) {
                if ((pdir = opendir(zoneproc)) != NULL)
                        break;
                sleep_shutdown(5);
        }

        while (!shutting_down) {
                struct dirent *dirent;

                /* Wait until we've gone over the cap. */
                excess = check_suspend();

                debug("starting to scan, excess %lldk\n", (long long)excess);

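                /*
                 * If a phys-mcap-cmd was configured, run it first and
                 * re-check; it may bring the zone back under its cap without
                 * any pageout from us.
                 */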
                if (over_cmd[0] != '\0') {
                        uint64_t zone_rss;      /* total RSS(KB) */

                        debug("run phys_mcap_cmd: %s\n", over_cmd);
                        run_over_cmd();

                        zone_rss = get_mem_info();
                        excess = zone_rss - zone_rss_cap;
                        debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
                            zone_rss, zone_rss_cap, excess);
                        if (excess <= 0)
                                continue;
                }

                while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
                        pid_t pid;

                        if (strcmp(".", dirent->d_name) == 0 ||
                            strcmp("..", dirent->d_name) == 0)
                                continue;

                        pid = atoi(dirent->d_name);
                        if (pid == 0 || pid == 1)
                                continue;

                        if (skip_pageout)
                                (void) sleep_shutdown(2);
                        else
                                excess = pageout_process(pid, excess);

                        if (excess <= 0) {
                                debug("apparently under; excess %lld\n",
                                    (long long)excess);
                                /* Double check the current excess */
                                excess = check_suspend();
                        }
                }

                debug("process pass done; excess %lld\n", (long long)excess);
                rewinddir(pdir);

                if (skip_pageout)
                        (void) sleep_shutdown(120);
        }

        if (pdir != NULL)
                (void) closedir(pdir);
        debug("thread shutdown\n");
}

void
create_mcap_thread(zlog_t *zlogp, zoneid_t id)
{
        int             res;

        shutting_down = 0;
        zid = id;
        logp = zlogp;

        /* all but the lx brand currently use /proc */
        if (strcmp(brand_name, "lx") == 0) {
                (void) snprintf(zoneproc, sizeof (zoneproc),
                    "%s/root/native/proc", zonepath);
        } else {
                (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
                    zonepath);
        }

        (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
            zonepath);

        res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
            &mcap_tid);
        if (res != 0) {
                zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
                    res);
                mcap_tid = 0;
        }
}

void
destroy_mcap_thread()
{
        if (mcap_tid != 0) {
                shutting_down = 1;
                (void) cond_signal(&shutdown_cv);
                (void) thr_join(mcap_tid, NULL, NULL);
                mcap_tid = 0;
        }
}