Print this page
Reduce lint
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/cmd/zoneadmd/mcap.c
+++ new/usr/src/cmd/zoneadmd/mcap.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * This file implements the code which runs a thread inside zoneadmd to cap
28 28 * the associated zone's physical memory. A thread to do this is started
29 29 * when the zone boots and is halted when the zone shuts down.
30 30 *
31 31 * Because of the way that the VM system is currently implemented, there is no
32 32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
34 34 * memory cap. Instead, we implement a soft physical memory cap which looks
35 35 * at the zone's overall rss and once it is over the cap, works from the top
36 36 * down (zone to process to page), looking at zone processes, to determine
37 37 * what to try to pageout to get the zone under its memory cap.
38 38 *
39 39 * The code uses the fast, cheap, but potentially very inaccurate sum of the
40 40 * rss values from psinfo_t to first approximate the zone's rss and will
41 41 * fallback to the vm_getusage syscall to determine the zone's rss if needed.
42 42 * It then checks the rss against the zone's zone.max-physical-memory rctl.
43 43 * Once the zone goes over its cap, then this thread will work through the
44 44 * zone's /proc process list, Pgrab-bing each process and stepping through the
45 45 * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
46 46 * to pageout pages, until the zone is again under its cap.
47 47 *
48 48 * Although zone memory capping is implemented as a soft cap by this user-level
49 49 * thread, the interfaces around memory caps that are exposed to the user are
50 50 * the standard ones; an rctl and kstats. This thread uses the rctl value
51 51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 52 * If the implementation ever moves into the kernel, these exposed interfaces
53 53 * do not need to change.
54 54 *
55 55 * The thread adaptively sleeps, periodically checking the state of the
56 56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 57 * more often to check the zone's status. Once the zone is over the cap,
58 58 * the thread will work to pageout until the zone is under the cap, as shown
59 59 * by updated vm_usage data.
60 60 *
61 61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 64 * segvn_softunlock -^
65 65 * On SPARC there is an additional code path which may make this data
66 66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 68 * do more with the data on active pages.
69 69 *
70 70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 71 * cause the thread to start logging its actions into that file (it may take
72 72 * a minute or two if the thread is currently sleeping). Removing that
73 73 * file will cause logging to stop.
74 74 */
75 75
76 76 #include <sys/mman.h>
77 77 #include <sys/param.h>
78 78 #include <sys/stat.h>
79 79 #include <sys/types.h>
80 80 #include <assert.h>
81 81 #include <errno.h>
82 82 #include <fcntl.h>
83 83 #include <libproc.h>
84 84 #include <limits.h>
85 85 #include <procfs.h>
86 86 #include <stdio.h>
87 87 #include <stdlib.h>
88 88 #include <strings.h>
89 89 #include <time.h>
90 90 #include <unistd.h>
91 91 #include <sys/priocntl.h>
92 92 #include <dirent.h>
93 93 #include <zone.h>
94 94 #include <libzonecfg.h>
95 95 #include <thread.h>
96 96 #include <values.h>
97 97 #include <sys/vm_usage.h>
98 98 #include <sys/resource.h>
99 99 #include <sys/debug.h>
100 100 #include <synch.h>
101 101 #include <wait.h>
102 102 #include <libcontract.h>
103 103 #include <libcontract_priv.h>
104 104 #include <sys/contract/process.h>
105 105 #include "zoneadmd.h"
106 106
107 107 /* round up to next y = 2^n */
108 108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109 109
110 110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111 111
112 112 /*
113 113 * zonecfg attribute tunables for memory capping.
114 114 * phys-mcap-cmd
115 115 * type: string
116 116 * specifies a command that can be run when over the cap
117 117 * phys-mcap-no-vmusage
118 118 * type: boolean
119 119 * true disables vm_getusage and just uses zone's proc. rss sum
120 120 * phys-mcap-no-pageout
121 121 * type: boolean
122 122 * true disables pageout when over
123 123 * phys-mcap-no-pf-throttle
124 124 * type: boolean
125 125 * true disables page fault throttling when over
126 126 */
127 127 #define TUNE_CMD "phys-mcap-cmd"
128 128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131 131
132 132 /*
133 133 * The large mapping value was derived empirically by seeing that mappings
134 134 * much bigger than 16mb sometimes take a relatively long time to invalidate
135 135 * (significant fraction of a second).
136 136 */
137 137 #define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 138 #define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 139 #define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140 140
141 141 /*
142 142 * These are only used in get_mem_info but global. We always need scale_rss and
143 143 * prev_fast_rss to be persistent but we also have the other two global so we
144 144 * can easily see these with mdb.
145 145 */
146 146 uint64_t scale_rss = 0;
147 147 uint64_t prev_fast_rss = 0;
148 148 uint64_t fast_rss = 0;
149 149 uint64_t accurate_rss = 0;
150 150
151 151 static char zoneproc[MAXPATHLEN];
152 152 static char debug_log[MAXPATHLEN];
153 153 static zoneid_t zid;
154 154 static mutex_t shutdown_mx;
|
↓ open down ↓ |
154 lines elided |
↑ open up ↑ |
155 155 static cond_t shutdown_cv;
156 156 static int shutting_down = 0;
157 157 static thread_t mcap_tid;
158 158 static FILE *debug_log_fp = NULL;
159 159 static uint64_t zone_rss_cap; /* RSS cap(KB) */
160 160 static char over_cmd[2 * BUFSIZ]; /* same size as zone_attr_value */
161 161 static boolean_t skip_vmusage = B_FALSE;
162 162 static boolean_t skip_pageout = B_FALSE;
163 163 static boolean_t skip_pf_throttle = B_FALSE;
164 164
165 -static zlog_t *logp;
166 -
167 165 static int64_t check_suspend();
168 166 static void get_mcap_tunables();
169 167
170 168 /*
171 169 * Structure to hold current state about a process address space that we're
172 170 * working on.
173 171 */
174 172 typedef struct {
175 173 int pr_curr; /* the # of the mapping we're working on */
176 174 int pr_nmap; /* number of mappings in address space */
177 175 prmap_t *pr_mapp; /* process's map array */
178 176 } proc_map_t;
179 177
180 178 typedef struct zsd_vmusage64 {
181 179 id_t vmu_zoneid;
182 180 uint_t vmu_type;
183 181 id_t vmu_id;
184 182 /*
185 183 * An amd64 kernel will align the following uint64_t members, but a
186 184 * 32bit i386 process will not without help.
187 185 */
188 186 int vmu_align_next_members_on_8_bytes;
189 187 uint64_t vmu_rss_all;
190 188 uint64_t vmu_rss_private;
191 189 uint64_t vmu_rss_shared;
192 190 uint64_t vmu_swap_all;
193 191 uint64_t vmu_swap_private;
194 192 uint64_t vmu_swap_shared;
195 193 } zsd_vmusage64_t;
196 194
197 195 /*
198 196 * Output a debug log message.
199 197 */
200 198 /*PRINTFLIKE1*/
201 199 static void
202 200 debug(char *fmt, ...)
203 201 {
204 202 va_list ap;
205 203
206 204 if (debug_log_fp == NULL)
207 205 return;
208 206
209 207 va_start(ap, fmt);
210 208 (void) vfprintf(debug_log_fp, fmt, ap);
211 209 va_end(ap);
212 210 (void) fflush(debug_log_fp);
213 211 }
214 212
215 213 /*
216 214 * Like sleep(3C) but can be interupted by cond_signal which is posted when
217 215 * we're shutting down the mcap thread.
218 216 */
219 217 static void
220 218 sleep_shutdown(int secs)
221 219 {
222 220 timestruc_t to;
223 221
224 222 to.tv_sec = secs;
225 223 to.tv_nsec = 0;
226 224
227 225 (void) mutex_lock(&shutdown_mx);
228 226 if (!shutting_down)
229 227 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
230 228 (void) mutex_unlock(&shutdown_mx);
231 229 }
232 230
233 231 static boolean_t
234 232 proc_issystem(pid_t pid)
235 233 {
236 234 char pc_clname[PC_CLNMSZ];
237 235
238 236 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
239 237 PC_KY_NULL) != -1)
240 238 return (strcmp(pc_clname, "SYS") == 0);
241 239
242 240 return (B_TRUE);
243 241 }
244 242
/*
 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 * Failures are silent except for a debug log entry; this is a best-effort
 * notification hook, not a critical path.
 */
static void
run_over_cmd()
{
	int ctfd;		/* process contract template fd */
	int err;
	pid_t childpid;
	siginfo_t info;
	ctid_t ct;

	/*
	 * Before we enter the zone, we need to create a new process contract
	 * for the child, as required by zone_enter().
	 */
	if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
		return;
	/*
	 * Make the child's contract non-critical and non-informative, fatal
	 * only on hardware error, and scoped to the child's process group.
	 */
	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
	    ct_tmpl_activate(ctfd) != 0) {
		(void) close(ctfd);
		return;
	}

	childpid = fork();
	switch (childpid) {
	case -1:	/* fork failed; deactivate template and give up */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		break;
	case 0: /* Child */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		if (zone_enter(zid) == -1)
			_exit(errno);
		err = system(over_cmd);
		_exit(err);
		break;
	default: /* Parent */
		/* Pick up the contract created by the fork so we can drop it */
		if (contract_latest(&ct) == -1)
			ct = -1;
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		err = waitid(P_PID, childpid, &info, WEXITED);
		(void) contract_abandon_id(ct);
		if (err == -1 || info.si_status != 0)
			debug("over_cmd failed");
		break;
	}
}
298 296
299 297 /*
300 298 * Get the next mapping.
301 299 */
302 300 static prmap_t *
303 301 nextmapping(proc_map_t *pmp)
304 302 {
305 303 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
306 304 return (NULL);
307 305
308 306 return (&pmp->pr_mapp[pmp->pr_curr++]);
309 307 }
310 308
/*
 * Initialize the proc_map_t to access the first mapping of an address space.
 * Reads the process's /proc map file into a freshly allocated array and
 * returns the first mapping (via nextmapping), or NULL on any failure.
 * The caller owns cur->pr_mapp and must free() it.
 */
static prmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
	int fd;
	int res;
	struct stat st;
	char pathbuf[MAXPATHLEN];

	bzero(pmp, sizeof (proc_map_t));
	pmp->pr_nmap = -1;

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
		return (NULL);

redo:
	errno = 0;
	if (fstat(fd, &st) != 0)
		goto done;

	if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		goto done;
	}
	(void) bzero(pmp->pr_mapp, st.st_size);

	errno = 0;
	if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
		free(pmp->pr_mapp);
		pmp->pr_mapp = NULL;
		/*
		 * A partial read or E2BIG suggests the map changed size
		 * between fstat and pread (the address space is live), so
		 * restat and retry; any other failure is terminal.
		 */
		if (res > 0 || errno == E2BIG) {
			goto redo;
		} else {
			debug("pid %ld cannot read xmap\n", pid);
			goto done;
		}
	}

	pmp->pr_nmap = st.st_size / sizeof (prmap_t);

done:
	(void) close(fd);
	/* On failure pr_mapp is NULL, so nextmapping() yields NULL here. */
	return (nextmapping(pmp));
}
358 356
359 357 /*
360 358 * Attempt to invalidate the entire mapping from within the given process's
361 359 * address space. May return nonzero with errno as:
362 360 * ESRCH - process not found
363 361 * ENOMEM - segment not found
364 362 * EINVAL - mapping exceeds a single segment
365 363 */
366 364 static int
367 365 pageout_mapping(pid_t pid, prmap_t *pmp)
368 366 {
369 367 int res;
370 368
371 369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
372 370 return (0);
373 371
374 372 errno = 0;
375 373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
376 374 pmp->pr_size);
377 375
378 376 return (res);
379 377 }
380 378
|
↓ open down ↓ |
204 lines elided |
↑ open up ↑ |
/*
 * Work through a process paging out mappings until the whole address space was
 * examined or the excess is < 0. Return our estimate of the updated excess.
 *
 * "excess" is the number of KB the zone is over its cap; each mapping
 * invalidation is credited against it using the delta in the process's
 * psinfo rss. Returns 0 immediately if we are shutting down.
 */
static int64_t
pageout_process(pid_t pid, int64_t excess)
{
	int psfd;		/* fd for the process's psinfo file */
	prmap_t *pmap;
	proc_map_t cur;
	int64_t sum_d_rss, d_rss;	/* cumulative / per-step rss delta */
	int64_t old_rss;
	int map_cnt;
	psinfo_t psinfo;
	char pathbuf[MAXPATHLEN];

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
	    pid);
	if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
		return (excess);

	cur.pr_mapp = NULL;

	if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
		goto done;

	old_rss = (int64_t)psinfo.pr_rssize;
	map_cnt = 0;

	/* If unscannable, skip it. */
	if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
		debug("pid %ld: system process, skipping %s\n",
		    pid, psinfo.pr_psargs);
		goto done;
	}

	/* If tiny RSS (16KB), skip it. */
	if (old_rss <= 16) {
		debug("pid %ld: skipping, RSS %lldKB %s\n",
		    pid, old_rss, psinfo.pr_psargs);
		goto done;
	}

	/* Get segment residency information. */
	pmap = init_map(&cur, pid);

	/* Skip process if it has no mappings. */
	if (pmap == NULL) {
		debug("pid %ld: map unreadable; ignoring\n", pid);
		goto done;
	}

	debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
	    pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);

	/*
	 * Within the process's address space, attempt to page out mappings.
	 */
	sum_d_rss = 0;
	while (excess > 0 && pmap != NULL && !shutting_down) {
		/* invalidate the entire mapping */
		if (pageout_mapping(pid, pmap) < 0)
			debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
			    pid, (void *)pmap->pr_vaddr,
			    (long)pmap->pr_size / 1024L, errno);

		map_cnt++;

		/*
		 * Re-check the process rss and get the delta.
		 */
		if (pread(psfd, &psinfo, sizeof (psinfo), 0)
		    != sizeof (psinfo)) {
			/*
			 * Can't re-read psinfo (process presumably exited);
			 * credit its last-seen rss against the excess.
			 */
			excess -= old_rss;
			goto done;
		}

		d_rss = (int64_t)psinfo.pr_rssize - old_rss;
		old_rss = (int64_t)psinfo.pr_rssize;
		sum_d_rss += d_rss;

		/*
		 * d_rss hopefully should be negative (or 0 if nothing
		 * invalidated) but can be positive if more got paged in.
		 */
		excess += d_rss;

		if (excess <= 0) {
			debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
			    "excess %lldKB\n", pid, map_cnt,
			    (unsigned long long)sum_d_rss, (long long)excess);
			map_cnt = 0;

			/*
			 * If we're actually under, this will suspend checking
			 * in the middle of this process's address space.
			 */
			excess = check_suspend();
			if (shutting_down)
				goto done;

			/*
			 * since we might have suspended, re-read process's rss
			 */
			if (pread(psfd, &psinfo, sizeof (psinfo), 0)
			    != sizeof (psinfo)) {
				excess -= old_rss;
				goto done;
			}

			old_rss = (int64_t)psinfo.pr_rssize;

			debug("pid %ld: resume pageout; excess %lld\n", pid,
			    (long long)excess);
			sum_d_rss = 0;
		}

		pmap = nextmapping(&cur);
	}

	debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
	    pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);

done:
	if (cur.pr_mapp != NULL)
		free(cur.pr_mapp);

	(void) close(psfd);

	if (shutting_down)
		return (0);

	return (excess);
}
515 513
/*
 * Get the zone's RSS data, in KB. Returns 0 on shutdown or error.
 *
 * Updates the file-scope fast_rss/accurate_rss/scale_rss/prev_fast_rss
 * globals (kept global for mdb visibility — see their declarations).
 */
static uint64_t
get_mem_info()
{
	uint64_t n = 1;			/* in/out result count for vmusage */
	zsd_vmusage64_t buf;
	uint64_t tmp_rss;		/* unscaled fast rss, saved for later */
	DIR *pdir = NULL;
	struct dirent *dent;

	/*
	 * Start by doing the fast, cheap RSS calculation using the rss value
	 * in psinfo_t. Because that's per-process, it can lead to double
	 * counting some memory and overestimating how much is being used, but
	 * as long as that's not over the cap, then we don't need do the
	 * expensive calculation.
	 *
	 * If we have to do the expensive calculation, we remember the scaling
	 * factor so that we can try to use that on subsequent iterations for
	 * the fast rss.
	 */
	if (shutting_down)
		return (0);

	if ((pdir = opendir(zoneproc)) == NULL)
		return (0);

	accurate_rss = 0;
	fast_rss = 0;
	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
		pid_t pid;
		int psfd;
		int64_t rss;
		char pathbuf[MAXPATHLEN];
		psinfo_t psinfo;

		if (strcmp(".", dent->d_name) == 0 ||
		    strcmp("..", dent->d_name) == 0)
			continue;

		/* skip the kernel (0) and init (1) */
		pid = atoi(dent->d_name);
		if (pid == 0 || pid == 1)
			continue;

		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
		    zoneproc, pid);

		/* Unreadable psinfo contributes 0; the process may have gone */
		rss = 0;
		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
			    sizeof (psinfo))
				rss = (int64_t)psinfo.pr_rssize;

			(void) close(psfd);
		}

		fast_rss += rss;
	}

	(void) closedir(pdir);

	if (shutting_down)
		return (0);

	debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
	    scale_rss, prev_fast_rss);

	/* see if we can get by with a scaled fast rss */
	tmp_rss = fast_rss;
	if (scale_rss > 1 && prev_fast_rss > 0) {
		/*
		 * Only scale the fast value if it hasn't ballooned too much
		 * to trust.
		 */
		if (fast_rss / prev_fast_rss < 2) {
			fast_rss /= scale_rss;
			debug("scaled fast rss: %lluKB\n", fast_rss);
		}
	}

	if (fast_rss <= zone_rss_cap || skip_vmusage) {
		uint64_t zone_rss_bytes;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
		return (fast_rss);
	}

	buf.vmu_id = zid;

	/* get accurate usage (cached data may be up to 5 seconds old) */
	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
	    (uintptr_t)&buf, (uintptr_t)&n) != 0) {
		debug("vmusage failed\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (n > 1) {
		/* This should never happen */
		debug("vmusage returned more than one result\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (buf.vmu_id != zid) {
		/* This should never happen */
		debug("vmusage returned the incorrect zone\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	accurate_rss = buf.vmu_rss_all / 1024;

	/* calculate scaling factor to use for fast_rss from now on */
	if (accurate_rss > 0) {
		scale_rss = fast_rss / accurate_rss;
		debug("new scaling factor: %llu\n", scale_rss);
		/* remember the fast rss when we had to get the accurate rss */
		prev_fast_rss = tmp_rss;
	}

	debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
	    scale_rss, prev_fast_rss);
	return (accurate_rss);
}
645 643
646 644 /*
647 645 * Needed to read the zones physical-memory-cap rctl.
648 646 */
649 647 static struct ps_prochandle *
650 648 grab_zone_proc()
651 649 {
652 650 DIR *dirp;
653 651 struct dirent *dentp;
654 652 struct ps_prochandle *ph = NULL;
655 653 int tmp;
656 654
657 655 if ((dirp = opendir(zoneproc)) == NULL)
658 656 return (NULL);
659 657
660 658 while (!shutting_down && (dentp = readdir(dirp))) {
661 659 int pid;
662 660
663 661 if (strcmp(".", dentp->d_name) == 0 ||
664 662 strcmp("..", dentp->d_name) == 0)
665 663 continue;
666 664
667 665 pid = atoi(dentp->d_name);
668 666 /* attempt to grab process */
669 667 if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
670 668 if (Psetflags(ph, PR_RLC) == 0) {
671 669 if (Pcreate_agent(ph) == 0) {
672 670 (void) closedir(dirp);
673 671 return (ph);
674 672 }
675 673 }
676 674 Prelease(ph, 0);
677 675 }
678 676 }
679 677
680 678 (void) closedir(dirp);
681 679 return (NULL);
682 680 }
683 681
684 682 static uint64_t
685 683 get_zone_cap()
686 684 {
687 685 rctlblk_t *rblk;
688 686 uint64_t mcap;
689 687 struct ps_prochandle *ph;
690 688
691 689 if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
692 690 return (UINT64_MAX);
693 691
694 692 if ((ph = grab_zone_proc()) == NULL) {
695 693 free(rblk);
696 694 return (UINT64_MAX);
697 695 }
698 696
699 697 if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
700 698 RCTL_FIRST)) {
701 699 Pdestroy_agent(ph);
702 700 Prelease(ph, 0);
703 701 free(rblk);
704 702 return (UINT64_MAX);
705 703 }
706 704
707 705 Pdestroy_agent(ph);
708 706 Prelease(ph, 0);
709 707
710 708 mcap = rctlblk_get_value(rblk);
711 709 free(rblk);
712 710 return (mcap);
713 711 }
714 712
/*
 * check_suspend is invoked at the beginning of every pass through the process
 * list or after we've paged out enough so that we think the excess is under
 * the cap. The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap. The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * Depending on the percentage of penetration of the zone's rss into the
 * cap we sleep for longer or shorter amounts. This reduces the impact of this
 * work on the system, which is important considering that each zone will be
 * monitoring its rss.
 *
 * Returns the excess (KB over the cap, plus a 5% addon) when over the cap,
 * or 0 when shutting down.
 */
static int64_t
check_suspend()
{
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;		/* 5% of cap; extra pageout target */
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS(KB) */

		/*
		 * Check if the debug log files exists and enable or disable
		 * debug.
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			/* Convert bytes to KB; UINT64_MAX means "no cap". */
			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			/*
			 * We allow the memory cap tunables to be changed on
			 * the fly.
			 */
			get_mcap_tunables();

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info();

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If necessary, updates stats.
		 */

		/*
		 * If it looks like we did some paging out since last over the
		 * cap then update the kstat so we can approximate how much was
		 * paged out.
		 */
		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			uint64_t n = 1;

			/* Increment "nover" kstat. */
			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

			if (!skip_pf_throttle) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up. If we are persistently over the
				 * cap the delay ramps up to a max of 2000usecs.
				 * Note that for delays less than 1 tick
				 * (i.e. all of these) we busy-wait in as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
				    &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, then we want to
			 * page out a little extra instead of stopping
			 * right at the cap. To do this we add 5% to
			 * the excess so that pageout_process will work
			 * a little longer before stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage. Also, scale the acceptable age of
		 * cached results from vm_getusage. We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
		} else {
			sleep_time = 30;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
	}

	return (0);
}
906 904
907 905 static void
908 906 get_mcap_tunables()
909 907 {
910 908 zone_dochandle_t handle;
911 909 struct zone_attrtab attr;
912 910
913 911 over_cmd[0] = '\0';
914 912 if ((handle = zonecfg_init_handle()) == NULL)
915 913 return;
916 914
917 915 if (zonecfg_get_handle(zone_name, handle) != Z_OK)
918 916 goto done;
919 917
920 918 /* Reset to defaults in case rebooting and settings have changed */
921 919 over_cmd[0] = '\0';
922 920 skip_vmusage = B_FALSE;
923 921 skip_pageout = B_FALSE;
924 922 skip_pf_throttle = B_FALSE;
925 923
926 924 if (zonecfg_setattrent(handle) != Z_OK)
927 925 goto done;
928 926 while (zonecfg_getattrent(handle, &attr) == Z_OK) {
929 927 if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
930 928 (void) strlcpy(over_cmd, attr.zone_attr_value,
931 929 sizeof (over_cmd));
932 930 } else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
933 931 if (strcmp("true", attr.zone_attr_value) == 0)
934 932 skip_vmusage = B_TRUE;
935 933 } else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
936 934 if (strcmp("true", attr.zone_attr_value) == 0)
937 935 skip_pageout = B_TRUE;
938 936 } else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
939 937 if (strcmp("true", attr.zone_attr_value) == 0)
940 938 skip_pf_throttle = B_TRUE;
941 939 }
942 940 }
943 941 (void) zonecfg_endattrent(handle);
944 942
945 943 done:
946 944 zonecfg_fini_handle(handle);
947 945 }
948 946
949 947 /* ARGSUSED */
950 948 static int
951 949 chk_proc_fs(void *data, const char *spec, const char *dir,
952 950 const char *fstype, const char *opt)
953 951 {
954 952 if (fstype != NULL && strcmp(fstype, "proc") == 0)
955 953 *((boolean_t *)data) = B_TRUE;
956 954
957 955 return (0);
958 956 }
959 957
960 958 static boolean_t
961 959 has_proc()
962 960 {
963 961 brand_handle_t bh;
964 962 boolean_t fnd = B_FALSE;
965 963
966 964 if ((bh = brand_open(brand_name)) != NULL) {
967 965 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
968 966 }
969 967
970 968 brand_close(bh);
971 969 return (fnd);
972 970 }
973 971
/*
 * We run this loop for brands with no /proc to simply update the RSS, using
 * the cheap GZ /proc data, every 5 minutes.
 *
 * Runs until shutting_down is set; returns early if the global-zone /proc
 * cannot be opened.
 */
static void
no_procfs()
{
	DIR *pdir = NULL;
	struct dirent *dent;
	uint64_t zone_rss_bytes;

	/* Give the zone time to start up before the first scan. */
	(void) sleep_shutdown(30);
	while (!shutting_down) {
		/*
		 * Just do the fast, cheap RSS calculation using the rss value
		 * in psinfo_t. Because that's per-process, it can lead to
		 * double counting some memory and overestimating how much is
		 * being used. Since there is no /proc in the zone, we use the
		 * GZ /proc and check for the correct zone.
		 */
		if ((pdir = opendir("/proc")) == NULL)
			return;

		fast_rss = 0;
		while (!shutting_down && (dent = readdir(pdir)) != NULL) {
			pid_t pid;
			int psfd;
			int64_t rss;
			char pathbuf[MAXPATHLEN];
			psinfo_t psinfo;

			if (strcmp(".", dent->d_name) == 0 ||
			    strcmp("..", dent->d_name) == 0)
				continue;

			/* skip the kernel (0) and init (1) */
			pid = atoi(dent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			(void) snprintf(pathbuf, sizeof (pathbuf),
			    "/proc/%d/psinfo", pid);

			rss = 0;
			if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
				if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
				    sizeof (psinfo)) {
					/* only count this zone's processes */
					if (psinfo.pr_zoneid == zid)
						rss = (int64_t)psinfo.pr_rssize;
				}

				(void) close(psfd);
			}

			fast_rss += rss;
		}

		(void) closedir(pdir);

		if (shutting_down)
			return;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);

		/* re-scan every 5 minutes */
		(void) sleep_shutdown(300);
	}
}
1042 1040
/*
 * Thread that checks zone's memory usage and when over the cap, goes through
 * the zone's process list trying to pageout processes to get under the cap.
 */
static void
mcap_zone()
{
	DIR *pdir = NULL;
	int64_t excess;		/* amount (KB) the zone is over its cap */

	debug("thread startup\n");

	/* Load the phys-mcap tuning attributes from the zone config. */
	get_mcap_tunables();

	/*
	 * If the zone has no /proc filesystem, we can't use the fast algorithm
	 * to check RSS or pageout any processes. All we can do is periodically
	 * update its RSS kstat using the expensive syscall.
	 */
	if (!has_proc()) {
		no_procfs();
		debug("thread shutdown\n");
		return;
	}

	/*
	 * When first starting it is likely lots of other zones are starting
	 * too because the system is booting. Since we just started the zone
	 * we're not worried about being over the cap right away, so we let
	 * things settle a bit and tolerate some older data here to minimize
	 * the load on the system.
	 */
	(void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */

	/* Wait until zone's /proc is mounted */
	while (!shutting_down) {
		struct stat st;

		if (stat(zoneproc, &st) == 0 &&
		    strcmp(st.st_fstype, "proc") == 0)
			break;
		sleep_shutdown(5);
	}

	/* Open zone's /proc and walk entries; retry until it succeeds. */
	while (!shutting_down) {
		if ((pdir = opendir(zoneproc)) != NULL)
			break;
		sleep_shutdown(5);
	}

	while (!shutting_down) {
		struct dirent *dirent;

		/* Wait until we've gone over the cap. */
		excess = check_suspend();

		debug("starting to scan, excess %lldk\n", (long long)excess);

		/*
		 * If the admin configured an over-capacity command, run it
		 * first and re-sample RSS; the command may have already
		 * brought the zone back under its cap.
		 */
		if (over_cmd[0] != '\0') {
			uint64_t zone_rss;	/* total RSS(KB) */

			debug("run phys_mcap_cmd: %s\n", over_cmd);
			run_over_cmd();

			zone_rss = get_mem_info();
			excess = zone_rss - zone_rss_cap;
			debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
			    zone_rss, zone_rss_cap, excess);
			if (excess <= 0)
				continue;
		}

		/*
		 * Walk the zone's processes, paging each out until the
		 * excess is gone or there are no more entries to try.
		 */
		while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
			pid_t pid;

			if (strcmp(".", dirent->d_name) == 0 ||
			    strcmp("..", dirent->d_name) == 0)
				continue;

			pid = atoi(dirent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			if (skip_pageout)
				(void) sleep_shutdown(2);
			else
				excess = pageout_process(pid, excess);

			if (excess <= 0) {
				debug("apparently under; excess %lld\n",
				    (long long)excess);
				/* Double check the current excess */
				excess = check_suspend();
			}
		}

		debug("process pass done; excess %lld\n", (long long)excess);
		rewinddir(pdir);

		/* If pageout is disabled, rest between passes to avoid spinning. */
		if (skip_pageout)
			(void) sleep_shutdown(120);
	}

	if (pdir != NULL)
		(void) closedir(pdir);
	debug("thread shutdown\n");
}
1151 1149
1152 1150 void
1153 1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1154 1152 {
1155 1153 int res;
1156 1154
1157 1155 shutting_down = 0;
1158 1156 zid = id;
1159 - logp = zlogp;
1160 1157
1161 1158 /* all but the lx brand currently use /proc */
1162 1159 if (strcmp(brand_name, "lx") == 0) {
1163 1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1164 1161 "%s/root/native/proc", zonepath);
1165 1162 } else {
1166 1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1167 1164 zonepath);
1168 1165 }
1169 1166
1170 1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1171 1168 zonepath);
1172 1169
1173 1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1174 1171 &mcap_tid);
1175 1172 if (res != 0) {
1176 1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1177 1174 res);
1178 1175 mcap_tid = 0;
1179 1176 }
1180 1177 }
1181 1178
1182 1179 void
1183 1180 destroy_mcap_thread()
1184 1181 {
1185 1182 if (mcap_tid != 0) {
1186 1183 shutting_down = 1;
1187 1184 (void) cond_signal(&shutdown_cv);
1188 1185 (void) thr_join(mcap_tid, NULL, NULL);
1189 1186 mcap_tid = 0;
1190 1187 }
1191 1188 }
|
↓ open down ↓ |
22 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX