--- old/usr/src/cmd/zoneadmd/mcap.c
+++ new/usr/src/cmd/zoneadmd/mcap.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 23 * Copyright 2014, Joyent, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * This file implements the code which runs a thread inside zoneadmd to cap
28 28 * the associated zone's physical memory. A thread to do this is started
29 29 * when the zone boots and is halted when the zone shuts down.
30 30 *
31 31 * Because of the way that the VM system is currently implemented, there is no
32 32 * way to go from the bottom up (page to process to zone). Thus, there is no
33 33 * obvious way to hook an rctl into the kernel's paging code to enforce a hard
34 34 * memory cap. Instead, we implement a soft physical memory cap which looks
35 35 * at the zone's overall rss and once it is over the cap, works from the top
36 36 * down (zone to process to page), looking at zone processes, to determine
37 37 * what to try to pageout to get the zone under its memory cap.
38 38 *
39 39 * The code uses the fast, cheap, but potentially very inaccurate sum of the
40 40 * rss values from psinfo_t to first approximate the zone's rss and will
41 41 * fallback to the vm_getusage syscall to determine the zone's rss if needed.
42 42 * It then checks the rss against the zone's zone.max-physical-memory rctl.
43 43 * Once the zone goes over its cap, then this thread will work through the
44 44 * zone's /proc process list, Pgrab-bing each process and stepping through the
45 45 * address space segments attempting to use pr_memcntl(...MS_INVALCURPROC...)
46 46 * to pageout pages, until the zone is again under its cap.
47 47 *
48 48 * Although zone memory capping is implemented as a soft cap by this user-level
49 49 * thread, the interfaces around memory caps that are exposed to the user are
50 50 * the standard ones; an rctl and kstats. This thread uses the rctl value
51 51 * to obtain the cap and works with the zone kernel code to update the kstats.
52 52 * If the implementation ever moves into the kernel, these exposed interfaces
53 53 * do not need to change.
54 54 *
55 55 * The thread adaptively sleeps, periodically checking the state of the
56 56 * zone. As the zone's rss gets closer to the cap, the thread will wake up
57 57 * more often to check the zone's status. Once the zone is over the cap,
58 58 * the thread will work to pageout until the zone is under the cap, as shown
59 59 * by updated vm_usage data.
60 60 *
61 61 * NOTE: The pagedata page maps (at least on x86) are not useful. Those flags
62 62 * are set by hrm_setbits() and on x86 that code path is only executed by
63 63 * segvn_pagelock -> hat_setstat -> hrm_setbits
64 64 * segvn_softunlock -^
65 65 * On SPARC there is an additional code path which may make this data
66 66 * useful (sfmmu_ttesync), but since it is not generic, we ignore the page
67 67 * maps. If we ever fix this issue, then we could generalize this mcap code to
68 68 * do more with the data on active pages.
69 69 *
70 70 * For debugging, touch the file {zonepath}/mcap_debug.log. This will
71 71 * cause the thread to start logging its actions into that file (it may take
72 72 * a minute or two if the thread is currently sleeping). Removing that
73 73 * file will cause logging to stop.
74 74 */
75 75
76 76 #include <sys/mman.h>
77 77 #include <sys/param.h>
78 78 #include <sys/stat.h>
79 79 #include <sys/types.h>
80 80 #include <assert.h>
81 81 #include <errno.h>
82 82 #include <fcntl.h>
83 83 #include <libproc.h>
84 84 #include <limits.h>
85 85 #include <procfs.h>
86 86 #include <stdio.h>
87 87 #include <stdlib.h>
88 88 #include <strings.h>
89 89 #include <time.h>
90 90 #include <unistd.h>
91 91 #include <sys/priocntl.h>
92 92 #include <dirent.h>
93 93 #include <zone.h>
94 94 #include <libzonecfg.h>
95 95 #include <thread.h>
96 96 #include <values.h>
97 97 #include <sys/vm_usage.h>
98 98 #include <sys/resource.h>
99 99 #include <sys/debug.h>
100 100 #include <synch.h>
101 101 #include <wait.h>
102 102 #include <libcontract.h>
103 103 #include <libcontract_priv.h>
104 104 #include <sys/contract/process.h>
105 105 #include "zoneadmd.h"
106 106
107 107 /* round up to next y = 2^n */
108 108 #define ROUNDUP(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
109 109
110 110 #define CAP_REFRESH ((uint64_t)300 * NANOSEC) /* every 5 minutes */
111 111
112 112 /*
113 113 * zonecfg attribute tunables for memory capping.
114 114 * phys-mcap-cmd
115 115 * type: string
116 116 * specifies a command that can be run when over the cap
117 117 * phys-mcap-no-vmusage
118 118 * type: boolean
119 119 * true disables vm_getusage and just uses zone's proc. rss sum
120 120 * phys-mcap-no-pageout
121 121 * type: boolean
122 122 * true disables pageout when over
123 123 * phys-mcap-no-pf-throttle
124 124 * type: boolean
125 125 * true disables page fault throttling when over
126 126 */
127 127 #define TUNE_CMD "phys-mcap-cmd"
128 128 #define TUNE_NVMU "phys-mcap-no-vmusage"
129 129 #define TUNE_NPAGE "phys-mcap-no-pageout"
130 130 #define TUNE_NPFTHROT "phys-mcap-no-pf-throttle"
131 131
132 132 /*
133 + * The large mapping value was derived empirically by seeing that mappings
134 + * much bigger than 16mb sometimes take a relatively long time to invalidate
135 + * (significant fraction of a second).
136 + */
137 +#define SEC_INTERIM 4 /* num secs to pause after stopped too long */
138 +#define MSEC_TOO_LONG 100 /* release proc. after stopped for 100ms */
139 +#define LARGE_MAPPING 16384 /* >= 16MB in KB - pageout in chunks */
140 +
141 +/*
133 142 * These are only used in get_mem_info but global. We always need scale_rss and
134 143 * prev_fast_rss to be persistent but we also have the other two global so we
135 144 * can easily see these with mdb.
136 145 */
137 146 uint64_t scale_rss = 0;
138 147 uint64_t prev_fast_rss = 0;
139 148 uint64_t fast_rss = 0;
140 149 uint64_t accurate_rss = 0;
141 150
static char zoneproc[MAXPATHLEN];	/* path of the zone's /proc */
static char debug_log[MAXPATHLEN];	/* path of {zonepath}/mcap_debug.log */
static zoneid_t zid;			/* ID of the zone we're capping */
static mutex_t shutdown_mx;		/* protects shutting_down flag */
static cond_t shutdown_cv;		/* signalled to wake sleep_shutdown */
static int shutting_down = 0;		/* nonzero once zone halt begins */
static thread_t mcap_tid;		/* the memory-capping thread */
static FILE *debug_log_fp = NULL;	/* non-NULL while debug logging on */
static uint64_t zone_rss_cap;		/* RSS cap(KB) */
static char over_cmd[2 * BUFSIZ];	/* same size as zone_attr_value */
static boolean_t skip_vmusage = B_FALSE;	/* TUNE_NVMU */
static boolean_t skip_pageout = B_FALSE;	/* TUNE_NPAGE */
static boolean_t skip_pf_throttle = B_FALSE;	/* TUNE_NPFTHROT */

static int64_t check_suspend();
static void get_mcap_tunables();
160 167
/*
 * Structure to hold current state about a process address space that we're
 * working on.
 */
typedef struct {
	int pr_curr;		/* the # of the mapping we're working on */
	int pr_nmap;		/* number of mappings in address space */
	prmap_t *pr_mapp;	/* process's map array */
} proc_map_t;
170 177
/*
 * 64-bit-aligned image of vmusage_t so a 32-bit zoneadmd can exchange this
 * structure with an amd64 kernel via the SYS_rusagesys syscall.
 */
typedef struct zsd_vmusage64 {
	id_t vmu_zoneid;
	uint_t vmu_type;
	id_t vmu_id;
	/*
	 * An amd64 kernel will align the following uint64_t members, but a
	 * 32bit i386 process will not without help.
	 */
	int vmu_align_next_members_on_8_bytes;
	uint64_t vmu_rss_all;
	uint64_t vmu_rss_private;
	uint64_t vmu_rss_shared;
	uint64_t vmu_swap_all;
	uint64_t vmu_swap_private;
	uint64_t vmu_swap_shared;
} zsd_vmusage64_t;
187 194
188 195 /*
189 196 * Output a debug log message.
190 197 */
191 198 /*PRINTFLIKE1*/
192 199 static void
193 200 debug(char *fmt, ...)
194 201 {
195 202 va_list ap;
196 203
197 204 if (debug_log_fp == NULL)
198 205 return;
199 206
200 207 va_start(ap, fmt);
201 208 (void) vfprintf(debug_log_fp, fmt, ap);
202 209 va_end(ap);
203 210 (void) fflush(debug_log_fp);
204 211 }
205 212
206 213 /*
207 214 * Like sleep(3C) but can be interupted by cond_signal which is posted when
208 215 * we're shutting down the mcap thread.
209 216 */
210 217 static void
211 218 sleep_shutdown(int secs)
212 219 {
213 220 timestruc_t to;
214 221
215 222 to.tv_sec = secs;
216 223 to.tv_nsec = 0;
217 224
218 225 (void) mutex_lock(&shutdown_mx);
219 226 if (!shutting_down)
220 227 (void) cond_reltimedwait(&shutdown_cv, &shutdown_mx, &to);
221 228 (void) mutex_unlock(&shutdown_mx);
222 229 }
223 230
224 231 static boolean_t
225 232 proc_issystem(pid_t pid)
226 233 {
227 234 char pc_clname[PC_CLNMSZ];
228 235
229 236 if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
230 237 PC_KY_NULL) != -1)
231 238 return (strcmp(pc_clname, "SYS") == 0);
232 239
233 240 return (B_TRUE);
234 241 }
235 242
/*
 * Fork a child that enters the zone and runs the "phys-mcap-cmd" command.
 * Best-effort: all failures are silent except a logged debug message when
 * the command itself fails.
 */
static void
run_over_cmd()
{
	int ctfd;
	int err;
	pid_t childpid;
	siginfo_t info;
	ctid_t ct;

	/*
	 * Before we enter the zone, we need to create a new process contract
	 * for the child, as required by zone_enter().
	 */
	if ((ctfd = open64("/system/contract/process/template", O_RDWR)) == -1)
		return;
	if (ct_tmpl_set_critical(ctfd, 0) != 0 ||
	    ct_tmpl_set_informative(ctfd, 0) != 0 ||
	    ct_pr_tmpl_set_fatal(ctfd, CT_PR_EV_HWERR) != 0 ||
	    ct_pr_tmpl_set_param(ctfd, CT_PR_PGRPONLY) != 0 ||
	    ct_tmpl_activate(ctfd) != 0) {
		(void) close(ctfd);
		return;
	}

	childpid = fork();
	switch (childpid) {
	case -1:	/* fork failed; deactivate template and bail */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		break;
	case 0:	/* Child */
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		if (zone_enter(zid) == -1)
			_exit(errno);
		err = system(over_cmd);
		_exit(err);
		break;
	default:	/* Parent */
		/* remember the child's contract so we can abandon it below */
		if (contract_latest(&ct) == -1)
			ct = -1;
		(void) ct_tmpl_clear(ctfd);
		(void) close(ctfd);
		err = waitid(P_PID, childpid, &info, WEXITED);
		(void) contract_abandon_id(ct);
		if (err == -1 || info.si_status != 0)
			debug("over_cmd failed");
		break;
	}
}
289 296
290 297 /*
291 298 * Get the next mapping.
292 299 */
293 300 static prmap_t *
294 301 nextmapping(proc_map_t *pmp)
295 302 {
296 303 if (pmp->pr_mapp == NULL || pmp->pr_curr >= pmp->pr_nmap)
297 304 return (NULL);
298 305
299 306 return (&pmp->pr_mapp[pmp->pr_curr++]);
300 307 }
301 308
/*
 * Initialize the proc_map_t to access the first mapping of an address space.
 * Reads the whole of {zoneproc}/{pid}/map into a malloc'd buffer; on
 * success returns the first mapping (via nextmapping), otherwise NULL.
 * Caller is responsible for freeing pmp->pr_mapp.
 */
static prmap_t *
init_map(proc_map_t *pmp, pid_t pid)
{
	int fd;
	int res;
	struct stat st;
	char pathbuf[MAXPATHLEN];

	bzero(pmp, sizeof (proc_map_t));
	pmp->pr_nmap = -1;

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/map", zoneproc, pid);
	if ((fd = open(pathbuf, O_RDONLY, 0)) < 0)
		return (NULL);

redo:
	errno = 0;
	if (fstat(fd, &st) != 0)
		goto done;

	if ((pmp->pr_mapp = malloc(st.st_size)) == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		goto done;
	}
	(void) bzero(pmp->pr_mapp, st.st_size);

	errno = 0;
	if ((res = pread(fd, pmp->pr_mapp, st.st_size, 0)) != st.st_size) {
		free(pmp->pr_mapp);
		pmp->pr_mapp = NULL;
		/*
		 * A short read or E2BIG means the map changed size between
		 * the fstat and the pread; re-stat and retry with a buffer
		 * of the new size.
		 */
		if (res > 0 || errno == E2BIG) {
			goto redo;
		} else {
			debug("pid %ld cannot read xmap\n", pid);
			goto done;
		}
	}

	pmp->pr_nmap = st.st_size / sizeof (prmap_t);

done:
	(void) close(fd);
	/* pr_mapp is NULL on the failure paths, so this returns NULL there */
	return (nextmapping(pmp));
}
349 356
350 357 /*
351 358 * Attempt to invalidate the entire mapping from within the given process's
352 359 * address space. May return nonzero with errno as:
353 360 * ESRCH - process not found
354 361 * ENOMEM - segment not found
355 362 * EINVAL - mapping exceeds a single segment
356 363 */
357 364 static int
358 365 pageout_mapping(pid_t pid, prmap_t *pmp)
359 366 {
360 367 int res;
361 368
362 369 if (pmp->pr_mflags & MA_ISM || pmp->pr_mflags & MA_SHM)
363 370 return (0);
364 371
365 372 errno = 0;
366 373 res = syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP, pid, pmp->pr_vaddr,
367 374 pmp->pr_size);
368 375
369 376 return (res);
370 377 }
371 378
/*
 * Work through a process paging out mappings until the whole address space was
 * examined or the excess is < 0. Return our estimate of the updated excess.
 *
 * "excess" is the number of KB we believe the zone is over its cap; each
 * mapping invalidation is credited/debited by re-reading the process's rss
 * from psinfo and folding the delta back into the excess.
 */
static int64_t
pageout_process(pid_t pid, int64_t excess)
{
	int psfd;
	prmap_t *pmap;
	proc_map_t cur;
	int64_t sum_d_rss, d_rss;
	int64_t old_rss;
	int map_cnt;
	psinfo_t psinfo;
	char pathbuf[MAXPATHLEN];

	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo", zoneproc,
	    pid);
	if ((psfd = open(pathbuf, O_RDONLY, 0000)) < 0)
		return (excess);

	cur.pr_mapp = NULL;

	if (pread(psfd, &psinfo, sizeof (psinfo), 0) != sizeof (psinfo))
		goto done;

	old_rss = (int64_t)psinfo.pr_rssize;
	map_cnt = 0;

	/* If unscannable, skip it. */
	if (psinfo.pr_nlwp == 0 || proc_issystem(pid)) {
		debug("pid %ld: system process, skipping %s\n",
		    pid, psinfo.pr_psargs);
		goto done;
	}

	/* If tiny RSS (16KB), skip it. */
	if (old_rss <= 16) {
		debug("pid %ld: skipping, RSS %lldKB %s\n",
		    pid, old_rss, psinfo.pr_psargs);
		goto done;
	}

	/* Get segment residency information. */
	pmap = init_map(&cur, pid);

	/* Skip process if it has no mappings. */
	if (pmap == NULL) {
		debug("pid %ld: map unreadable; ignoring\n", pid);
		goto done;
	}

	debug("pid %ld: nmap %d sz %dKB rss %lldKB %s\n",
	    pid, cur.pr_nmap, psinfo.pr_size, old_rss, psinfo.pr_psargs);

	/*
	 * Within the process's address space, attempt to page out mappings.
	 */
	sum_d_rss = 0;
	while (excess > 0 && pmap != NULL && !shutting_down) {
		/* invalidate the entire mapping */
		if (pageout_mapping(pid, pmap) < 0)
			debug("pid %ld: mapping 0x%p %ldkb unpageable (%d)\n",
			    pid, (void *)pmap->pr_vaddr,
			    (long)pmap->pr_size / 1024L, errno);

		map_cnt++;

		/*
		 * Re-check the process rss and get the delta.
		 */
		if (pread(psfd, &psinfo, sizeof (psinfo), 0)
		    != sizeof (psinfo)) {
			/* process went away; assume its whole rss was freed */
			excess -= old_rss;
			goto done;
		}

		d_rss = (int64_t)psinfo.pr_rssize - old_rss;
		old_rss = (int64_t)psinfo.pr_rssize;
		sum_d_rss += d_rss;

		/*
		 * d_rss hopefully should be negative (or 0 if nothing
		 * invalidated) but can be positive if more got paged in.
		 */
		excess += d_rss;

		if (excess <= 0) {
			debug("pid %ld: (part.) nmap %d delta_rss %lldKB "
			    "excess %lldKB\n", pid, map_cnt,
			    (unsigned long long)sum_d_rss, (long long)excess);
			map_cnt = 0;

			/*
			 * If we're actually under, this will suspend checking
			 * in the middle of this process's address space.
			 */
			excess = check_suspend();
			if (shutting_down)
				goto done;

			/*
			 * since we might have suspended, re-read process's rss
			 */
			if (pread(psfd, &psinfo, sizeof (psinfo), 0)
			    != sizeof (psinfo)) {
				excess -= old_rss;
				goto done;
			}

			old_rss = (int64_t)psinfo.pr_rssize;

			debug("pid %ld: resume pageout; excess %lld\n", pid,
			    (long long)excess);
			sum_d_rss = 0;
		}

		pmap = nextmapping(&cur);
	}

	debug("pid %ld: nmap %d delta_rss %lldKB excess %lldKB\n",
	    pid, map_cnt, (unsigned long long)sum_d_rss, (long long)excess);

done:
	if (cur.pr_mapp != NULL)
		free(cur.pr_mapp);

	(void) close(psfd);

	if (shutting_down)
		return (0);

	return (excess);
}
506 513
/*
 * Get the zone's RSS data (in KB).  Returns 0 when shutting down or on any
 * failure to obtain usage data.
 */
static uint64_t
get_mem_info()
{
	uint64_t n = 1;
	zsd_vmusage64_t buf;
	uint64_t tmp_rss;
	DIR *pdir = NULL;
	struct dirent *dent;

	/*
	 * Start by doing the fast, cheap RSS calculation using the rss value
	 * in psinfo_t. Because that's per-process, it can lead to double
	 * counting some memory and overestimating how much is being used, but
	 * as long as that's not over the cap, then we don't need do the
	 * expensive calculation.
	 *
	 * If we have to do the expensive calculation, we remember the scaling
	 * factor so that we can try to use that on subsequent iterations for
	 * the fast rss.
	 */
	if (shutting_down)
		return (0);

	if ((pdir = opendir(zoneproc)) == NULL)
		return (0);

	accurate_rss = 0;
	fast_rss = 0;
	while (!shutting_down && (dent = readdir(pdir)) != NULL) {
		pid_t pid;
		int psfd;
		int64_t rss;
		char pathbuf[MAXPATHLEN];
		psinfo_t psinfo;

		if (strcmp(".", dent->d_name) == 0 ||
		    strcmp("..", dent->d_name) == 0)
			continue;

		/* skip init and zsched (pids 0 and 1 in the zone's /proc) */
		pid = atoi(dent->d_name);
		if (pid == 0 || pid == 1)
			continue;

		(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%d/psinfo",
		    zoneproc, pid);

		rss = 0;
		if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
			if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
			    sizeof (psinfo))
				rss = (int64_t)psinfo.pr_rssize;

			(void) close(psfd);
		}

		fast_rss += rss;
	}

	(void) closedir(pdir);

	if (shutting_down)
		return (0);

	debug("fast rss: %lluKB, scale: %llu, prev: %lluKB\n", fast_rss,
	    scale_rss, prev_fast_rss);

	/* see if we can get by with a scaled fast rss */
	tmp_rss = fast_rss;
	if (scale_rss > 1 && prev_fast_rss > 0) {
		/*
		 * Only scale the fast value if it hasn't ballooned too much
		 * to trust.
		 */
		if (fast_rss / prev_fast_rss < 2) {
			fast_rss /= scale_rss;
			debug("scaled fast rss: %lluKB\n", fast_rss);
		}
	}

	if (fast_rss <= zone_rss_cap || skip_vmusage) {
		uint64_t zone_rss_bytes;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);
		return (fast_rss);
	}

	buf.vmu_id = zid;

	/* get accurate usage (cached data may be up to 5 seconds old) */
	if (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE, VMUSAGE_A_ZONE, 5,
	    (uintptr_t)&buf, (uintptr_t)&n) != 0) {
		debug("vmusage failed\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (n > 1) {
		/* This should never happen */
		debug("vmusage returned more than one result\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	if (buf.vmu_id != zid) {
		/* This should never happen */
		debug("vmusage returned the incorrect zone\n");
		(void) sleep_shutdown(1);
		return (0);
	}

	accurate_rss = buf.vmu_rss_all / 1024;

	/* calculate scaling factor to use for fast_rss from now on */
	if (accurate_rss > 0) {
		scale_rss = fast_rss / accurate_rss;
		debug("new scaling factor: %llu\n", scale_rss);
		/* remember the fast rss when we had to get the accurate rss */
		prev_fast_rss = tmp_rss;
	}

	debug("accurate rss: %lluKB, scale: %llu, prev: %lluKB\n", accurate_rss,
	    scale_rss, prev_fast_rss);
	return (accurate_rss);
}
636 643
/*
 * Needed to read the zones physical-memory-cap rctl.  Walks the zone's
 * /proc looking for any process we can Pgrab and plant an agent LWP in;
 * returns the grabbed handle (with agent created) or NULL.  Caller must
 * Pdestroy_agent() and Prelease() the handle.
 */
static struct ps_prochandle *
grab_zone_proc()
{
	DIR *dirp;
	struct dirent *dentp;
	struct ps_prochandle *ph = NULL;
	int tmp;

	if ((dirp = opendir(zoneproc)) == NULL)
		return (NULL);

	while (!shutting_down && (dentp = readdir(dirp))) {
		int pid;

		if (strcmp(".", dentp->d_name) == 0 ||
		    strcmp("..", dentp->d_name) == 0)
			continue;

		pid = atoi(dentp->d_name);
		/* attempt to grab process */
		if ((ph = Pgrab(pid, 0, &tmp)) != NULL) {
			/* run-on-last-close so the victim resumes if we die */
			if (Psetflags(ph, PR_RLC) == 0) {
				if (Pcreate_agent(ph) == 0) {
					(void) closedir(dirp);
					return (ph);
				}
			}
			Prelease(ph, 0);
		}
	}

	(void) closedir(dirp);
	return (NULL);
}
674 681
/*
 * Read the zone.max-physical-memory rctl (in bytes) by running an agent in
 * one of the zone's processes.  Returns UINT64_MAX if the cap cannot be
 * determined.
 */
static uint64_t
get_zone_cap()
{
	rctlblk_t *rblk;
	uint64_t mcap;
	struct ps_prochandle *ph;

	if ((rblk = (rctlblk_t *)malloc(rctlblk_size())) == NULL)
		return (UINT64_MAX);

	if ((ph = grab_zone_proc()) == NULL) {
		free(rblk);
		return (UINT64_MAX);
	}

	if (pr_getrctl(ph, "zone.max-physical-memory", NULL, rblk,
	    RCTL_FIRST)) {
		Pdestroy_agent(ph);
		Prelease(ph, 0);
		free(rblk);
		return (UINT64_MAX);
	}

	Pdestroy_agent(ph);
	Prelease(ph, 0);

	mcap = rctlblk_get_value(rblk);
	free(rblk);
	return (mcap);
}
705 712
/*
 * check_suspend is invoked at the beginning of every pass through the process
 * list or after we've paged out enough so that we think the excess is under
 * the cap. The purpose is to periodically check the zone's rss and return
 * the excess when the zone is over the cap. The rest of the time this
 * function will sleep, periodically waking up to check the current rss.
 *
 * Depending on the percentage of penetration of the zone's rss into the
 * cap we sleep for longer or shorter amounts. This reduces the impact of this
 * work on the system, which is important considering that each zone will be
 * monitoring its rss.
 */
static int64_t
check_suspend()
{
	static hrtime_t last_cap_read = 0;
	static uint64_t addon;		/* extra 5% to page out when over */
	static uint64_t lo_thresh;	/* Thresholds for how long to sleep */
	static uint64_t hi_thresh;	/* when under the cap (80% & 90%). */
	static uint64_t prev_zone_rss = 0;
	static uint32_t pfdelay = 0;	/* usec page fault delay when over */

	/* Wait a second to give the async pageout a chance to catch up. */
	(void) sleep_shutdown(1);

	while (!shutting_down) {
		int64_t new_excess;
		int sleep_time;
		hrtime_t now;
		struct stat st;
		uint64_t zone_rss;	/* total RSS(KB) */

		/*
		 * Check if the debug log file exists and enable or disable
		 * debug accordingly.
		 */
		if (debug_log_fp == NULL) {
			if (stat(debug_log, &st) == 0)
				debug_log_fp = fopen(debug_log, "w");
		} else {
			if (stat(debug_log, &st) == -1) {
				(void) fclose(debug_log_fp);
				debug_log_fp = NULL;
			}
		}

		/*
		 * If the CAP_REFRESH interval has passed, re-get the current
		 * cap in case it has been dynamically updated.
		 */
		now = gethrtime();
		if (now - last_cap_read > CAP_REFRESH) {
			uint64_t mcap;

			last_cap_read = now;

			mcap = get_zone_cap();
			if (mcap != 0 && mcap != UINT64_MAX)
				zone_rss_cap = ROUNDUP(mcap, 1024) / 1024;
			else
				zone_rss_cap = UINT64_MAX;

			lo_thresh = (uint64_t)(zone_rss_cap * .8);
			hi_thresh = (uint64_t)(zone_rss_cap * .9);
			addon = (uint64_t)(zone_rss_cap * 0.05);

			/*
			 * We allow the memory cap tunables to be changed on
			 * the fly.
			 */
			get_mcap_tunables();

			debug("%s: %s\n", TUNE_CMD, over_cmd);
			debug("%s: %d\n", TUNE_NVMU, skip_vmusage);
			debug("%s: %d\n", TUNE_NPAGE, skip_pageout);
			debug("%s: %d\n", TUNE_NPFTHROT, skip_pf_throttle);
			debug("current cap %lluKB lo %lluKB hi %lluKB\n",
			    zone_rss_cap, lo_thresh, hi_thresh);
		}

		/* No cap, nothing to do. */
		if (zone_rss_cap == 0 || zone_rss_cap == UINT64_MAX) {
			debug("no cap, sleep 120 seconds\n");
			(void) sleep_shutdown(120);
			continue;
		}

		zone_rss = get_mem_info();

		/* calculate excess */
		new_excess = zone_rss - zone_rss_cap;

		debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
		    zone_rss, zone_rss_cap, new_excess);

		/*
		 * If necessary, updates stats.
		 */

		/*
		 * If it looks like we did some paging out since last over the
		 * cap then update the kstat so we can approximate how much was
		 * paged out.
		 */
		if (prev_zone_rss > zone_rss_cap && zone_rss < prev_zone_rss) {
			uint64_t diff;

			/* assume diff is num bytes we paged out */
			diff = (prev_zone_rss - zone_rss) * 1024;

			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
			    &diff, 0);
		}
		prev_zone_rss = zone_rss;

		if (new_excess > 0) {
			uint64_t n = 1;

			/* Increment "nover" kstat. */
			(void) zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &n, 0);

			if (!skip_pf_throttle) {
				/*
				 * Tell the kernel to start throttling page
				 * faults by some number of usecs to help us
				 * catch up. If we are persistently over the
				 * cap the delay ramps up to a max of 2000usecs.
				 * Note that for delays less than 1 tick
				 * (i.e. all of these) we busy-wait in as_fault.
				 *	delay	faults/sec
				 *	 125	8000
				 *	 250	4000
				 *	 500	2000
				 *	1000	1000
				 *	2000	 500
				 */
				if (pfdelay == 0)
					pfdelay = 125;
				else if (pfdelay < 2000)
					pfdelay *= 2;

				(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
				    &pfdelay, 0);
			}

			/*
			 * Once we go over the cap, then we want to
			 * page out a little extra instead of stopping
			 * right at the cap. To do this we add 5% to
			 * the excess so that pageout_process will work
			 * a little longer before stopping.
			 */
			return ((int64_t)(new_excess + addon));
		}

		/*
		 * At this point we are under the cap.
		 *
		 * Tell the kernel to stop throttling page faults.
		 *
		 * Scale the amount of time we sleep before rechecking the
		 * zone's memory usage. Also, scale the acceptable age of
		 * cached results from vm_getusage. We do this based on the
		 * penetration into the capped limit.
		 */
		if (pfdelay > 0) {
			pfdelay = 0;
			(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY,
			    &pfdelay, 0);
		}

		if (zone_rss <= lo_thresh) {
			sleep_time = 120;
		} else if (zone_rss <= hi_thresh) {
			sleep_time = 60;
		} else {
			sleep_time = 30;
		}

		debug("sleep %d seconds\n", sleep_time);
		(void) sleep_shutdown(sleep_time);
	}

	/* Shutting down, tell the kernel so it doesn't throttle */
	if (pfdelay > 0) {
		pfdelay = 0;
		(void) zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &pfdelay, 0);
	}

	return (0);
}
897 904
/*
 * (Re)load the phys-mcap-* tunables from the zone's configuration into the
 * over_cmd/skip_* globals.  Called periodically so the tunables can be
 * changed on the fly.
 */
static void
get_mcap_tunables()
{
	zone_dochandle_t handle;
	struct zone_attrtab attr;

	/* Clear over_cmd even if we can't read the config below. */
	over_cmd[0] = '\0';
	if ((handle = zonecfg_init_handle()) == NULL)
		return;

	if (zonecfg_get_handle(zone_name, handle) != Z_OK)
		goto done;

	/* Reset to defaults in case rebooting and settings have changed */
	over_cmd[0] = '\0';
	skip_vmusage = B_FALSE;
	skip_pageout = B_FALSE;
	skip_pf_throttle = B_FALSE;

	if (zonecfg_setattrent(handle) != Z_OK)
		goto done;
	while (zonecfg_getattrent(handle, &attr) == Z_OK) {
		if (strcmp(TUNE_CMD, attr.zone_attr_name) == 0) {
			(void) strlcpy(over_cmd, attr.zone_attr_value,
			    sizeof (over_cmd));
		} else if (strcmp(TUNE_NVMU, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_vmusage = B_TRUE;
		} else if (strcmp(TUNE_NPAGE, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_pageout = B_TRUE;
		} else if (strcmp(TUNE_NPFTHROT, attr.zone_attr_name) == 0) {
			if (strcmp("true", attr.zone_attr_value) == 0)
				skip_pf_throttle = B_TRUE;
		}
	}
	(void) zonecfg_endattrent(handle);

done:
	zonecfg_fini_handle(handle);
}
939 946
940 947 /* ARGSUSED */
941 948 static int
942 949 chk_proc_fs(void *data, const char *spec, const char *dir,
943 950 const char *fstype, const char *opt)
944 951 {
945 952 if (fstype != NULL && strcmp(fstype, "proc") == 0)
946 953 *((boolean_t *)data) = B_TRUE;
947 954
948 955 return (0);
949 956 }
950 957
951 958 static boolean_t
952 959 has_proc()
953 960 {
954 961 brand_handle_t bh;
955 962 boolean_t fnd = B_FALSE;
956 963
957 964 if ((bh = brand_open(brand_name)) != NULL) {
958 965 (void) brand_platform_iter_mounts(bh, chk_proc_fs, &fnd);
959 966 }
960 967
961 968 brand_close(bh);
962 969 return (fnd);
963 970 }
964 971
/*
 * We run this loop for brands with no /proc to simply update the RSS, using
 * the cheap GZ /proc data, every 5 minutes.
 */
static void
no_procfs()
{
	DIR *pdir = NULL;
	struct dirent *dent;
	uint64_t zone_rss_bytes;

	(void) sleep_shutdown(30);
	while (!shutting_down) {
		/*
		 * Just do the fast, cheap RSS calculation using the rss value
		 * in psinfo_t. Because that's per-process, it can lead to
		 * double counting some memory and overestimating how much is
		 * being used. Since there is no /proc in the zone, we use the
		 * GZ /proc and check for the correct zone.
		 */
		if ((pdir = opendir("/proc")) == NULL)
			return;

		fast_rss = 0;
		while (!shutting_down && (dent = readdir(pdir)) != NULL) {
			pid_t pid;
			int psfd;
			int64_t rss;
			char pathbuf[MAXPATHLEN];
			psinfo_t psinfo;

			if (strcmp(".", dent->d_name) == 0 ||
			    strcmp("..", dent->d_name) == 0)
				continue;

			/* skip sched and init */
			pid = atoi(dent->d_name);
			if (pid == 0 || pid == 1)
				continue;

			(void) snprintf(pathbuf, sizeof (pathbuf),
			    "/proc/%d/psinfo", pid);

			rss = 0;
			if ((psfd = open(pathbuf, O_RDONLY, 0000)) != -1) {
				if (pread(psfd, &psinfo, sizeof (psinfo), 0) ==
				    sizeof (psinfo)) {
					/* only count our zone's processes */
					if (psinfo.pr_zoneid == zid)
						rss = (int64_t)psinfo.pr_rssize;
				}

				(void) close(psfd);
			}

			fast_rss += rss;
		}

		(void) closedir(pdir);

		if (shutting_down)
			return;

		zone_rss_bytes = fast_rss * 1024;
		/* Use the zone's approx. RSS in the kernel */
		(void) zone_setattr(zid, ZONE_ATTR_RSS, &zone_rss_bytes, 0);

		(void) sleep_shutdown(300);
	}
}
1033 1040
1034 1041 /*
1035 1042 * Thread that checks zone's memory usage and when over the cap, goes through
1036 1043 * the zone's process list trying to pageout processes to get under the cap.
1037 1044 */
1038 1045 static void
1039 1046 mcap_zone()
1040 1047 {
1041 1048 DIR *pdir = NULL;
1042 1049 int64_t excess;
1043 1050
1044 1051 debug("thread startup\n");
1045 1052
1046 1053 get_mcap_tunables();
1047 1054
1048 1055 /*
1049 1056 * If the zone has no /proc filesystem, we can't use the fast algorithm
1050 1057 * to check RSS or pageout any processes. All we can do is periodically
1051 1058 * update it's RSS kstat using the expensive sycall.
1052 1059 */
1053 1060 if (!has_proc()) {
1054 1061 no_procfs();
1055 1062 debug("thread shutdown\n");
1056 1063 return;
1057 1064 }
1058 1065
1059 1066 /*
1060 1067 * When first starting it is likely lots of other zones are starting
1061 1068 * too because the system is booting. Since we just started the zone
1062 1069 * we're not worried about being over the cap right away, so we let
1063 1070 * things settle a bit and tolerate some older data here to minimize
1064 1071 * the load on the system.
1065 1072 */
1066 1073 (void) sleep_shutdown(15); /* wait 15 secs. so the zone can get going */
1067 1074
1068 1075 /* Wait until zone's /proc is mounted */
1069 1076 while (!shutting_down) {
1070 1077 struct stat st;
1071 1078
1072 1079 if (stat(zoneproc, &st) == 0 &&
1073 1080 strcmp(st.st_fstype, "proc") == 0)
1074 1081 break;
1075 1082 sleep_shutdown(5);
1076 1083 }
1077 1084
1078 1085 /* Open zone's /proc and walk entries. */
1079 1086 while (!shutting_down) {
1080 1087 if ((pdir = opendir(zoneproc)) != NULL)
1081 1088 break;
1082 1089 sleep_shutdown(5);
1083 1090 }
1084 1091
1085 1092 while (!shutting_down) {
1086 1093 struct dirent *dirent;
1087 1094
1088 1095 /* Wait until we've gone over the cap. */
1089 1096 excess = check_suspend();
1090 1097
1091 1098 debug("starting to scan, excess %lldk\n", (long long)excess);
1092 1099
1093 1100 if (over_cmd[0] != '\0') {
1094 1101 uint64_t zone_rss; /* total RSS(KB) */
1095 1102
1096 1103 debug("run phys_mcap_cmd: %s\n", over_cmd);
1097 1104 run_over_cmd();
1098 1105
1099 1106 zone_rss = get_mem_info();
1100 1107 excess = zone_rss - zone_rss_cap;
1101 1108 debug("rss %lluKB, cap %lluKB, excess %lldKB\n",
1102 1109 zone_rss, zone_rss_cap, excess);
1103 1110 if (excess <= 0)
1104 1111 continue;
1105 1112 }
1106 1113
1107 1114 while (!shutting_down && (dirent = readdir(pdir)) != NULL) {
1108 1115 pid_t pid;
1109 1116
1110 1117 if (strcmp(".", dirent->d_name) == 0 ||
1111 1118 strcmp("..", dirent->d_name) == 0)
1112 1119 continue;
1113 1120
1114 1121 pid = atoi(dirent->d_name);
1115 1122 if (pid == 0 || pid == 1)
1116 1123 continue;
1117 1124
1118 1125 if (skip_pageout)
1119 1126 (void) sleep_shutdown(2);
1120 1127 else
1121 1128 excess = pageout_process(pid, excess);
1122 1129
1123 1130 if (excess <= 0) {
1124 1131 debug("apparently under; excess %lld\n",
1125 1132 (long long)excess);
1126 1133 /* Double check the current excess */
1127 1134 excess = check_suspend();
1128 1135 }
1129 1136 }
1130 1137
1131 1138 debug("process pass done; excess %lld\n", (long long)excess);
1132 1139 rewinddir(pdir);
1133 1140
1134 1141 if (skip_pageout)
1135 1142 (void) sleep_shutdown(120);
1136 1143 }
1137 1144
1138 1145 if (pdir != NULL)
1139 1146 (void) closedir(pdir);
|
↓ open down ↓ |
693 lines elided |
↑ open up ↑ |
1140 1147 debug("thread shutdown\n");
1141 1148 }
1142 1149
1143 1150 void
1144 1151 create_mcap_thread(zlog_t *zlogp, zoneid_t id)
1145 1152 {
1146 1153 int res;
1147 1154
1148 1155 shutting_down = 0;
1149 1156 zid = id;
1150 - logp = zlogp;
1151 1157
1152 1158 /* all but the lx brand currently use /proc */
1153 1159 if (strcmp(brand_name, "lx") == 0) {
1154 1160 (void) snprintf(zoneproc, sizeof (zoneproc),
1155 1161 "%s/root/native/proc", zonepath);
1156 1162 } else {
1157 1163 (void) snprintf(zoneproc, sizeof (zoneproc), "%s/root/proc",
1158 1164 zonepath);
1159 1165 }
1160 1166
1161 1167 (void) snprintf(debug_log, sizeof (debug_log), "%s/mcap_debug.log",
1162 1168 zonepath);
1163 1169
1164 1170 res = thr_create(NULL, NULL, (void *(*)(void *))mcap_zone, NULL, NULL,
1165 1171 &mcap_tid);
1166 1172 if (res != 0) {
1167 1173 zerror(zlogp, B_FALSE, "error %d creating memory cap thread",
1168 1174 res);
1169 1175 mcap_tid = 0;
1170 1176 }
1171 1177 }
1172 1178
1173 1179 void
1174 1180 destroy_mcap_thread()
1175 1181 {
1176 1182 if (mcap_tid != 0) {
1177 1183 shutting_down = 1;
1178 1184 (void) cond_signal(&shutdown_cv);
1179 1185 (void) thr_join(mcap_tid, NULL, NULL);
1180 1186 mcap_tid = 0;
1181 1187 }
1182 1188 }