Print this page
    
OS-399 zone phys. mem. cap should be a rctl and have associated kstat
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/cmd/zonestat/zonestatd/zonestatd.c
          +++ new/usr/src/cmd/zonestat/zonestatd/zonestatd.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
       24 + * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  24   25   */
  25   26  #include <alloca.h>
  26   27  #include <assert.h>
  27   28  #include <dirent.h>
  28   29  #include <dlfcn.h>
  29   30  #include <door.h>
  30   31  #include <errno.h>
  31   32  #include <exacct.h>
  32   33  #include <ctype.h>
  33   34  #include <fcntl.h>
  34   35  #include <kstat.h>
  35   36  #include <libcontract.h>
  36   37  #include <libintl.h>
  37   38  #include <libscf.h>
  38   39  #include <zonestat.h>
  39   40  #include <zonestat_impl.h>
  40   41  #include <limits.h>
  41   42  #include <pool.h>
  42   43  #include <procfs.h>
  43   44  #include <rctl.h>
  44   45  #include <thread.h>
  45   46  #include <signal.h>
  46   47  #include <stdarg.h>
  47   48  #include <stddef.h>
  48   49  #include <stdio.h>
  49   50  #include <stdlib.h>
  50   51  #include <strings.h>
  51   52  #include <synch.h>
  52   53  #include <sys/acctctl.h>
  53   54  #include <sys/contract/process.h>
  54   55  #include <sys/ctfs.h>
  55   56  #include <sys/fork.h>
  56   57  #include <sys/param.h>
  57   58  #include <sys/priocntl.h>
  58   59  #include <sys/fxpriocntl.h>
  59   60  #include <sys/processor.h>
  60   61  #include <sys/pset.h>
  61   62  #include <sys/socket.h>
  62   63  #include <sys/stat.h>
  63   64  #include <sys/statvfs.h>
  64   65  #include <sys/swap.h>
  65   66  #include <sys/systeminfo.h>
  66   67  #include <thread.h>
  67   68  #include <sys/list.h>
  68   69  #include <sys/time.h>
  69   70  #include <sys/types.h>
  70   71  #include <sys/vm_usage.h>
  71   72  #include <sys/wait.h>
  72   73  #include <sys/zone.h>
  73   74  #include <time.h>
  74   75  #include <ucred.h>
  75   76  #include <unistd.h>
  76   77  #include <vm/anon.h>
  77   78  #include <zone.h>
  78   79  #include <zonestat.h>
  79   80  
  80   81  #define MAX_PSET_NAME   1024    /* Taken from PV_NAME_MAX_LEN */
  81   82  #define ZSD_PSET_UNLIMITED      UINT16_MAX
           /* Accounting file zonestatd sets up when none is configured externally */
  82   83  #define ZONESTAT_EXACCT_FILE    "/var/adm/exacct/zonestat-process"
  83   84  
  84   85  /*
  85   86   * zonestatd implements gathering cpu and memory utilization data for
  86   87   * running zones.  It has these components:
  87   88   *
  88   89   * zsd_server:
  89   90   *      Door server to respond to client connections.  Each client
  90   91   *      will connect using libzonestat.so, which will open and
  91   92   *      call /var/tmp/.zonestat_door.  Each connecting client is given
  92   93   *      a file descriptor to the stat server.
  93   94   *
  94   95   *      The zsd_server also responds to zoneadmd, which reports when a
  95   96   *      new zone is booted.  This is used to fattach the zsd_server door
  96   97   *      into the new zone.
  97   98   *
  98   99   * zsd_stat_server:
  99  100   *      Receives client requests for the current utilization data.  Each
 100  101   *      client request will cause zonestatd to update the current utilization
 101  102   *      data by kicking the stat_thread.
 102  103   *
 103  104   *      If the client is in a non-global zone, the utilization data will
 104  105   *      be filtered to only show the given zone.  The usage by all other zones
 105  106   *      will be added to the system utilization.
 106  107   *
 107  108   * stat_thread:
 108  109   *      The stat thread implements querying the system to determine the
 109  110   *      current utilization data for each running zone.  This includes
 110  111   *      inspecting the system's processor set configuration, as well as details
 111  112   *      of each zone, such as their configured limits, and which processor
 112  113   *      sets they are running in.
 113  114   *
 114  115   *      The stat_thread will only update memory utilization data as often as
 115  116   *      the configured config/sample_interval on the zones-monitoring service.
 116  117   */
 117  118  
 118  119  /*
 119  120   * The private vmusage structure unfortunately uses size_t types, and assumes
 120  121   * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
 121  122   * system call is contracted, and zonestatd is 32-bit, the following structures
 122  123   * are used to interact with a 32-bit or 64-bit kernel.
            *
            * zsd_vmusage32_t carries the usage values as uint32_t; zsd_vmusage64_t
            * carries them as uint64_t.
 123  124   */
 124  125  typedef struct zsd_vmusage32 {
 125  126          id_t vmu_zoneid;
 126  127          uint_t vmu_type;
 127  128          id_t vmu_id;
 128  129  
 129  130          uint32_t vmu_rss_all;
 130  131          uint32_t vmu_rss_private;
 131  132          uint32_t vmu_rss_shared;
 132  133          uint32_t vmu_swap_all;
 133  134          uint32_t vmu_swap_private;
 134  135          uint32_t vmu_swap_shared;
 135  136  } zsd_vmusage32_t;
 136  137  
           /* Variant of the vmusage record whose usage values are uint64_t. */
 137  138  typedef struct zsd_vmusage64 {
 138  139          id_t vmu_zoneid;
 139  140          uint_t vmu_type;
 140  141          id_t vmu_id;
 141  142          /*
 142  143           * An amd64 kernel will align the following uint64_t members, but a
 143  144           * 32-bit i386 process will not without help, so an explicit pad
                    * member is placed in front of them.
 144  145           */
 145  146          int vmu_align_next_members_on_8_bytes;
 146  147          uint64_t vmu_rss_all;
 147  148          uint64_t vmu_rss_private;
 148  149          uint64_t vmu_rss_shared;
 149  150          uint64_t vmu_swap_all;
 150  151          uint64_t vmu_swap_private;
 151  152          uint64_t vmu_swap_shared;
 152  153  } zsd_vmusage64_t;
 153  154  
 154  155  struct zsd_zone;
 155  156  
 156  157  /* Used to store a zone's usage of a pset */
 157  158  typedef struct zsd_pset_usage {
 158  159          struct zsd_zone *zsu_zone;
 159  160          struct zsd_pset *zsu_pset;
 160  161  
                   /* NOTE(review): presumably links into zsd_pset's zsp_usage_list */
 161  162          list_node_t     zsu_next;
 162  163  
 163  164          zoneid_t        zsu_zoneid;
 164  165          boolean_t       zsu_found;      /* zone bound at end of interval */
 165  166          boolean_t       zsu_active;     /* zone was bound during interval */
 166  167          boolean_t       zsu_new;        /* zone newly bound in this interval */
 167  168          boolean_t       zsu_deleted;    /* zone was unbound in this interval */
 168  169          boolean_t       zsu_empty;      /* no procs in pset in this interval */
 169  170          time_t          zsu_start;      /* time when zone was found in pset */
 170  171          hrtime_t        zsu_hrstart;    /* time when zone was found in pset */
 171  172          uint64_t        zsu_cpu_shares;
 172  173          uint_t          zsu_scheds;     /* schedulers found in this pass */
 173  174          timestruc_t     zsu_cpu_usage;  /* cpu time used */
 174  175  } zsd_pset_usage_t;
 175  176  
 176  177  /* Used to store a pset's utilization */
 177  178  typedef struct zsd_pset {
 178  179          psetid_t        zsp_id;
 179  180          list_node_t     zsp_next;
 180  181          char            zsp_name[ZS_PSETNAME_MAX];
 181  182  
 182  183          uint_t          zsp_cputype;    /* default, dedicated or shared */
 183  184          boolean_t       zsp_found;      /* pset found at end of interval */
 184  185          boolean_t       zsp_new;        /* pset new in this interval */
 185  186          boolean_t       zsp_deleted;    /* pset deleted in this interval */
 186  187          boolean_t       zsp_active;     /* pset existed during interval */
 187  188          boolean_t       zsp_empty;      /* no processes in pset */
 188  189          time_t          zsp_start;
 189  190          hrtime_t        zsp_hrstart;
 190  191  
 191  192          uint64_t        zsp_online;     /* online cpus in interval */
 192  193          uint64_t        zsp_size;       /* size in this interval */
 193  194          uint64_t        zsp_min;        /* configured min in this interval */
 194  195          uint64_t        zsp_max;        /* configured max in this interval */
 195  196          int64_t         zsp_importance; /* configured importance in interval */
 196  197  
 197  198          uint_t          zsp_scheds;     /* scheds of processes found in pset */
 198  199          uint64_t        zsp_cpu_shares; /* total shares in this interval */
 199  200  
 200  201          timestruc_t     zsp_total_time;
 201  202          timestruc_t     zsp_usage_kern;
 202  203          timestruc_t     zsp_usage_zones;
 203  204  
 204  205          /* Individual zone usages of pset */
 205  206          list_t          zsp_usage_list;
 206  207          int             zsp_nusage;
 207  208  
 208  209          /* Summed kstat values from individual cpus in pset */
 209  210          timestruc_t     zsp_idle;
 210  211          timestruc_t     zsp_intr;
 211  212          timestruc_t     zsp_kern;
 212  213          timestruc_t     zsp_user;
 213  214  
 214  215  } zsd_pset_t;
 215  216  
 216  217  /* Used to track an individual cpu's utilization as reported by kstats */
 217  218  typedef struct zsd_cpu {
 218  219          processorid_t   zsc_id;
 219  220          list_node_t     zsc_next;
 220  221          psetid_t        zsc_psetid;
 221  222          psetid_t        zsc_psetid_prev;
 222  223          zsd_pset_t      *zsc_pset;
 223  224  
 224  225          boolean_t       zsc_found;      /* cpu online in this interval */
 225  226          boolean_t       zsc_onlined;    /* cpu onlined during this interval */
 226  227          boolean_t       zsc_offlined;   /* cpu offlined during this interval */
 227  228          boolean_t       zsc_active;     /* cpu online during this interval */
 228  229          boolean_t       zsc_allocated;  /* True if cpu has ever been found */
 229  230  
 230  231          /* kstats this interval */
 231  232          uint64_t        zsc_nsec_idle;
 232  233          uint64_t        zsc_nsec_intr;
 233  234          uint64_t        zsc_nsec_kern;
 234  235          uint64_t        zsc_nsec_user;
 235  236  
 236  237          /* kstats from the previous interval (the *_prev copies) */
 237  238          uint64_t        zsc_nsec_idle_prev;
 238  239          uint64_t        zsc_nsec_intr_prev;
 239  240          uint64_t        zsc_nsec_kern_prev;
 240  241          uint64_t        zsc_nsec_user_prev;
 241  242  
 242  243          /* Total kstat increases since zonestatd started reading kstats */
 243  244          timestruc_t     zsc_idle;
 244  245          timestruc_t     zsc_intr;
 245  246          timestruc_t     zsc_kern;
 246  247          timestruc_t     zsc_user;
 247  248  
 248  249  } zsd_cpu_t;
 249  250  
 250  251  /* Used to describe an individual zone and its utilization */
 251  252  typedef struct zsd_zone {
 252  253          zoneid_t        zsz_id;
 253  254          list_node_t     zsz_next;
 254  255          char            zsz_name[ZS_ZONENAME_MAX];
 255  256          uint_t          zsz_cputype;
 256  257          uint_t          zsz_iptype;
 257  258          time_t          zsz_start;
 258  259          hrtime_t        zsz_hrstart;
 259  260  
 260  261          char            zsz_pool[ZS_POOLNAME_MAX];
 261  262          char            zsz_pset[ZS_PSETNAME_MAX];
 262  263          int             zsz_default_sched;
 263  264          /* These are deduced by inspecting processes */
 264  265          psetid_t        zsz_psetid;
 265  266          uint_t          zsz_scheds;
 266  267  
 267  268          boolean_t       zsz_new;        /* zone booted during this interval */
 268  269          boolean_t       zsz_deleted;    /* halted during this interval */
 269  270          boolean_t       zsz_active;     /* running in this interval */
 270  271          boolean_t       zsz_empty;      /* no processes in this interval */
 271  272          boolean_t       zsz_gone;       /* not installed in this interval */
 272  273          boolean_t       zsz_found;      /* Running at end of this interval */
 273  274  
                   /* Cpu shares and the cpu/memory caps (the *_cap fields) */
 274  275          uint64_t        zsz_cpu_shares;
 275  276          uint64_t        zsz_cpu_cap;
 276  277          uint64_t        zsz_ram_cap;
 277  278          uint64_t        zsz_locked_cap;
 278  279          uint64_t        zsz_vm_cap;
 279  280  
 280  281          uint64_t        zsz_cpus_online;
                   /* NOTE(review): presumably cpu time used by the zone - confirm */
 281  282          timestruc_t     zsz_cpu_usage;  /* cpu time used */
 282  283          timestruc_t     zsz_cap_time;   /* cpu time of cpu cap */
 283  284          timestruc_t     zsz_share_time; /* cpu time of share of cpu */
 284  285          timestruc_t     zsz_pset_time;  /* time of all psets zone is bound to */
 285  286  
 286  287          uint64_t        zsz_usage_ram;
 287  288          uint64_t        zsz_usage_locked;
 288  289          uint64_t        zsz_usage_vm;
 289  290  
                   /* Caps for the counted resources below (the *_cap fields) */
 290  291          uint64_t        zsz_processes_cap;
 291  292          uint64_t        zsz_lwps_cap;
 292  293          uint64_t        zsz_shm_cap;
 293  294          uint64_t        zsz_shmids_cap;
 294  295          uint64_t        zsz_semids_cap;
 295  296          uint64_t        zsz_msgids_cap;
 296  297          uint64_t        zsz_lofi_cap;
 297  298  
                   /* Values paired by name with the *_cap fields above */
 298  299          uint64_t        zsz_processes;
 299  300          uint64_t        zsz_lwps;
 300  301          uint64_t        zsz_shm;
 301  302          uint64_t        zsz_shmids;
 302  303          uint64_t        zsz_semids;
 303  304          uint64_t        zsz_msgids;
 304  305          uint64_t        zsz_lofi;
 305  306  
 306  307  } zsd_zone_t;
 307  308  
 308  309  /*
 309  310   * Used to track the cpu usage of individual processes.
 310  311   *
 311  312   * zonestatd sweeps /proc each interval and charges the cpu usage of processes
 312  313   * to their zone.  As processes exit, their extended accounting records are
 313  314   * read and the difference of their total and known usage is charged to their
 314  315   * zone.
 315  316   *
 316  317   * If a process is never seen in /proc, the total usage on its extended
 317  318   * accounting record will be charged to its zone.
 318  319   */
 319  320  typedef struct zsd_proc {
 320  321          list_node_t     zspr_next;
 321  322          pid_t           zspr_ppid;
 322  323          psetid_t        zspr_psetid;
 323  324          zoneid_t        zspr_zoneid;
 324  325          int             zspr_sched;
 325  326          timestruc_t     zspr_usage;
 326  327  } zsd_proc_t;
 327  328  
 328  329  /* Used to track the overall resource usage of the system */
 329  330  typedef struct zsd_system {
 330  331  
                   /* Memory: ram, locked, vm and swap figures */
 331  332          uint64_t zss_ram_total;
 332  333          uint64_t zss_ram_kern;
 333  334          uint64_t zss_ram_zones;
 334  335  
 335  336          uint64_t zss_locked_kern;
 336  337          uint64_t zss_locked_zones;
 337  338  
 338  339          uint64_t zss_vm_total;
 339  340          uint64_t zss_vm_kern;
 340  341          uint64_t zss_vm_zones;
 341  342  
 342  343          uint64_t zss_swap_total;
 343  344          uint64_t zss_swap_used;
 344  345  
                   /* Cpu time figures */
 345  346          timestruc_t zss_idle;
 346  347          timestruc_t zss_intr;
 347  348          timestruc_t zss_kern;
 348  349          timestruc_t zss_user;
 349  350  
 350  351          timestruc_t zss_cpu_total_time;
 351  352          timestruc_t zss_cpu_usage_kern;
 352  353          timestruc_t zss_cpu_usage_zones;
 353  354  
                   /* Maximums for the counted resources (the *_max fields) */
 354  355          uint64_t zss_maxpid;
 355  356          uint64_t zss_processes_max;
 356  357          uint64_t zss_lwps_max;
 357  358          uint64_t zss_shm_max;
 358  359          uint64_t zss_shmids_max;
 359  360          uint64_t zss_semids_max;
 360  361          uint64_t zss_msgids_max;
 361  362          uint64_t zss_lofi_max;
 362  363  
                   /* Values paired by name with the *_max fields above */
 363  364          uint64_t zss_processes;
 364  365          uint64_t zss_lwps;
 365  366          uint64_t zss_shm;
 366  367          uint64_t zss_shmids;
 367  368          uint64_t zss_semids;
 368  369          uint64_t zss_msgids;
 369  370          uint64_t zss_lofi;
 370  371  
 371  372          uint64_t zss_ncpus;
 372  373          uint64_t zss_ncpus_online;
 373  374  
 374  375  } zsd_system_t;
 375  376  
 376  377  /*
 377  378   * A dumping ground for various information and structures used to compute
 378  379   * utilization.
 379  380   *
 380  381   * This structure is used to track the system while clients are connected.
 381  382   * When the first client connects, a zsd_ctl is allocated and configured by
 382  383   * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
 383  384   */
 384  385  typedef struct zsd_ctl {
 385  386          kstat_ctl_t     *zsctl_kstat_ctl;
 386  387  
 387  388          /* To track extended accounting */
 388  389          int             zsctl_proc_fd;          /* Log currently being used */
 389  390          ea_file_t       zsctl_proc_eaf;
 390  391          struct stat64   zsctl_proc_stat;
 391  392          int             zsctl_proc_open;
 392  393          int             zsctl_proc_fd_next;     /* Log file to use next */
 393  394          ea_file_t       zsctl_proc_eaf_next;
 394  395          struct stat64   zsctl_proc_stat_next;
 395  396          int             zsctl_proc_open_next;
 396  397  
 397  398          /* pool configuration handle */
 398  399          pool_conf_t     *zsctl_pool_conf;
 399  400          int             zsctl_pool_status;
 400  401          int             zsctl_pool_changed;
 401  402  
 402  403          /* The above usage tracking structures */
 403  404          zsd_system_t    *zsctl_system;
 404  405          list_t          zsctl_zones;
 405  406          list_t          zsctl_psets;
 406  407          list_t          zsctl_cpus;
 407  408          zsd_cpu_t       *zsctl_cpu_array;
 408  409          zsd_proc_t      *zsctl_proc_array;
 409  410  
 410  411          /* Various system info */
 411  412          uint64_t        zsctl_maxcpuid;
 412  413          uint64_t        zsctl_maxproc;
 413  414          uint64_t        zsctl_kern_bits;
 414  415          uint64_t        zsctl_pagesize;
 415  416  
 416  417          /* Used to track time available under a cpu cap. */
 417  418          uint64_t        zsctl_hrtime;
 418  419          uint64_t        zsctl_hrtime_prev;
 419  420          timestruc_t     zsctl_hrtime_total;
 420  421  
 421  422          struct timeval  zsctl_timeofday;
 422  423  
 423  424          /* Caches for arrays allocated for use by various system calls */
 424  425          psetid_t        *zsctl_pset_cache;
 425  426          uint_t          zsctl_pset_ncache;
 426  427          processorid_t   *zsctl_cpu_cache;
 427  428          uint_t          zsctl_cpu_ncache;
 428  429          zoneid_t        *zsctl_zone_cache;
 429  430          uint_t          zsctl_zone_ncache;
 430  431          struct swaptable *zsctl_swap_cache;
 431  432          uint64_t        zsctl_swap_cache_size;
 432  433          uint64_t        zsctl_swap_cache_num;
 433  434          zsd_vmusage64_t *zsctl_vmusage_cache;
 434  435          uint64_t        zsctl_vmusage_cache_num;
 435  436  
 436  437          /* Info about procfs for scanning /proc */
 437  438          struct dirent   *zsctl_procfs_dent;
 438  439          long            zsctl_procfs_dent_size;
 439  440          pool_value_t    *zsctl_pool_vals[3];
 440  441  
 441  442          /* Counts on tracked entities */
 442  443          uint_t          zsctl_nzones;
 443  444          uint_t          zsctl_npsets;
 444  445          uint_t          zsctl_npset_usages;
 445  446  } zsd_ctl_t;
 446  447  
 447  448  zsd_ctl_t               *g_ctl;
 448  449  boolean_t               g_open;         /* True if g_ctl is open */
 449  450  int                     g_hasclient;    /* True if any clients are connected */
 450  451  
 451  452  /*
 452  453   * The usage cache is updated by the stat_thread, and copied to clients by
 453  454   * the zsd_stat_server.  Mutex and cond are to synchronize between the
 454  455   * stat_thread and the stat_server.
 455  456   */
 456  457  zs_usage_cache_t        *g_usage_cache;
 457  458  mutex_t                 g_usage_cache_lock;
 458  459  cond_t                  g_usage_cache_kick;
 459  460  uint_t                  g_usage_cache_kickers;
 460  461  cond_t                  g_usage_cache_wait;
 461  462  char                    *g_usage_cache_buf;
 462  463  uint_t                  g_usage_cache_bufsz;
           /* NOTE(review): presumably the next usage cache generation - confirm */
 463  464  uint64_t                g_gen_next;
 464  465  
 465  466  /* fds of door servers */
 466  467  int                     g_server_door;
 467  468  int                     g_stat_door;
 468  469  
 469  470  /*
 470  471   * Starting and current time.  Used to throttle memory calculation, and to
 471  472   * mark new zones and psets with their boot and creation time.
 472  473   */
 473  474  time_t                  g_now;
 474  475  time_t                  g_start;
 475  476  hrtime_t                g_hrnow;
 476  477  hrtime_t                g_hrstart;
           /* NOTE(review): units of g_interval are not visible here - confirm */
 477  478  uint64_t                g_interval;
 478  479  
 479  480  /*
 480  481   * main() thread.
 481  482   */
 482  483  thread_t                g_main;
 483  484  
           /*
            * Writes a warning to stderr: the localized "zonestat: Warning: " prefix,
            * the caller's printf-style message, then a newline.  Returns normally.
            */
 484  485  /* PRINTFLIKE1 */
 485  486  static void
 486  487  zsd_warn(const char *fmt, ...)
 487  488  {
 488  489          va_list alist;
 489  490  
 490  491          va_start(alist, fmt);
 491  492  
 492  493          (void) fprintf(stderr, gettext("zonestat: Warning: "));
 493  494          (void) vfprintf(stderr, fmt, alist);
 494  495          (void) fprintf(stderr, "\n");
 495  496          va_end(alist);
 496  497  }
 497  498  
           /*
            * Writes an error to stderr: the localized "zonestat: Error: " prefix,
            * the caller's printf-style message, then a newline.  Does not return;
            * the process exits with status 1.
            */
 498  499  /* PRINTFLIKE1 */
 499  500  static void
 500  501  zsd_error(const char *fmt, ...)
 501  502  {
 502  503          va_list alist;
 503  504  
 504  505          va_start(alist, fmt);
 505  506  
 506  507          (void) fprintf(stderr, gettext("zonestat: Error: "));
 507  508          (void) vfprintf(stderr, fmt, alist);
 508  509          (void) fprintf(stderr, "\n");
 509  510          va_end(alist);
 510  511          exit(1);
 511  512  }
 512  513  
 513  514  /*
            * Turns on extended accounting if not configured externally.
            *
            * Switches on the AC_PROC_PID, AC_PROC_ANCPID, AC_PROC_CPU, AC_PROC_TIME
            * and AC_PROC_ZONENAME process accounting resources.  If AC_FILE_GET
            * fails (no accounting file is reported), ZONESTAT_EXACCT_FILE is
            * removed and installed as the accounting file.  Finally the accounting
            * state is set to AC_ON.
            *
            * Returns 0 on success, or -1 after printing a warning if an acctctl()
            * call fails.
            */
 514  515  int
 515  516  zsd_enable_cpu_stats(void)
 516  517  {
 517  518          char *path = ZONESTAT_EXACCT_FILE;
 518  519          char oldfile[MAXPATHLEN];
 519  520          int ret, state = AC_ON;
 520  521          ac_res_t res[6];
 521  522  
 522  523          /*
 523  524           * Start a new accounting file if accounting not configured
 524  525           * externally.
 525  526           */
 526  527  
 527  528          res[0].ar_id = AC_PROC_PID;
 528  529          res[0].ar_state = AC_ON;
 529  530          res[1].ar_id = AC_PROC_ANCPID;
 530  531          res[1].ar_state = AC_ON;
 531  532          res[2].ar_id = AC_PROC_CPU;
 532  533          res[2].ar_state = AC_ON;
 533  534          res[3].ar_id = AC_PROC_TIME;
 534  535          res[3].ar_state = AC_ON;
 535  536          res[4].ar_id = AC_PROC_ZONENAME;
 536  537          res[4].ar_state = AC_ON;
                   /* NOTE(review): AC_NONE presumably ends the list - see acctctl(2) */
 537  538          res[5].ar_id = AC_NONE;
 538  539          res[5].ar_state = AC_ON;
 539  540          if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
 540  541                  zsd_warn(gettext("Unable to set accounting resources"));
 541  542                  return (-1);
 542  543          }
 543  544          /* Only set accounting file if none is configured */
 544  545          ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 545  546          if (ret < 0) {
 546  547  
                           /* Remove any stale file first; failure is ignored */
 547  548                  (void) unlink(path);
 548  549                  if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
 549  550                      == -1) {
 550  551                          zsd_warn(gettext("Unable to set accounting file"));
 551  552                          return (-1);
 552  553                  }
 553  554          }
 554  555          if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
 555  556                  zsd_warn(gettext("Unable to enable accounting"));
 556  557                  return (-1);
 557  558          }
 558  559          return (0);
 559  560  }
 560  561  
 561  562  /*
            * Turns off extended accounting if not configured externally.
            *
            * If AC_FILE_GET reports an accounting file other than
            * ZONESTAT_EXACCT_FILE, nothing is changed and 0 is returned.  Otherwise
            * the process accounting resources are switched off, the accounting file
            * is cleared, the accounting state is set to AC_OFF and
            * ZONESTAT_EXACCT_FILE is unlinked.
            *
            * Returns 0 on success, or -1 after printing a warning if an acctctl()
            * call fails.
            */
 562  563  int
 563  564  zsd_disable_cpu_stats(void)
 564  565  {
 565  566          char *path = ZONESTAT_EXACCT_FILE;
 566  567          int ret, state = AC_OFF;
 567  568          ac_res_t res[6];
 568  569          char oldfile[MAXPATHLEN];
 569  570  
 570  571          /* If accounting file is externally configured, leave it alone */
 571  572          ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 572  573          if (ret == 0 && strcmp(oldfile, path) != 0)
 573  574                  return (0);
 574  575  
 575  576          res[0].ar_id = AC_PROC_PID;
 576  577          res[0].ar_state = AC_OFF;
 577  578          res[1].ar_id = AC_PROC_ANCPID;
 578  579          res[1].ar_state = AC_OFF;
 579  580          res[2].ar_id = AC_PROC_CPU;
 580  581          res[2].ar_state = AC_OFF;
 581  582          res[3].ar_id = AC_PROC_TIME;
 582  583          res[3].ar_state = AC_OFF;
 583  584          res[4].ar_id = AC_PROC_ZONENAME;
 584  585          res[4].ar_state = AC_OFF;
                   /* NOTE(review): AC_NONE presumably ends the list - see acctctl(2) */
 585  586          res[5].ar_id = AC_NONE;
 586  587          res[5].ar_state = AC_OFF;
 587  588          if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
 588  589                  zsd_warn(gettext("Unable to clear accounting resources"));
 589  590                  return (-1);
 590  591          }
 591  592          if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
 592  593                  zsd_warn(gettext("Unable to clear accounting file"));
 593  594                  return (-1);
 594  595          }
 595  596          if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
 596  597                  zsd_warn(gettext("Unable to disable accounting"));
 597  598                  return (-1);
 598  599          }
 599  600  
                   /* Best effort: the result of removing the file is ignored */
 600  601          (void) unlink(path);
 601  602          return (0);
 602  603  }
 603  604  
 604  605  /*
 605  606   * If not configured externally, deletes the current extended accounting file
 606  607   * and starts a new one.
 607  608   *
 608  609   * Since the stat_thread holds an open handle to the accounting file, it will
 609  610   * read all remaining entries from the old file before switching to
 610  611   * read the new one.
            *
            * Returns 0 when the file is left alone, when the unlink fails (the roll
            * is simply retried on a later call), or when the new file is set.
            * Returns -1 after a warning if the accounting file cannot be set.
 611  612   */
 612  613  int
 613  614  zsd_roll_exacct(void)
 614  615  {
 615  616          int ret;
 616  617          char *path = ZONESTAT_EXACCT_FILE;
 617  618          char oldfile[MAXPATHLEN];
 618  619  
 619  620          /* If accounting file is externally configured, leave it alone */
 620  621          ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 621  622          if (ret == 0 && strcmp(oldfile, path) != 0)
 622  623                  return (0);
 623  624  
 624  625          if (unlink(path) != 0)
 625  626                  /* Roll it next time */
 626  627                  return (0);
 627  628  
                   /* Setting the same path again starts a new file at that name */
 628  629          if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
 629  630                  zsd_warn(gettext("Unable to set accounting file"));
 630  631                  return (-1);
 631  632          }
 632  633          return (0);
 633  634  }
 634  635  
 635  636  /*
            * Contract stuff for zone_enter().
            *
            * Opens the process contract template under CTFS_ROOT, configures its
            * terms and activates it.  Returns the template fd on success; returns
            * -1 if the open fails, or (after closing the fd) if any setter or the
            * activation fails.
            */
 636  637  int
 637  638  init_template(void)
 638  639  {
 639  640          int fd;
 640  641          int err = 0;
 641  642  
 642  643          fd = open64(CTFS_ROOT "/process/template", O_RDWR);
 643  644          if (fd == -1)
 644  645                  return (-1);
 645  646  
 646  647          /*
 647  648           * For now, zoneadmd doesn't do anything with the contract.
 648  649           * Deliver no events, don't inherit, and allow it to be orphaned.
                    *
                    * NOTE(review): this comment names zoneadmd although the file is
                    * zonestatd; presumably carried over from there - confirm.
 649  650           */
                   /* The setter results are OR-ed so a single check covers all four */
 650  651          err |= ct_tmpl_set_critical(fd, 0);
 651  652          err |= ct_tmpl_set_informative(fd, 0);
 652  653          err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
 653  654          err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
 654  655          if (err || ct_tmpl_activate(fd)) {
 655  656                  (void) close(fd);
 656  657                  return (-1);
 657  658          }
 658  659  
 659  660          return (fd);
 660  661  }
 661  662  
 662  663  /*
 663  664   * Contract stuff for zone_enter()
            *
            * Reads the status of CTFS_ROOT "/process/latest" and stores that
            * contract's id in *id.
            *
            * Returns 0 on success.  On failure *id is left untouched and the return
            * value is errno from the failed open64(), or the non-zero result of
            * ct_status_read().
 664  665   */
 665  666  int
 666  667  contract_latest(ctid_t *id)
 667  668  {
 668  669          int cfd, r;
 669  670          ct_stathdl_t st;
 670  671          ctid_t result;
 671  672  
 672  673          if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
 673  674                  return (errno);
 674  675  
 675  676          if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
 676  677                  (void) close(cfd);
 677  678                  return (r);
 678  679          }
 679  680  
                   /* Copy the id out before the status handle is freed */
 680  681          result = ct_status_get_id(st);
 681  682          ct_status_free(st);
 682  683          (void) close(cfd);
 683  684  
 684  685          *id = result;
 685  686          return (0);
 686  687  }
 687  688  
           /*
            * Adds FD_CLOEXEC to the descriptor flags of fd, keeping the flags that
            * are already set.  Returns 0 on success, or -1 if either fcntl() call
            * fails.
            */
 688  689  static int
 689  690  close_on_exec(int fd)
 690  691  {
 691  692          int flags = fcntl(fd, F_GETFD, 0);
 692  693          if ((flags != -1) && (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1))
 693  694                  return (0);
 694  695          return (-1);
 695  696  }
 696  697  
           /*
            * Opens CTFS_ROOT/<type>/<ctid>/<file> with the given open flags and
            * marks the descriptor close-on-exec.  A NULL type selects "all".
            *
            * Returns the open fd, or -1 with errno set: ENAMETOOLONG if the path
            * cannot be built within PATH_MAX, otherwise the errno left by open64()
            * or by the failed close-on-exec request.
            */
 697  698  int
 698  699  contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
 699  700  {
 700  701          char path[PATH_MAX];
 701  702          int n, fd;
 702  703  
 703  704          if (type == NULL)
 704  705                  type = "all";
 705  706  
                   /* Cast so the argument matches %ld whatever width ctid_t has */
 706  707          n = snprintf(path, sizeof (path), CTFS_ROOT "/%s/%ld/%s", type,
                       (long)ctid, file);
                   /* A negative return (output error) is reported the same way */
 707  708          if (n < 0 || (size_t)n >= sizeof (path)) {
 708  709                  errno = ENAMETOOLONG;
 709  710                  return (-1);
 710  711          }
 711  712  
 712  713          fd = open64(path, oflag);
 713  714          if (fd != -1) {
 714  715                  if (close_on_exec(fd) == -1) {
                                   /* Keep the fcntl() errno across close() */
 715  716                          int err = errno;
 716  717                          (void) close(fd);
 717  718                          errno = err;
 718  719                          return (-1);
 719  720                  }
 720  721          }
 721  722          return (fd);
 722  723  }
 723  724  
 724  725  int
 725  726  contract_abandon_id(ctid_t ctid)
 726  727  {
 727  728          int fd, err;
 728  729  
 729  730          fd = contract_open(ctid, "all", "ctl", O_WRONLY);
 730  731          if (fd == -1)
 731  732                  return (errno);
 732  733  
 733  734          err = ct_ctl_abandon(fd);
 734  735          (void) close(fd);
 735  736  
 736  737          return (err);
 737  738  }
 738  739  /*
 739  740   * Attach the zsd_server to a zone.  Called for each zone when zonestatd
 740  741   * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
 741  742   *
 742  743   * Zone_enter is used to avoid reaching into zone to fattach door.
 743  744   */
 744  745  static void
 745  746  zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
 746  747  {
 747  748          char *path = ZS_DOOR_PATH;
 748  749          int fd, pid, stat, tmpl_fd;
 749  750          ctid_t ct;
 750  751  
 751  752          if ((tmpl_fd = init_template()) == -1) {
 752  753                  zsd_warn("Unable to init template");
 753  754                  return;
 754  755          }
 755  756  
 756  757          pid = forkx(0);
 757  758          if (pid < 0) {
 758  759                  (void) ct_tmpl_clear(tmpl_fd);
 759  760                  zsd_warn(gettext(
 760  761                      "Unable to fork to add zonestat to zoneid %d\n"), zid);
 761  762                  return;
 762  763          }
 763  764  
 764  765          if (pid == 0) {
 765  766                  (void) ct_tmpl_clear(tmpl_fd);
 766  767                  (void) close(tmpl_fd);
 767  768                  if (zid != 0 && zone_enter(zid) != 0) {
 768  769                          if (errno == EINVAL) {
 769  770                                  _exit(0);
 770  771                          }
 771  772                          _exit(1);
 772  773                  }
 773  774                  (void) fdetach(path);
 774  775                  (void) unlink(path);
 775  776                  if (detach_only)
 776  777                          _exit(0);
 777  778                  fd = open(path, O_CREAT|O_RDWR, 0644);
 778  779                  if (fd < 0)
 779  780                          _exit(2);
 780  781                  if (fattach(door, path) != 0)
 781  782                          _exit(3);
 782  783                  _exit(0);
 783  784          }
 784  785          if (contract_latest(&ct) == -1)
 785  786                  ct = -1;
 786  787          (void) ct_tmpl_clear(tmpl_fd);
 787  788          (void) close(tmpl_fd);
 788  789          (void) contract_abandon_id(ct);
 789  790          while (waitpid(pid, &stat, 0) != pid)
 790  791                  ;
 791  792          if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
 792  793                  return;
 793  794  
 794  795          zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
 795  796  
 796  797          if (WEXITSTATUS(stat) == 1)
 797  798                  zsd_warn(gettext("Cannot entering zone"));
 798  799          else if (WEXITSTATUS(stat) == 2)
 799  800                  zsd_warn(gettext("Unable to create door file: %s"), path);
 800  801          else if (WEXITSTATUS(stat) == 3)
 801  802                  zsd_warn(gettext("Unable to fattach file: %s"), path);
 802  803  
 803  804          zsd_warn(gettext("Internal error entering zone: %d"), zid);
 804  805  }
 805  806  
 806  807  /*
 807  808   * Zone lookup and allocation functions to manage list of currently running
 808  809   * zones.
 809  810   */
 810  811  static zsd_zone_t *
 811  812  zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 812  813  {
 813  814          zsd_zone_t *zone;
 814  815  
 815  816          for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 816  817              zone = list_next(&ctl->zsctl_zones, zone)) {
 817  818                  if (strcmp(zone->zsz_name, zonename) == 0) {
 818  819                          if (zoneid != -1)
 819  820                                  zone->zsz_id = zoneid;
 820  821                          return (zone);
 821  822                  }
 822  823          }
 823  824          return (NULL);
 824  825  }
 825  826  
 826  827  static zsd_zone_t *
 827  828  zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
 828  829  {
 829  830          zsd_zone_t *zone;
 830  831  
 831  832          for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 832  833              zone = list_next(&ctl->zsctl_zones, zone)) {
 833  834                  if (zone->zsz_id == zoneid)
 834  835                          return (zone);
 835  836          }
 836  837          return (NULL);
 837  838  }
 838  839  
 839  840  static zsd_zone_t *
 840  841  zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 841  842  {
 842  843          zsd_zone_t *zone;
 843  844  
 844  845          if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
 845  846                  return (NULL);
 846  847  
 847  848          (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
 848  849          zone->zsz_id = zoneid;
 849  850          zone->zsz_found = B_FALSE;
 850  851  
 851  852          /*
 852  853           * Allocate as deleted so if not found in first pass, zone is deleted
 853  854           * from list.  This can happen if zone is returned by zone_list, but
 854  855           * exits before first attempt to fetch zone details.
 855  856           */
 856  857          zone->zsz_start = g_now;
 857  858          zone->zsz_hrstart = g_hrnow;
 858  859          zone->zsz_deleted = B_TRUE;
 859  860  
 860  861          zone->zsz_cpu_shares = ZS_LIMIT_NONE;
 861  862          zone->zsz_cpu_cap = ZS_LIMIT_NONE;
 862  863          zone->zsz_ram_cap = ZS_LIMIT_NONE;
 863  864          zone->zsz_locked_cap = ZS_LIMIT_NONE;
 864  865          zone->zsz_vm_cap = ZS_LIMIT_NONE;
 865  866  
 866  867          zone->zsz_processes_cap = ZS_LIMIT_NONE;
 867  868          zone->zsz_lwps_cap = ZS_LIMIT_NONE;
 868  869          zone->zsz_shm_cap = ZS_LIMIT_NONE;
 869  870          zone->zsz_shmids_cap = ZS_LIMIT_NONE;
 870  871          zone->zsz_semids_cap = ZS_LIMIT_NONE;
 871  872          zone->zsz_msgids_cap = ZS_LIMIT_NONE;
 872  873          zone->zsz_lofi_cap = ZS_LIMIT_NONE;
 873  874  
 874  875          ctl->zsctl_nzones++;
 875  876  
 876  877          return (zone);
 877  878  }
 878  879  
 879  880  static zsd_zone_t *
 880  881  zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 881  882  {
 882  883          zsd_zone_t *zone, *tmp;
 883  884  
 884  885          if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
 885  886                  return (zone);
 886  887  
 887  888          if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
 888  889                  return (NULL);
 889  890  
 890  891          /* Insert sorted by zonename */
 891  892          tmp = list_head(&ctl->zsctl_zones);
 892  893          while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
 893  894                  tmp = list_next(&ctl->zsctl_zones, tmp);
 894  895  
 895  896          list_insert_before(&ctl->zsctl_zones, tmp, zone);
 896  897          return (zone);
 897  898  }
 898  899  
 899  900  /*
 900  901   * Mark all zones as not existing.  As zones are found, they will
 901  902   * be marked as existing.  If a zone is not found, then it must have
 902  903   * halted.
 903  904   */
 904  905  static void
 905  906  zsd_mark_zones_start(zsd_ctl_t *ctl)
 906  907  {
 907  908  
 908  909          zsd_zone_t *zone;
 909  910  
 910  911          for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 911  912              zone = list_next(&ctl->zsctl_zones, zone)) {
 912  913                  zone->zsz_found = B_FALSE;
 913  914          }
 914  915  }
 915  916  
 916  917  /*
 917  918   * Mark each zone as not using pset.  If processes are found using the
 918  919   * pset, the zone will remain bound to the pset.  If none of a zones
 919  920   * processes are bound to the pset, the zone's usage of the pset will
 920  921   * be deleted.
 921  922   *
 922  923   */
 923  924  static void
 924  925  zsd_mark_pset_usage_start(zsd_pset_t *pset)
 925  926  {
 926  927          zsd_pset_usage_t *usage;
 927  928  
 928  929          for (usage = list_head(&pset->zsp_usage_list);
 929  930              usage != NULL;
 930  931              usage = list_next(&pset->zsp_usage_list, usage)) {
 931  932                  usage->zsu_found = B_FALSE;
 932  933                  usage->zsu_empty = B_TRUE;
 933  934          }
 934  935  }
 935  936  
 936  937  /*
 937  938   * Mark each pset as not existing.  If a pset is found, it will be marked
 938  939   * as existing.  If a pset is not found, it wil be deleted.
 939  940   */
 940  941  static void
 941  942  zsd_mark_psets_start(zsd_ctl_t *ctl)
 942  943  {
 943  944          zsd_pset_t *pset;
 944  945  
 945  946          for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
 946  947              pset = list_next(&ctl->zsctl_psets, pset)) {
 947  948                  pset->zsp_found = B_FALSE;
 948  949                  zsd_mark_pset_usage_start(pset);
 949  950          }
 950  951  }
 951  952  
 952  953  /*
 953  954   * A pset was found.  Update its information
 954  955   */
 955  956  static void
 956  957  zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
 957  958      uint64_t size, uint64_t min, uint64_t max, int64_t importance)
 958  959  {
 959  960          pset->zsp_empty = B_TRUE;
 960  961          pset->zsp_deleted = B_FALSE;
 961  962  
 962  963          assert(pset->zsp_found == B_FALSE);
 963  964  
 964  965          /* update pset flags */
 965  966          if (pset->zsp_active == B_FALSE)
 966  967                  /* pset not seen on previous interval.  It is new. */
 967  968                  pset->zsp_new = B_TRUE;
 968  969          else
 969  970                  pset->zsp_new = B_FALSE;
 970  971  
 971  972          pset->zsp_found = B_TRUE;
 972  973          pset->zsp_cputype = type;
 973  974          pset->zsp_online = online;
 974  975          pset->zsp_size = size;
 975  976          pset->zsp_min = min;
 976  977          pset->zsp_max = max;
 977  978          pset->zsp_importance = importance;
 978  979          pset->zsp_cpu_shares = 0;
 979  980          pset->zsp_scheds = 0;
 980  981          pset->zsp_active = B_TRUE;
 981  982  }
 982  983  
 983  984  /*
 984  985   * A zone's process was found using a pset. Charge the process to the pset and
 985  986   * the per-zone data for the pset.
 986  987   */
 987  988  static void
 988  989  zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
 989  990  {
 990  991          zsd_zone_t *zone = usage->zsu_zone;
 991  992          zsd_pset_t *pset = usage->zsu_pset;
 992  993  
 993  994          /* Nothing to do if already found */
 994  995          if (usage->zsu_found == B_TRUE)
 995  996                  goto add_stats;
 996  997  
 997  998          usage->zsu_found = B_TRUE;
 998  999          usage->zsu_empty = B_FALSE;
 999 1000  
1000 1001          usage->zsu_deleted = B_FALSE;
1001 1002          /* update usage flags */
1002 1003          if (usage->zsu_active == B_FALSE)
1003 1004                  usage->zsu_new = B_TRUE;
1004 1005          else
1005 1006                  usage->zsu_new = B_FALSE;
1006 1007  
1007 1008          usage->zsu_scheds = 0;
1008 1009          usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1009 1010          usage->zsu_active = B_TRUE;
1010 1011          pset->zsp_empty = B_FALSE;
1011 1012          zone->zsz_empty = B_FALSE;
1012 1013  
1013 1014  add_stats:
1014 1015          /* Detect zone's pset id, and if it is bound to multiple psets */
1015 1016          if (zone->zsz_psetid == ZS_PSET_ERROR)
1016 1017                  zone->zsz_psetid = pset->zsp_id;
1017 1018          else if (zone->zsz_psetid != pset->zsp_id)
1018 1019                  zone->zsz_psetid = ZS_PSET_MULTI;
1019 1020  
1020 1021          usage->zsu_scheds |= sched;
1021 1022          pset->zsp_scheds |= sched;
1022 1023          zone->zsz_scheds |= sched;
1023 1024  
1024 1025          /* Record if FSS is co-habitating with conflicting scheduler */
1025 1026          if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1026 1027              usage->zsu_scheds & (
1027 1028              ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1028 1029                  usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1029 1030  
1030 1031                  pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1031 1032          }
1032 1033  
1033 1034  }
1034 1035  
1035 1036  /* Add cpu time for a process to a pset, zone, and system totals */
1036 1037  static void
1037 1038  zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1038 1039  {
1039 1040          zsd_system_t *system = ctl->zsctl_system;
1040 1041          zsd_zone_t *zone = usage->zsu_zone;
1041 1042          zsd_pset_t *pset = usage->zsu_pset;
1042 1043  
1043 1044          TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1044 1045          TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1045 1046          TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1046 1047          TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1047 1048  }
1048 1049  
1049 1050  /* Determine which processor sets have been deleted */
1050 1051  static void
1051 1052  zsd_mark_psets_end(zsd_ctl_t *ctl)
1052 1053  {
1053 1054          zsd_pset_t *pset, *tmp;
1054 1055  
1055 1056          /*
1056 1057           * Mark pset as not exists, and deleted if it existed
1057 1058           * previous interval.
1058 1059           */
1059 1060          pset = list_head(&ctl->zsctl_psets);
1060 1061          while (pset != NULL) {
1061 1062                  if (pset->zsp_found == B_FALSE) {
1062 1063                          pset->zsp_empty = B_TRUE;
1063 1064                          if (pset->zsp_deleted == B_TRUE) {
1064 1065                                  tmp = pset;
1065 1066                                  pset = list_next(&ctl->zsctl_psets, pset);
1066 1067                                  list_remove(&ctl->zsctl_psets, tmp);
1067 1068                                  free(tmp);
1068 1069                                  ctl->zsctl_npsets--;
1069 1070                                  continue;
1070 1071                          } else {
1071 1072                                  /* Pset vanished during this interval */
1072 1073                                  pset->zsp_new = B_FALSE;
1073 1074                                  pset->zsp_deleted = B_TRUE;
1074 1075                                  pset->zsp_active = B_TRUE;
1075 1076                          }
1076 1077                  }
1077 1078                  pset = list_next(&ctl->zsctl_psets, pset);
1078 1079          }
1079 1080  }
1080 1081  
1081 1082  /* Determine which zones are no longer bound to processor sets */
1082 1083  static void
1083 1084  zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1084 1085  {
1085 1086          zsd_pset_t *pset;
1086 1087          zsd_zone_t *zone;
1087 1088          zsd_pset_usage_t *usage, *tmp;
1088 1089  
1089 1090          /*
1090 1091           * Mark pset as not exists, and deleted if it existed previous
1091 1092           * interval.
1092 1093           */
1093 1094          for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1094 1095              pset = list_next(&ctl->zsctl_psets, pset)) {
1095 1096                  usage = list_head(&pset->zsp_usage_list);
1096 1097                  while (usage != NULL) {
1097 1098                          /*
1098 1099                           * Mark pset as not exists, and deleted if it existed
1099 1100                           * previous interval.
1100 1101                           */
1101 1102                          if (usage->zsu_found == B_FALSE ||
1102 1103                              usage->zsu_zone->zsz_deleted == B_TRUE ||
1103 1104                              usage->zsu_pset->zsp_deleted == B_TRUE) {
1104 1105                                  tmp = usage;
1105 1106                                  usage = list_next(&pset->zsp_usage_list,
1106 1107                                      usage);
1107 1108                                  list_remove(&pset->zsp_usage_list, tmp);
1108 1109                                  free(tmp);
1109 1110                                  pset->zsp_nusage--;
1110 1111                                  ctl->zsctl_npset_usages--;
1111 1112                                  continue;
1112 1113                          } else {
1113 1114                                  usage->zsu_new = B_FALSE;
1114 1115                                  usage->zsu_deleted = B_TRUE;
1115 1116                                  usage->zsu_active = B_TRUE;
1116 1117                          }
1117 1118                          /* Add cpu shares for usages that are in FSS */
1118 1119                          zone = usage->zsu_zone;
1119 1120                          if (usage->zsu_scheds & ZS_SCHED_FSS &&
1120 1121                              zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1121 1122                              zone->zsz_cpu_shares != 0) {
1122 1123                                  zone = usage->zsu_zone;
1123 1124                                  usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1124 1125                                  pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1125 1126                          }
1126 1127                          usage = list_next(&pset->zsp_usage_list,
1127 1128                              usage);
1128 1129                  }
1129 1130          }
1130 1131  }
1131 1132  
1132 1133  /* A zone has been found.  Update its information */
1133 1134  static void
1134 1135  zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1135 1136      uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1136 1137      uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1137 1138      uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1138 1139      uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1139 1140      uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1140 1141      uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1141 1142      uint_t iptype)
1142 1143  {
1143 1144          zsd_system_t *sys = ctl->zsctl_system;
1144 1145  
1145 1146          assert(zone->zsz_found == B_FALSE);
1146 1147  
1147 1148          /*
1148 1149           * Mark zone as exists, and new if it did not exist in previous
1149 1150           * interval.
1150 1151           */
1151 1152          zone->zsz_found = B_TRUE;
1152 1153          zone->zsz_empty = B_TRUE;
1153 1154          zone->zsz_deleted = B_FALSE;
1154 1155  
1155 1156          /*
1156 1157           * Zone is new.  Assume zone's properties are the same over entire
1157 1158           * interval.
1158 1159           */
1159 1160          if (zone->zsz_active == B_FALSE)
1160 1161                  zone->zsz_new = B_TRUE;
1161 1162          else
1162 1163                  zone->zsz_new = B_FALSE;
1163 1164  
1164 1165          (void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1165 1166          (void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1166 1167          zone->zsz_default_sched = sched;
1167 1168  
1168 1169          /* Schedulers updated later as processes are found */
1169 1170          zone->zsz_scheds = 0;
1170 1171  
1171 1172          /* Cpus updated later as psets bound are identified */
1172 1173          zone->zsz_cpus_online = 0;
1173 1174  
1174 1175          zone->zsz_cputype = cputype;
1175 1176          zone->zsz_iptype = iptype;
1176 1177          zone->zsz_psetid = ZS_PSET_ERROR;
1177 1178          zone->zsz_cpu_cap = cpu_cap;
1178 1179          zone->zsz_cpu_shares = cpu_shares;
1179 1180          zone->zsz_ram_cap = ram_cap;
1180 1181          zone->zsz_locked_cap = locked_cap;
1181 1182          zone->zsz_vm_cap = vm_cap;
1182 1183          zone->zsz_processes_cap = processes_cap;
1183 1184          zone->zsz_processes = processes;
1184 1185          zone->zsz_lwps_cap = lwps_cap;
1185 1186          zone->zsz_lwps = lwps;
1186 1187          zone->zsz_shm_cap = shm_cap;
1187 1188          zone->zsz_shm = shm;
1188 1189          zone->zsz_shmids_cap = shmids_cap;
1189 1190          zone->zsz_shmids = shmids;
1190 1191          zone->zsz_semids_cap = semids_cap;
1191 1192          zone->zsz_semids = semids;
1192 1193          zone->zsz_msgids_cap = msgids_cap;
1193 1194          zone->zsz_msgids = msgids;
1194 1195          zone->zsz_lofi_cap = lofi_cap;
1195 1196          zone->zsz_lofi = lofi;
1196 1197  
1197 1198          sys->zss_processes += processes;
1198 1199          sys->zss_lwps += lwps;
1199 1200          sys->zss_shm += shm;
1200 1201          sys->zss_shmids += shmids;
1201 1202          sys->zss_semids += semids;
1202 1203          sys->zss_msgids += msgids;
1203 1204          sys->zss_lofi += lofi;
1204 1205          zone->zsz_active = B_TRUE;
1205 1206  }
1206 1207  
1207 1208  
1208 1209  /* Determine which zones have halted */
1209 1210  static void
1210 1211  zsd_mark_zones_end(zsd_ctl_t *ctl)
1211 1212  {
1212 1213          zsd_zone_t *zone, *tmp;
1213 1214  
1214 1215          /*
1215 1216           * Mark zone as not existing, or delete if it did not exist in
1216 1217           * previous interval.
1217 1218           */
1218 1219          zone = list_head(&ctl->zsctl_zones);
1219 1220          while (zone != NULL) {
1220 1221                  if (zone->zsz_found == B_FALSE) {
1221 1222                          zone->zsz_empty = B_TRUE;
1222 1223                          if (zone->zsz_deleted == B_TRUE) {
1223 1224                                  /*
1224 1225                                   * Zone deleted in prior interval,
1225 1226                                   * so it no longer exists.
1226 1227                                   */
1227 1228                                  tmp = zone;
1228 1229                                  zone = list_next(&ctl->zsctl_zones, zone);
1229 1230                                  list_remove(&ctl->zsctl_zones, tmp);
1230 1231                                  free(tmp);
1231 1232                                  ctl->zsctl_nzones--;
1232 1233                                  continue;
1233 1234                          } else {
1234 1235                                  zone->zsz_new = B_FALSE;
1235 1236                                  zone->zsz_deleted = B_TRUE;
1236 1237                                  zone->zsz_active = B_TRUE;
1237 1238                          }
1238 1239                  }
1239 1240                  zone = list_next(&ctl->zsctl_zones, zone);
1240 1241          }
1241 1242  }
1242 1243  
1243 1244  /*
1244 1245   * Mark cpus as not existing.  If a cpu is found, it will be updated.  If
1245 1246   * a cpu is not found, then it must have gone offline, so it will be
1246 1247   * deleted.
1247 1248   *
1248 1249   * The kstat tracking data is rolled so that the usage since the previous
1249 1250   * interval can be determined.
1250 1251   */
1251 1252  static void
1252 1253  zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1253 1254  {
1254 1255          zsd_cpu_t *cpu;
1255 1256  
1256 1257          /*
1257 1258           * Mark all cpus as not existing.  As cpus are found, they will
1258 1259           * be marked as existing.
1259 1260           */
1260 1261          for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1261 1262              cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1262 1263                  cpu->zsc_found = B_FALSE;
1263 1264                  if (cpu->zsc_active == B_TRUE && roll) {
1264 1265                          cpu->zsc_psetid_prev = cpu->zsc_psetid;
1265 1266                          cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1266 1267                          cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1267 1268                          cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1268 1269                          cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1269 1270                  }
1270 1271          }
1271 1272  }
1272 1273  
1273 1274  /*
1274 1275   * An array the size of the maximum number of cpus is kept.  Within this array
1275 1276   * a list of the online cpus is maintained.
1276 1277   */
1277 1278  zsd_cpu_t *
1278 1279  zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1279 1280  {
1280 1281          zsd_cpu_t *cpu;
1281 1282  
1282 1283          assert(cpuid < ctl->zsctl_maxcpuid);
1283 1284          cpu = &(ctl->zsctl_cpu_array[cpuid]);
1284 1285          assert(cpuid == cpu->zsc_id);
1285 1286  
1286 1287          if (cpu->zsc_allocated == B_FALSE) {
1287 1288                  cpu->zsc_allocated = B_TRUE;
1288 1289                  list_insert_tail(&ctl->zsctl_cpus, cpu);
1289 1290          }
1290 1291          return (cpu);
1291 1292  }
1292 1293  
1293 1294  /* A cpu has been found.  Update its information */
1294 1295  static void
1295 1296  zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1296 1297  {
1297 1298          /*
1298 1299           * legacy processor sets, the cpu may move while zonestatd is
1299 1300           * inspecting, causing it to be found twice.  In this case, just
1300 1301           * leave cpu in the first processor set in which it was found.
1301 1302           */
1302 1303          if (cpu->zsc_found == B_TRUE)
1303 1304                  return;
1304 1305  
1305 1306          /* Mark cpu as online */
1306 1307          cpu->zsc_found = B_TRUE;
1307 1308          cpu->zsc_offlined = B_FALSE;
1308 1309          cpu->zsc_pset = pset;
1309 1310          /*
1310 1311           * cpu is newly online.
1311 1312           */
1312 1313          if (cpu->zsc_active == B_FALSE) {
1313 1314                  /*
1314 1315                   * Cpu is newly online.
1315 1316                   */
1316 1317                  cpu->zsc_onlined = B_TRUE;
1317 1318                  cpu->zsc_psetid = psetid;
1318 1319                  cpu->zsc_psetid_prev = psetid;
1319 1320          } else {
1320 1321                  /*
1321 1322                   * cpu online during previous interval.  Save properties at
1322 1323                   * start of interval
1323 1324                   */
1324 1325                  cpu->zsc_onlined = B_FALSE;
1325 1326                  cpu->zsc_psetid = psetid;
1326 1327  
1327 1328          }
1328 1329          cpu->zsc_active = B_TRUE;
1329 1330  }
1330 1331  
1331 1332  /* Remove all offlined cpus from the list of tracked cpus */
1332 1333  static void
1333 1334  zsd_mark_cpus_end(zsd_ctl_t *ctl)
1334 1335  {
1335 1336          zsd_cpu_t *cpu, *tmp;
1336 1337          int id;
1337 1338  
1338 1339          /* Mark cpu as online or offline */
1339 1340          cpu = list_head(&ctl->zsctl_cpus);
1340 1341          while (cpu != NULL) {
1341 1342                  if (cpu->zsc_found == B_FALSE) {
1342 1343                          if (cpu->zsc_offlined == B_TRUE) {
1343 1344                                  /*
1344 1345                                   * cpu offlined in prior interval. It is gone.
1345 1346                                   */
1346 1347                                  tmp = cpu;
1347 1348                                  cpu = list_next(&ctl->zsctl_cpus, cpu);
1348 1349                                  list_remove(&ctl->zsctl_cpus, tmp);
1349 1350                                  /* Clear structure for future use */
1350 1351                                  id = tmp->zsc_id;
1351 1352                                  bzero(tmp, sizeof (zsd_cpu_t));
1352 1353                                  tmp->zsc_id = id;
1353 1354                                  tmp->zsc_allocated = B_FALSE;
1354 1355                                  tmp->zsc_psetid = ZS_PSET_ERROR;
1355 1356                                  tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1356 1357  
1357 1358                          } else {
1358 1359                                  /*
1359 1360                                   * cpu online at start of interval.  Treat
1360 1361                                   * as still online, since it was online for
1361 1362                                   * some portion of the interval.
1362 1363                                   */
1363 1364                                  cpu->zsc_offlined = B_TRUE;
1364 1365                                  cpu->zsc_onlined = B_FALSE;
1365 1366                                  cpu->zsc_active = B_TRUE;
1366 1367                                  cpu->zsc_psetid = cpu->zsc_psetid_prev;
1367 1368                                  cpu->zsc_pset = NULL;
1368 1369                          }
1369 1370                  }
1370 1371                  cpu = list_next(&ctl->zsctl_cpus, cpu);
1371 1372          }
1372 1373  }
1373 1374  
1374 1375  /* Some utility functions for managing the list of processor sets */
1375 1376  static zsd_pset_t *
1376 1377  zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1377 1378  {
1378 1379          zsd_pset_t *pset;
1379 1380  
1380 1381          for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1381 1382              pset = list_next(&ctl->zsctl_psets, pset)) {
1382 1383                  if (pset->zsp_id == psetid)
1383 1384                          return (pset);
1384 1385          }
1385 1386          return (NULL);
1386 1387  }
1387 1388  
1388 1389  static zsd_pset_t *
1389 1390  zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1390 1391  {
1391 1392          zsd_pset_t *pset;
1392 1393  
1393 1394          for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1394 1395              pset = list_next(&ctl->zsctl_psets, pset)) {
1395 1396                  if (strcmp(pset->zsp_name, psetname) == 0) {
1396 1397                          if (psetid != -1)
1397 1398                                  pset->zsp_id = psetid;
1398 1399                          return (pset);
1399 1400                  }
1400 1401          }
1401 1402          return (NULL);
1402 1403  }
1403 1404  
1404 1405  static zsd_pset_t *
1405 1406  zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1406 1407  {
1407 1408          zsd_pset_t *pset;
1408 1409  
1409 1410          if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1410 1411                  return (NULL);
1411 1412  
1412 1413          (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1413 1414          pset->zsp_id = psetid;
1414 1415          pset->zsp_found = B_FALSE;
1415 1416          /*
1416 1417           * Allocate as deleted so if not found in first pass, pset is deleted
1417 1418           * from list.  This can happen if pset is returned by pset_list, but
1418 1419           * is destroyed before first attempt to fetch pset details.
1419 1420           */
1420 1421          list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1421 1422              offsetof(zsd_pset_usage_t, zsu_next));
1422 1423  
1423 1424          pset->zsp_hrstart = g_hrnow;
1424 1425          pset->zsp_deleted = B_TRUE;
1425 1426          pset->zsp_empty = B_TRUE;
1426 1427          ctl->zsctl_npsets++;
1427 1428  
1428 1429          return (pset);
1429 1430  }
1430 1431  
1431 1432  static zsd_pset_t *
1432 1433  zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1433 1434  {
1434 1435          zsd_pset_t *pset, *tmp;
1435 1436  
1436 1437          if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1437 1438                  return (pset);
1438 1439  
1439 1440          if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1440 1441                  return (NULL);
1441 1442  
1442 1443          /* Insert sorted by psetname */
1443 1444          tmp = list_head(&ctl->zsctl_psets);
1444 1445          while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1445 1446                  tmp = list_next(&ctl->zsctl_psets, tmp);
1446 1447  
1447 1448          list_insert_before(&ctl->zsctl_psets, tmp, pset);
1448 1449          return (pset);
1449 1450  }
1450 1451  
1451 1452  /* Some utility functions for managing the list of zones using each pset */
1452 1453  static zsd_pset_usage_t *
1453 1454  zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1454 1455  {
1455 1456          zsd_pset_usage_t *usage;
1456 1457  
1457 1458          for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1458 1459              usage = list_next(&pset->zsp_usage_list, usage))
1459 1460                  if (usage->zsu_zone == zone)
1460 1461                          return (usage);
1461 1462  
1462 1463          return (NULL);
1463 1464  }
1464 1465  
1465 1466  static zsd_pset_usage_t *
1466 1467  zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1467 1468  {
1468 1469          zsd_pset_usage_t *usage;
1469 1470  
1470 1471          if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1471 1472              == NULL)
1472 1473                  return (NULL);
1473 1474  
1474 1475          list_link_init(&usage->zsu_next);
1475 1476          usage->zsu_zone = zone;
1476 1477          usage->zsu_zoneid = zone->zsz_id;
1477 1478          usage->zsu_pset = pset;
1478 1479          usage->zsu_found = B_FALSE;
1479 1480          usage->zsu_active = B_FALSE;
1480 1481          usage->zsu_new = B_FALSE;
1481 1482          /*
1482 1483           * Allocate as not deleted.  If a process is found in a pset for
1483 1484           * a zone, the usage will not be deleted until at least the next
1484 1485           * interval.
1485 1486           */
1486 1487          usage->zsu_start = g_now;
1487 1488          usage->zsu_hrstart = g_hrnow;
1488 1489          usage->zsu_deleted = B_FALSE;
1489 1490          usage->zsu_empty = B_TRUE;
1490 1491          usage->zsu_scheds = 0;
1491 1492          usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1492 1493  
1493 1494          ctl->zsctl_npset_usages++;
1494 1495          pset->zsp_nusage++;
1495 1496  
1496 1497          return (usage);
1497 1498  }
1498 1499  
1499 1500  static zsd_pset_usage_t *
1500 1501  zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1501 1502  {
1502 1503          zsd_pset_usage_t *usage, *tmp;
1503 1504  
1504 1505          if ((usage = zsd_lookup_usage(pset, zone))
1505 1506              != NULL)
1506 1507                  return (usage);
1507 1508  
1508 1509          if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1509 1510                  return (NULL);
1510 1511  
1511 1512          tmp = list_head(&pset->zsp_usage_list);
1512 1513          while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1513 1514              > 0)
1514 1515                  tmp = list_next(&pset->zsp_usage_list, tmp);
1515 1516  
1516 1517          list_insert_before(&pset->zsp_usage_list, tmp, usage);
1517 1518          return (usage);
1518 1519  }
1519 1520  
1520 1521  static void
1521 1522  zsd_refresh_system(zsd_ctl_t *ctl)
1522 1523  {
1523 1524          zsd_system_t *system = ctl->zsctl_system;
1524 1525  
1525 1526          /* Re-count these values each interval */
1526 1527          system->zss_processes = 0;
1527 1528          system->zss_lwps = 0;
1528 1529          system->zss_shm = 0;
1529 1530          system->zss_shmids = 0;
1530 1531          system->zss_semids = 0;
1531 1532          system->zss_msgids = 0;
1532 1533          system->zss_lofi = 0;
1533 1534  }
1534 1535  
1535 1536  
1536 1537  /* Reads each cpu's kstats, and adds the usage to the cpu's pset */
1537 1538  static void
1538 1539  zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
1539 1540  {
1540 1541          zsd_system_t *sys;
1541 1542          processorid_t cpuid;
1542 1543          zsd_pset_t *pset_prev;
1543 1544          zsd_pset_t *pset;
1544 1545          kstat_t *kstat;
1545 1546          kstat_named_t *knp;
1546 1547          kid_t kid;
1547 1548          uint64_t idle, intr, kern, user;
1548 1549  
1549 1550          sys = ctl->zsctl_system;
1550 1551          pset = cpu->zsc_pset;
1551 1552          knp = NULL;
1552 1553          kid = -1;
1553 1554          cpuid = cpu->zsc_id;
1554 1555  
1555 1556          /* Get the cpu time totals for this cpu */
1556 1557          kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
1557 1558          if (kstat == NULL)
1558 1559                  return;
1559 1560  
1560 1561          kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
1561 1562          if (kid == -1)
1562 1563                  return;
1563 1564  
1564 1565          knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
1565 1566          if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1566 1567                  return;
1567 1568  
1568 1569          idle = knp->value.ui64;
1569 1570  
1570 1571          knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
1571 1572          if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1572 1573                  return;
1573 1574  
1574 1575          kern = knp->value.ui64;
1575 1576  
1576 1577          knp = kstat_data_lookup(kstat, "cpu_nsec_user");
1577 1578          if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1578 1579                  return;
1579 1580  
1580 1581          user = knp->value.ui64;
1581 1582  
1582 1583          /*
1583 1584           * Tracking intr time per cpu just exists for future enhancements.
1584 1585           * The value is presently always zero.
1585 1586           */
1586 1587          intr = 0;
1587 1588          cpu->zsc_nsec_idle = idle;
1588 1589          cpu->zsc_nsec_intr = intr;
1589 1590          cpu->zsc_nsec_kern = kern;
1590 1591          cpu->zsc_nsec_user = user;
1591 1592  
1592 1593          if (cpu->zsc_onlined == B_TRUE) {
1593 1594                  /*
1594 1595                   * cpu is newly online.  There is no reference value,
1595 1596                   * so just record its current stats for comparison
1596 1597                   * on next stat read.
1597 1598                   */
1598 1599                  cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1599 1600                  cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1600 1601                  cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1601 1602                  cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1602 1603                  return;
1603 1604          }
1604 1605  
1605 1606          /*
1606 1607           * Calculate relative time since previous refresh.
1607 1608           * Paranoia.  Don't let time  go backwards.
1608 1609           */
1609 1610          idle = intr = kern = user = 0;
1610 1611          if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
1611 1612                  idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;
1612 1613  
1613 1614          if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
1614 1615                  intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;
1615 1616  
1616 1617          if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
1617 1618                  kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;
1618 1619  
1619 1620          if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
1620 1621                  user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;
1621 1622  
1622 1623          /* Update totals for cpu usage */
1623 1624          TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
1624 1625          TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
1625 1626          TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
1626 1627          TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);
1627 1628  
1628 1629          /*
1629 1630           * Add cpu's stats to its pset if it is known to be in
1630 1631           * the pset since previous read.
1631 1632           */
1632 1633          if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
1633 1634              cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
1634 1635              (pset_prev = zsd_lookup_pset_byid(ctl,
1635 1636              cpu->zsc_psetid_prev)) == NULL) {
1636 1637                  TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
1637 1638                  TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
1638 1639                  TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
1639 1640                  TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
1640 1641          } else {
1641 1642                  /*
1642 1643                   * Last pset was different than current pset.
1643 1644                   * Best guess is to split usage between the two.
1644 1645                   */
1645 1646                  TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
1646 1647                  TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
1647 1648                  TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
1648 1649                  TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);
1649 1650  
1650 1651                  TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
1651 1652                      (idle / 2) + (idle % 2));
1652 1653                  TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
1653 1654                      (intr / 2) + (intr % 2));
1654 1655                  TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
1655 1656                      (kern / 2) + (kern % 2));
1656 1657                  TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
1657 1658                      (user / 2) + (user % 2));
1658 1659          }
1659 1660          TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
1660 1661          TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
1661 1662          TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
1662 1663          TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
1663 1664  }
1664 1665  
1665 1666  /* Determine the details of a processor set by pset_id */
1666 1667  static int
1667 1668  zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1668 1669      size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1669 1670      uint64_t *min, uint64_t *max, int64_t *importance)
1670 1671  {
1671 1672          uint_t old, num;
1672 1673  
1673 1674          pool_conf_t *conf = ctl->zsctl_pool_conf;
1674 1675          pool_value_t **vals = ctl->zsctl_pool_vals;
1675 1676          pool_resource_t **res_list = NULL;
1676 1677          pool_resource_t *pset;
1677 1678          pool_component_t **cpus = NULL;
1678 1679          processorid_t *cache;
1679 1680          const char *string;
1680 1681          uint64_t uint64;
1681 1682          int64_t int64;
1682 1683          int i, ret, type;
1683 1684  
1684 1685          if (ctl->zsctl_pool_status == POOL_DISABLED) {
1685 1686  
1686 1687                  /*
1687 1688                   * Inspect legacy psets
1688 1689                   */
1689 1690                  for (;;) {
1690 1691                          old = num = ctl->zsctl_cpu_ncache;
1691 1692                          ret = pset_info(psetid, &type, &num,
1692 1693                              ctl->zsctl_cpu_cache);
1693 1694                          if (ret < 0) {
1694 1695                                  /* pset is gone.  Tell caller to retry */
1695 1696                                  errno = EINTR;
1696 1697                                  return (-1);
1697 1698                          }
1698 1699                          if (num <= old) {
1699 1700                          /* Success */
1700 1701                                  break;
1701 1702                          }
1702 1703                          if ((cache = (processorid_t *)realloc(
1703 1704                              ctl->zsctl_cpu_cache, num *
1704 1705                              sizeof (processorid_t))) != NULL) {
1705 1706                                  ctl->zsctl_cpu_ncache = num;
1706 1707                                  ctl->zsctl_cpu_cache = cache;
1707 1708                          } else {
1708 1709                                  /*
1709 1710                                   * Could not allocate to get new cpu list.
1710 1711                                   */
1711 1712                                  zsd_warn(gettext(
1712 1713                                      "Could not allocate for cpu list"));
1713 1714                                  errno = ENOMEM;
1714 1715                                  return (-1);
1715 1716                          }
1716 1717                  }
1717 1718                  /*
1718 1719                   * Old school pset.  Just make min and max equal
1719 1720                   * to its size
1720 1721                   */
1721 1722                  if (psetid == ZS_PSET_DEFAULT) {
1722 1723                          *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1723 1724                          (void) strlcpy(psetname, "pset_default", namelen);
1724 1725                  } else {
1725 1726                          *cputype = ZS_CPUTYPE_PSRSET_PSET;
1726 1727                          (void) snprintf(psetname, namelen,
1727 1728                              "SUNWlegacy_pset_%d", psetid);
1728 1729                  }
1729 1730  
1730 1731                  /*
1731 1732                   * Just treat legacy pset as a simple pool pset
1732 1733                   */
1733 1734                  *online = num;
1734 1735                  *size = num;
1735 1736                  *min = num;
1736 1737                  *max = num;
1737 1738                  *importance = 1;
1738 1739  
1739 1740                  return (0);
1740 1741          }
1741 1742  
1742 1743          /* Look up the pool pset using the pset id */
1743 1744          res_list = NULL;
1744 1745          pool_value_set_int64(vals[1], psetid);
1745 1746          if (pool_value_set_name(vals[1], "pset.sys_id")
1746 1747              != PO_SUCCESS)
1747 1748                  goto err;
1748 1749  
1749 1750          if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1750 1751                  goto err;
1751 1752          if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1752 1753                  goto err;
1753 1754          if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1754 1755                  goto err;
1755 1756          if (num != 1)
1756 1757                  goto err;
1757 1758          pset = res_list[0];
1758 1759          free(res_list);
1759 1760          res_list = NULL;
1760 1761          if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1761 1762              "pset.name", vals[0]) != POC_STRING ||
1762 1763              pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1763 1764                  goto err;
1764 1765  
1765 1766          (void) strlcpy(psetname, string, namelen);
1766 1767          if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1767 1768                  *cputype = ZS_CPUTYPE_DEDICATED;
1768 1769          else if (psetid == ZS_PSET_DEFAULT)
1769 1770                  *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1770 1771          else
1771 1772                  *cputype = ZS_CPUTYPE_POOL_PSET;
1772 1773  
1773 1774          /* Get size, min, max, and importance */
1774 1775          if (pool_get_property(conf, pool_resource_to_elem(conf,
1775 1776              pset), "pset.size", vals[0]) == POC_UINT &&
1776 1777              pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1777 1778                  *size = uint64;
1778 1779          else
1779 1780                  *size = 0;
1780 1781  
1781 1782                  /* Get size, min, max, and importance */
1782 1783          if (pool_get_property(conf, pool_resource_to_elem(conf,
1783 1784              pset), "pset.min", vals[0]) == POC_UINT &&
1784 1785              pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1785 1786                  *min = uint64;
1786 1787          else
1787 1788                  *min = 0;
1788 1789          if (*min >= ZSD_PSET_UNLIMITED)
1789 1790                  *min = ZS_LIMIT_NONE;
1790 1791  
1791 1792          if (pool_get_property(conf, pool_resource_to_elem(conf,
1792 1793              pset), "pset.max", vals[0]) == POC_UINT &&
1793 1794              pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1794 1795                  *max = uint64;
1795 1796          else
1796 1797                  *max = ZS_LIMIT_NONE;
1797 1798  
1798 1799          if (*max >= ZSD_PSET_UNLIMITED)
1799 1800                  *max = ZS_LIMIT_NONE;
1800 1801  
1801 1802          if (pool_get_property(conf, pool_resource_to_elem(conf,
1802 1803              pset), "pset.importance", vals[0]) == POC_INT &&
1803 1804              pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1804 1805                  *importance = int64;
1805 1806          else
1806 1807                  *importance = (uint64_t)1;
1807 1808  
1808 1809          *online = 0;
1809 1810          if (*size == 0)
1810 1811                  return (0);
1811 1812  
1812 1813          /* get cpus */
1813 1814          cpus = pool_query_resource_components(conf, pset, &num, NULL);
1814 1815          if (cpus == NULL)
1815 1816                  goto err;
1816 1817  
1817 1818          /* Make sure there is space for cpu id list */
1818 1819          if (num > ctl->zsctl_cpu_ncache) {
1819 1820                  if ((cache = (processorid_t *)realloc(
1820 1821                      ctl->zsctl_cpu_cache, num *
1821 1822                      sizeof (processorid_t))) != NULL) {
1822 1823                          ctl->zsctl_cpu_ncache = num;
1823 1824                          ctl->zsctl_cpu_cache = cache;
1824 1825                  } else {
1825 1826                          /*
1826 1827                           * Could not allocate to get new cpu list.
1827 1828                           */
1828 1829                          zsd_warn(gettext(
1829 1830                              "Could not allocate for cpu list"));
1830 1831                          goto err;
1831 1832                  }
1832 1833          }
1833 1834  
1834 1835          /* count the online cpus */
1835 1836          for (i = 0; i < num; i++) {
1836 1837                  if (pool_get_property(conf, pool_component_to_elem(
1837 1838                      conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1838 1839                      pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1839 1840                          goto err;
1840 1841  
1841 1842                  if (strcmp(string, "on-line") != 0 &&
1842 1843                      strcmp(string, "no-intr") != 0)
1843 1844                          continue;
1844 1845  
1845 1846                  if (pool_get_property(conf, pool_component_to_elem(
1846 1847                      conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1847 1848                      pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1848 1849                          goto err;
1849 1850  
1850 1851                  (*online)++;
1851 1852                  ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1852 1853          }
1853 1854          free(cpus);
1854 1855          return (0);
1855 1856  err:
1856 1857          if (res_list != NULL)
1857 1858                  free(res_list);
1858 1859          if (cpus != NULL)
1859 1860                  free(cpus);
1860 1861  
1861 1862          /*
1862 1863           * The pools operations should succeed since the conf is a consistent
1863 1864           * snapshot.  Tell caller there is no need to retry.
1864 1865           */
1865 1866          errno = EINVAL;
1866 1867          return (-1);
1867 1868  }
1868 1869  
1869 1870  /*
1870 1871   * Update the current list of processor sets.
1871 1872   * This also updates the list of online cpus, and each cpu's pset membership.
1872 1873   */
1873 1874  static void
1874 1875  zsd_refresh_psets(zsd_ctl_t *ctl)
1875 1876  {
1876 1877          int i, j, ret, state;
1877 1878          uint_t old, num;
1878 1879          uint_t cputype;
1879 1880          int64_t sys_id, importance;
1880 1881          uint64_t online, size, min, max;
1881 1882          zsd_system_t *system;
1882 1883          zsd_pset_t *pset;
1883 1884          zsd_cpu_t *cpu;
1884 1885          psetid_t *cache;
1885 1886          char psetname[ZS_PSETNAME_MAX];
1886 1887          processorid_t cpuid;
1887 1888          pool_value_t *pv_save = NULL;
1888 1889          pool_resource_t **res_list = NULL;
1889 1890          pool_resource_t *res;
1890 1891          pool_value_t **vals;
1891 1892          pool_conf_t *conf;
1892 1893          boolean_t roll_cpus = B_TRUE;
1893 1894  
1894 1895          /* Zero cpu counters to recount them */
1895 1896          system = ctl->zsctl_system;
1896 1897          system->zss_ncpus = 0;
1897 1898          system->zss_ncpus_online = 0;
1898 1899  retry:
1899 1900          ret = pool_get_status(&state);
1900 1901          if (ret == 0 && state == POOL_ENABLED) {
1901 1902  
1902 1903                  conf = ctl->zsctl_pool_conf;
1903 1904                  vals = ctl->zsctl_pool_vals;
1904 1905                  pv_save = vals[1];
1905 1906                  vals[1] = NULL;
1906 1907  
1907 1908                  if (ctl->zsctl_pool_status == POOL_DISABLED) {
1908 1909                          if (pool_conf_open(ctl->zsctl_pool_conf,
1909 1910                              pool_dynamic_location(), PO_RDONLY) == 0) {
1910 1911                                  ctl->zsctl_pool_status = POOL_ENABLED;
1911 1912                                  ctl->zsctl_pool_changed = POU_PSET;
1912 1913                          }
1913 1914                  } else {
1914 1915                          ctl->zsctl_pool_changed = 0;
1915 1916                          ret = pool_conf_update(ctl->zsctl_pool_conf,
1916 1917                              &(ctl->zsctl_pool_changed));
1917 1918                          if (ret < 0) {
1918 1919                                  /* Pools must have become disabled */
1919 1920                                  (void) pool_conf_close(ctl->zsctl_pool_conf);
1920 1921                                  ctl->zsctl_pool_status = POOL_DISABLED;
1921 1922                                  if (pool_error() == POE_SYSTEM && errno ==
1922 1923                                      ENOTACTIVE)
1923 1924                                          goto retry;
1924 1925  
1925 1926                                  zsd_warn(gettext(
1926 1927                                      "Unable to update pool configuration"));
1927 1928                                  /* Not able to get pool info.  Don't update. */
1928 1929                                  goto err;
1929 1930                          }
1930 1931                  }
1931 1932                  /* Get the list of psets using libpool */
1932 1933                  if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1933 1934                          goto err;
1934 1935  
1935 1936                  if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1936 1937                          goto err;
1937 1938                  if ((res_list = pool_query_resources(conf, &num, vals))
1938 1939                      == NULL)
1939 1940                          goto err;
1940 1941  
1941 1942                  if (num > ctl->zsctl_pset_ncache)  {
1942 1943                          if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1943 1944                              (num) * sizeof (psetid_t))) == NULL) {
1944 1945                                  goto err;
1945 1946                          }
1946 1947                          ctl->zsctl_pset_ncache = num;
1947 1948                          ctl->zsctl_pset_cache = cache;
1948 1949                  }
1949 1950                  /* Save the pset id of each pset */
1950 1951                  for (i = 0; i < num; i++) {
1951 1952                          res = res_list[i];
1952 1953                          if (pool_get_property(conf, pool_resource_to_elem(conf,
1953 1954                              res), "pset.sys_id", vals[0]) != POC_INT ||
1954 1955                              pool_value_get_int64(vals[0], &sys_id)
1955 1956                              != PO_SUCCESS)
1956 1957                                  goto err;
1957 1958                          ctl->zsctl_pset_cache[i] = (int)sys_id;
1958 1959                  }
1959 1960                  vals[1] = pv_save;
1960 1961                  pv_save = NULL;
1961 1962          } else {
1962 1963                  if (ctl->zsctl_pool_status == POOL_ENABLED) {
1963 1964                          (void) pool_conf_close(ctl->zsctl_pool_conf);
1964 1965                          ctl->zsctl_pool_status = POOL_DISABLED;
1965 1966                  }
1966 1967                  /* Get the pset list using legacy psets */
1967 1968                  for (;;) {
1968 1969                          old = num = ctl->zsctl_pset_ncache;
1969 1970                          (void) pset_list(ctl->zsctl_pset_cache, &num);
1970 1971                          if ((num + 1) <= old) {
1971 1972                                  break;
1972 1973                          }
1973 1974                          if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1974 1975                              (num + 1) * sizeof (psetid_t))) != NULL) {
1975 1976                                  ctl->zsctl_pset_ncache = num + 1;
1976 1977                                  ctl->zsctl_pset_cache = cache;
1977 1978                          } else {
1978 1979                                  /*
1979 1980                                   * Could not allocate to get new pset list.
1980 1981                                   * Give up
1981 1982                                   */
1982 1983                                  return;
1983 1984                          }
1984 1985                  }
1985 1986                  /* Add the default pset to list */
1986 1987                  ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
1987 1988                  ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
1988 1989                  num++;
1989 1990          }
1990 1991  psets_changed:
1991 1992          zsd_mark_cpus_start(ctl, roll_cpus);
1992 1993          zsd_mark_psets_start(ctl);
1993 1994          roll_cpus = B_FALSE;
1994 1995  
1995 1996          /* Refresh cpu membership of all psets */
1996 1997          for (i = 0; i < num; i++) {
1997 1998  
1998 1999                  /* Get pool pset information */
1999 2000                  sys_id = ctl->zsctl_pset_cache[i];
2000 2001                  if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
2001 2002                      &cputype, &online, &size, &min, &max, &importance)
2002 2003                      != 0) {
2003 2004                          if (errno == EINTR)
2004 2005                                  goto psets_changed;
2005 2006                          zsd_warn(gettext("Failed to get info for pset %d"),
2006 2007                              sys_id);
2007 2008                          continue;
2008 2009                  }
2009 2010  
2010 2011                  system->zss_ncpus += size;
2011 2012                  system->zss_ncpus_online += online;
2012 2013  
2013 2014                  pset = zsd_lookup_insert_pset(ctl, psetname,
2014 2015                      ctl->zsctl_pset_cache[i]);
2015 2016  
2016 2017                  /* update pset info */
2017 2018                  zsd_mark_pset_found(pset, cputype, online, size, min,
2018 2019                      max, importance);
2019 2020  
2020 2021                  /* update each cpu in pset */
2021 2022                  for (j = 0; j < pset->zsp_online; j++) {
2022 2023                          cpuid = ctl->zsctl_cpu_cache[j];
2023 2024                          cpu = zsd_lookup_insert_cpu(ctl, cpuid);
2024 2025                          zsd_mark_cpu_found(cpu, pset, sys_id);
2025 2026                  }
2026 2027          }
2027 2028  err:
2028 2029          if (res_list != NULL)
2029 2030                  free(res_list);
2030 2031          if (pv_save != NULL)
2031 2032                  vals[1] = pv_save;
2032 2033  }
2033 2034  
2034 2035  
2035 2036  
2036 2037  /*
2037 2038   * Fetch the current pool and pset name for the given zone.
2038 2039   */
2039 2040  static void
2040 2041  zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2041 2042      char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2042 2043  {
2043 2044          poolid_t poolid;
2044 2045          pool_t **pools = NULL;
2045 2046          pool_resource_t **res_list = NULL;
2046 2047          char poolname[ZS_POOLNAME_MAX];
2047 2048          char psetname[ZS_PSETNAME_MAX];
2048 2049          pool_conf_t *conf = ctl->zsctl_pool_conf;
2049 2050          pool_value_t *pv_save = NULL;
2050 2051          pool_value_t **vals = ctl->zsctl_pool_vals;
2051 2052          const char *string;
2052 2053          int ret;
2053 2054          int64_t int64;
2054 2055          uint_t num;
2055 2056  
2056 2057          ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2057 2058              &poolid, sizeof (poolid));
2058 2059          if (ret < 0)
2059 2060                  goto lookup_done;
2060 2061  
2061 2062          pv_save = vals[1];
2062 2063          vals[1] = NULL;
2063 2064          pools = NULL;
2064 2065          res_list = NULL;
2065 2066  
2066 2067          /* Default values if lookup fails */
2067 2068          (void) strlcpy(poolname, "pool_default", sizeof (poolname));
2068 2069          (void) strlcpy(psetname, "pset_default", sizeof (poolname));
2069 2070          *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2070 2071  
2071 2072          /* no dedicated cpu if pools are disabled */
2072 2073          if (ctl->zsctl_pool_status == POOL_DISABLED)
2073 2074                  goto lookup_done;
2074 2075  
2075 2076          /* Get the pool name using the id */
2076 2077          pool_value_set_int64(vals[0], poolid);
2077 2078          if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2078 2079                  goto lookup_done;
2079 2080  
2080 2081          if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2081 2082                  goto lookup_done;
2082 2083  
2083 2084          if (num != 1)
2084 2085                  goto lookup_done;
2085 2086  
2086 2087          if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2087 2088              "pool.name", vals[0]) != POC_STRING ||
2088 2089              pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2089 2090                  goto lookup_done;
2090 2091          (void) strlcpy(poolname, (char *)string, sizeof (poolname));
2091 2092  
2092 2093          /* Get the name of the pset for the pool */
2093 2094          if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2094 2095                  goto lookup_done;
2095 2096  
2096 2097          if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2097 2098                  goto lookup_done;
2098 2099  
2099 2100          if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2100 2101              == NULL)
2101 2102                  goto lookup_done;
2102 2103  
2103 2104          if (num != 1)
2104 2105                  goto lookup_done;
2105 2106  
2106 2107          if (pool_get_property(conf, pool_resource_to_elem(conf,
2107 2108              res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2108 2109              pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2109 2110                  goto lookup_done;
2110 2111  
2111 2112          if (int64 == ZS_PSET_DEFAULT)
2112 2113                  *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2113 2114  
2114 2115          if (pool_get_property(conf, pool_resource_to_elem(conf,
2115 2116              res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2116 2117              pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2117 2118                  goto lookup_done;
2118 2119  
2119 2120          (void) strlcpy(psetname, (char *)string, sizeof (psetname));
2120 2121  
2121 2122          if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2122 2123                  *cputype = ZS_CPUTYPE_DEDICATED;
2123 2124          if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2124 2125                  *cputype = ZS_CPUTYPE_PSRSET_PSET;
2125 2126          else
2126 2127                  *cputype = ZS_CPUTYPE_POOL_PSET;
2127 2128  
2128 2129  lookup_done:
2129 2130  
2130 2131          if (pv_save != NULL)
2131 2132                  vals[1] = pv_save;
2132 2133  
2133 2134          if (res_list)
2134 2135                  free(res_list);
2135 2136          if (pools)
2136 2137                  free(pools);
2137 2138  
2138 2139          (void) strlcpy(pool, poolname, poollen);
2139 2140          (void) strlcpy(pset, psetname, psetlen);
2140 2141  }
2141 2142  
2142 2143  /* Convert scheduler names to ZS_* scheduler flags */
2143 2144  static uint_t
2144 2145  zsd_schedname2int(char *clname, int pri)
2145 2146  {
2146 2147          uint_t sched = 0;
2147 2148  
2148 2149          if (strcmp(clname, "TS") == 0) {
2149 2150                  sched = ZS_SCHED_TS;
2150 2151          } else if (strcmp(clname, "IA") == 0) {
2151 2152                  sched = ZS_SCHED_IA;
2152 2153          } else if (strcmp(clname, "FX") == 0) {
2153 2154                  if (pri > 59) {
2154 2155                          sched = ZS_SCHED_FX_60;
2155 2156                  } else {
2156 2157                          sched = ZS_SCHED_FX;
2157 2158                  }
2158 2159          } else if (strcmp(clname, "RT") == 0) {
2159 2160                  sched = ZS_SCHED_RT;
2160 2161  
2161 2162          } else if (strcmp(clname, "FSS") == 0) {
2162 2163                  sched = ZS_SCHED_FSS;
2163 2164          }
2164 2165          return (sched);
2165 2166  }
2166 2167  
2167 2168  static uint64_t
2168 2169  zsd_get_zone_rctl_limit(char *name)
2169 2170  {
2170 2171          rctlblk_t *rblk;
2171 2172  
2172 2173          rblk = (rctlblk_t *)alloca(rctlblk_size());
2173 2174          if (getrctl(name, NULL, rblk, RCTL_FIRST)
2174 2175              != 0) {
2175 2176                  return (ZS_LIMIT_NONE);
2176 2177          }
2177 2178          return (rctlblk_get_value(rblk));
2178 2179  }
2179 2180  
2180 2181  static uint64_t
2181 2182  zsd_get_zone_rctl_usage(char *name)
2182 2183  {
  
    | 
      ↓ open down ↓ | 
    2149 lines elided | 
    
      ↑ open up ↑ | 
  
2183 2184          rctlblk_t *rblk;
2184 2185  
2185 2186          rblk = (rctlblk_t *)alloca(rctlblk_size());
2186 2187          if (getrctl(name, NULL, rblk, RCTL_USAGE)
2187 2188              != 0) {
2188 2189                  return (0);
2189 2190          }
2190 2191          return (rctlblk_get_value(rblk));
2191 2192  }
2192 2193  
2193      -#define ZSD_NUM_RCTL_VALS 19
     2194 +#define ZSD_NUM_RCTL_VALS 20
2194 2195  
2195 2196  /*
2196 2197   * Fetch the limit information for a zone.  This uses zone_enter() as the
2197 2198   * getrctl(2) system call only returns rctl information for the zone of
2198 2199   * the caller.
2199 2200   */
2200 2201  static int
2201 2202  zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2202 2203      uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2203 2204      uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2204 2205      uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2205 2206      uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2206 2207      uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2207 2208      uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2208 2209  {
2209 2210          int p[2], pid, tmpl_fd, ret;
2210 2211          ctid_t ct;
2211 2212          char class[PC_CLNMSZ];
2212 2213          uint64_t vals[ZSD_NUM_RCTL_VALS];
2213 2214          zsd_system_t *sys = ctl->zsctl_system;
2214 2215          int i = 0;
2215 2216          int res = 0;
2216 2217  
2217 2218          /* Treat all caps as no cap on error */
2218 2219          *cpu_shares = ZS_LIMIT_NONE;
2219 2220          *cpu_cap = ZS_LIMIT_NONE;
2220 2221          *ram_cap = ZS_LIMIT_NONE;
2221 2222          *locked_cap = ZS_LIMIT_NONE;
2222 2223          *vm_cap = ZS_LIMIT_NONE;
2223 2224  
2224 2225          *processes_cap = ZS_LIMIT_NONE;
2225 2226          *lwps_cap = ZS_LIMIT_NONE;
2226 2227          *shm_cap = ZS_LIMIT_NONE;
2227 2228          *shmids_cap = ZS_LIMIT_NONE;
2228 2229          *semids_cap = ZS_LIMIT_NONE;
2229 2230          *msgids_cap = ZS_LIMIT_NONE;
  
    | 
      ↓ open down ↓ | 
    26 lines elided | 
    
      ↑ open up ↑ | 
  
2230 2231          *lofi_cap = ZS_LIMIT_NONE;
2231 2232  
2232 2233          *processes = 0;
2233 2234          *lwps = 0;
2234 2235          *shm = 0;
2235 2236          *shmids = 0;
2236 2237          *semids = 0;
2237 2238          *msgids = 0;
2238 2239          *lofi = 0;
2239 2240  
2240      -        /* Get the ram cap first since it is a zone attr */
2241      -        ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
2242      -            ram_cap, sizeof (*ram_cap));
2243      -        if (ret < 0 || *ram_cap == 0)
2244      -                *ram_cap = ZS_LIMIT_NONE;
2245      -
2246 2241          /* Get the zone's default scheduling class */
2247 2242          ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2248 2243              class, sizeof (class));
2249 2244          if (ret < 0)
2250 2245                  return (-1);
2251 2246  
2252 2247          *sched = zsd_schedname2int(class, 0);
2253 2248  
2254 2249          /* rctl caps must be fetched from within the zone */
2255 2250          if (pipe(p) != 0)
2256 2251                  return (-1);
2257 2252  
2258 2253          if ((tmpl_fd = init_template()) == -1) {
2259 2254                  (void) close(p[0]);
2260 2255                  (void) close(p[1]);
2261 2256                  return (-1);
2262 2257          }
2263 2258          pid = forkx(0);
2264 2259          if (pid < 0) {
2265 2260                  (void) ct_tmpl_clear(tmpl_fd);
2266 2261                  (void) close(p[0]);
2267 2262                  (void) close(p[1]);
2268 2263                  return (-1);
2269 2264          }
2270 2265          if (pid == 0) {
2271 2266  
2272 2267                  (void) ct_tmpl_clear(tmpl_fd);
2273 2268                  (void) close(tmpl_fd);
2274 2269                  (void) close(p[0]);
2275 2270                  if (zone->zsz_id != getzoneid()) {
2276 2271                          if (zone_enter(zone->zsz_id) < 0) {
2277 2272                                  (void) close(p[1]);
2278 2273                                  _exit(0);
2279 2274                          }
2280 2275                  }
2281 2276  
2282 2277                  /* Get caps for zone, and write them to zonestatd parent. */
2283 2278                  vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2284 2279                  vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2285 2280                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2286 2281                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2287 2282                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2288 2283                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2289 2284                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2290 2285                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
  
    | 
      ↓ open down ↓ | 
    35 lines elided | 
    
      ↑ open up ↑ | 
  
2291 2286                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2292 2287                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2293 2288                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2294 2289                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2295 2290                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2296 2291                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2297 2292                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2298 2293                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2299 2294                  vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2300 2295                  vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
     2296 +                vals[i++] = zsd_get_zone_rctl_usage("zone.max-physical-memory");
2301 2297  
2302 2298                  if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2303 2299                      ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2304 2300                          (void) close(p[1]);
2305 2301                          _exit(1);
2306 2302                  }
2307 2303  
2308 2304                  (void) close(p[1]);
2309 2305                  _exit(0);
2310 2306          }
2311 2307          if (contract_latest(&ct) == -1)
2312 2308                  ct = -1;
2313 2309  
2314 2310          (void) ct_tmpl_clear(tmpl_fd);
2315 2311          (void) close(tmpl_fd);
2316 2312          (void) close(p[1]);
2317 2313          while (waitpid(pid, NULL, 0) != pid)
2318 2314                  ;
2319 2315  
2320 2316          /* Read cap from child in zone */
2321 2317          if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2322 2318              ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2323 2319                  res = -1;
2324 2320                  goto cleanup;
2325 2321          }
2326 2322          i = 0;
2327 2323          *cpu_shares = vals[i++];
2328 2324          *cpu_cap = vals[i++];
2329 2325          *locked_cap = vals[i++];
2330 2326          *vm_cap = vals[i++];
2331 2327          *processes_cap = vals[i++];
2332 2328          *processes = vals[i++];
2333 2329          *lwps_cap = vals[i++];
2334 2330          *lwps = vals[i++];
  
    | 
      ↓ open down ↓ | 
    24 lines elided | 
    
      ↑ open up ↑ | 
  
2335 2331          *shm_cap = vals[i++];
2336 2332          *shm = vals[i++];
2337 2333          *shmids_cap = vals[i++];
2338 2334          *shmids = vals[i++];
2339 2335          *semids_cap = vals[i++];
2340 2336          *semids = vals[i++];
2341 2337          *msgids_cap = vals[i++];
2342 2338          *msgids = vals[i++];
2343 2339          *lofi_cap = vals[i++];
2344 2340          *lofi = vals[i++];
     2341 +        *ram_cap = vals[i++];
2345 2342  
2346 2343          /* Interpret maximum values as no cap */
2347 2344          if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2348 2345                  *cpu_cap = ZS_LIMIT_NONE;
2349 2346          if (*processes_cap == sys->zss_processes_max)
2350 2347                  *processes_cap = ZS_LIMIT_NONE;
2351 2348          if (*lwps_cap == sys->zss_lwps_max)
2352 2349                  *lwps_cap = ZS_LIMIT_NONE;
2353 2350          if (*shm_cap == sys->zss_shm_max)
2354 2351                  *shm_cap = ZS_LIMIT_NONE;
2355 2352          if (*shmids_cap == sys->zss_shmids_max)
2356 2353                  *shmids_cap = ZS_LIMIT_NONE;
2357 2354          if (*semids_cap == sys->zss_semids_max)
2358 2355                  *semids_cap = ZS_LIMIT_NONE;
2359 2356          if (*msgids_cap == sys->zss_msgids_max)
2360 2357                  *msgids_cap = ZS_LIMIT_NONE;
2361 2358          if (*lofi_cap == sys->zss_lofi_max)
2362 2359                  *lofi_cap = ZS_LIMIT_NONE;
2363 2360  
2364 2361  
2365 2362  cleanup:
2366 2363          (void) close(p[0]);
2367 2364          (void) ct_tmpl_clear(tmpl_fd);
2368 2365          (void) close(tmpl_fd);
2369 2366          (void) contract_abandon_id(ct);
2370 2367  
2371 2368          return (res);
2372 2369  }
2373 2370  
2374 2371  /* Update the current list of running zones */
2375 2372  static void
2376 2373  zsd_refresh_zones(zsd_ctl_t *ctl)
2377 2374  {
2378 2375          zsd_zone_t *zone;
2379 2376          uint_t old, num;
2380 2377          ushort_t flags;
2381 2378          int i, ret;
2382 2379          zoneid_t *cache;
2383 2380          uint64_t cpu_shares;
2384 2381          uint64_t cpu_cap;
2385 2382          uint64_t ram_cap;
2386 2383          uint64_t locked_cap;
2387 2384          uint64_t vm_cap;
2388 2385          uint64_t processes_cap;
2389 2386          uint64_t processes;
2390 2387          uint64_t lwps_cap;
2391 2388          uint64_t lwps;
2392 2389          uint64_t shm_cap;
2393 2390          uint64_t shm;
2394 2391          uint64_t shmids_cap;
2395 2392          uint64_t shmids;
2396 2393          uint64_t semids_cap;
2397 2394          uint64_t semids;
2398 2395          uint64_t msgids_cap;
2399 2396          uint64_t msgids;
2400 2397          uint64_t lofi_cap;
2401 2398          uint64_t lofi;
2402 2399  
2403 2400          char zonename[ZS_ZONENAME_MAX];
2404 2401          char poolname[ZS_POOLNAME_MAX];
2405 2402          char psetname[ZS_PSETNAME_MAX];
2406 2403          uint_t sched;
2407 2404          uint_t cputype;
2408 2405          uint_t iptype;
2409 2406  
2410 2407          /* Get the current list of running zones */
2411 2408          for (;;) {
2412 2409                  old = num = ctl->zsctl_zone_ncache;
2413 2410                  (void) zone_list(ctl->zsctl_zone_cache, &num);
2414 2411                  if (num <= old)
2415 2412                          break;
2416 2413                  if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2417 2414                      (num) * sizeof (zoneid_t))) != NULL) {
2418 2415                          ctl->zsctl_zone_ncache = num;
2419 2416                          ctl->zsctl_zone_cache = cache;
2420 2417                  } else {
2421 2418                          /* Could not allocate to get new zone list.  Give up */
2422 2419                          return;
2423 2420                  }
2424 2421          }
2425 2422  
2426 2423          zsd_mark_zones_start(ctl);
2427 2424  
2428 2425          for (i = 0; i < num; i++) {
2429 2426  
2430 2427                  ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2431 2428                      zonename, sizeof (zonename));
2432 2429                  if (ret < 0)
2433 2430                          continue;
2434 2431  
2435 2432                  zone = zsd_lookup_insert_zone(ctl, zonename,
2436 2433                      ctl->zsctl_zone_cache[i]);
2437 2434  
2438 2435                  ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2439 2436                      &flags, sizeof (flags));
2440 2437                  if (ret < 0)
2441 2438                          continue;
2442 2439  
2443 2440                  if (flags & ZF_NET_EXCL)
2444 2441                          iptype = ZS_IPTYPE_EXCLUSIVE;
2445 2442                  else
2446 2443                          iptype = ZS_IPTYPE_SHARED;
2447 2444  
2448 2445                  zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2449 2446                      psetname, sizeof (psetname), &cputype);
2450 2447  
2451 2448                  if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2452 2449                      &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2453 2450                      &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2454 2451                      &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2455 2452                      &lofi, &sched) != 0)
2456 2453                          continue;
2457 2454  
2458 2455                  zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2459 2456                      locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2460 2457                      lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2461 2458                      semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2462 2459                      psetname, sched, cputype, iptype);
2463 2460          }
2464 2461  }
2465 2462  
2466 2463  /* Fetch the details of a process from its psinfo_t */
2467 2464  static void
2468 2465  zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2469 2466      psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2470 2467      timestruc_t *delta, uint_t *sched)
2471 2468  {
2472 2469          timestruc_t d;
2473 2470          zsd_proc_t *proc;
2474 2471  
2475 2472          /* Get cached data for proc */
2476 2473          proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2477 2474          *psetid = psinfo->pr_lwp.pr_bindpset;
2478 2475  
2479 2476          if (proc->zspr_psetid == ZS_PSET_ERROR)
2480 2477                  *prev_psetid = *psetid;
2481 2478          else
2482 2479                  *prev_psetid = proc->zspr_psetid;
2483 2480  
2484 2481          *zoneid = psinfo->pr_zoneid;
2485 2482          if (proc->zspr_zoneid == -1)
2486 2483                  *prev_zoneid = *zoneid;
2487 2484          else
2488 2485                  *prev_zoneid = proc->zspr_zoneid;
2489 2486  
2490 2487          TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2491 2488          *delta = d;
2492 2489  
2493 2490          *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2494 2491              psinfo->pr_lwp.pr_pri);
2495 2492  
2496 2493          /* Update cached data for proc */
2497 2494          proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2498 2495          proc->zspr_zoneid = psinfo->pr_zoneid;
2499 2496          proc->zspr_sched = *sched;
2500 2497          proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2501 2498          proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2502 2499          proc->zspr_ppid = psinfo->pr_ppid;
2503 2500  }
2504 2501  
2505 2502  /*
2506 2503   * Reset the known cpu usage of a process. This is done after a process
2507 2504   * exits so that if the pid is recycled, data from its previous life is
2508 2505   * not reused
2509 2506   */
2510 2507  static void
2511 2508  zsd_flush_proc_info(zsd_proc_t *proc)
2512 2509  {
2513 2510          proc->zspr_usage.tv_sec = 0;
2514 2511          proc->zspr_usage.tv_nsec = 0;
2515 2512  }
2516 2513  
2517 2514  /*
2518 2515   * Open the current extended accounting file.  On initialization, open the
2519 2516   * file as the current file to be used.  Otherwise, open the file as the
2520 2517   * next file to use of the current file reaches EOF.
2521 2518   */
2522 2519  static int
2523 2520  zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2524 2521  {
2525 2522          int ret, oret, state, trys = 0, flags;
2526 2523          int *fd, *open;
2527 2524          ea_file_t *eaf;
2528 2525          struct stat64 *stat;
2529 2526          char path[MAXPATHLEN];
2530 2527  
2531 2528          /*
2532 2529           * The accounting file is first opened at the tail.  Following
2533 2530           * opens to new accounting files are opened at the head.
2534 2531           */
2535 2532          if (init == B_TRUE) {
2536 2533                  flags = EO_NO_VALID_HDR | EO_TAIL;
2537 2534                  fd = &ctl->zsctl_proc_fd;
2538 2535                  eaf = &ctl->zsctl_proc_eaf;
2539 2536                  stat = &ctl->zsctl_proc_stat;
2540 2537                  open = &ctl->zsctl_proc_open;
2541 2538          } else {
2542 2539                  flags = EO_NO_VALID_HDR | EO_HEAD;
2543 2540                  fd = &ctl->zsctl_proc_fd_next;
2544 2541                  eaf = &ctl->zsctl_proc_eaf_next;
2545 2542                  stat = &ctl->zsctl_proc_stat_next;
2546 2543                  open = &ctl->zsctl_proc_open_next;
2547 2544          }
2548 2545  
2549 2546          *fd = -1;
2550 2547          *open = 0;
2551 2548  retry:
2552 2549          /* open accounting files for cpu consumption */
2553 2550          ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2554 2551          if (ret != 0) {
2555 2552                  zsd_warn(gettext("Unable to get process accounting state"));
2556 2553                  goto err;
2557 2554          }
2558 2555          if (state != AC_ON) {
2559 2556                  if (trys > 0) {
2560 2557                          zsd_warn(gettext(
2561 2558                              "Unable to enable process accounting"));
2562 2559                          goto err;
2563 2560                  }
2564 2561                  (void) zsd_enable_cpu_stats();
2565 2562                  trys++;
2566 2563                  goto retry;
2567 2564          }
2568 2565  
2569 2566          ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2570 2567          if (ret != 0) {
2571 2568                  zsd_warn(gettext("Unable to get process accounting file"));
2572 2569                  goto err;
2573 2570          }
2574 2571  
2575 2572          if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2576 2573              (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2577 2574                  ret = fstat64(*fd, stat);
2578 2575  
2579 2576          if (*fd < 0 || oret < 0 || ret < 0) {
2580 2577                  struct timespec ts;
2581 2578  
2582 2579                  /*
2583 2580                   * It is possible the accounting file is momentarily unavailable
2584 2581                   * because it is being rolled.  Try for up to half a second.
2585 2582                   *
2586 2583                   * If failure to open accounting file persists, give up.
2587 2584                   */
2588 2585                  if (oret == 0)
2589 2586                          (void) ea_close(eaf);
2590 2587                  else if (*fd >= 0)
2591 2588                          (void) close(*fd);
2592 2589                  if (trys > 500) {
2593 2590                          zsd_warn(gettext(
2594 2591                              "Unable to open process accounting file"));
2595 2592                          goto err;
2596 2593                  }
2597 2594                  /* wait one millisecond */
2598 2595                  ts.tv_sec = 0;
2599 2596                  ts.tv_nsec = NANOSEC / 1000;
2600 2597                  (void) nanosleep(&ts, NULL);
2601 2598                  goto retry;
2602 2599          }
2603 2600          *open = 1;
2604 2601          return (0);
2605 2602  err:
2606 2603          if (*fd >= 0)
2607 2604                  (void) close(*fd);
2608 2605          *open = 0;
2609 2606          *fd = -1;
2610 2607          return (-1);
2611 2608  }
2612 2609  
2613 2610  /*
2614 2611   * Walk /proc and charge each process to its zone and processor set.
2615 2612   * Then read exacct data for exited processes, and charge them as well.
2616 2613   */
2617 2614  static void
2618 2615  zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2619 2616  {
2620 2617          DIR *dir;
2621 2618          struct dirent *dent;
2622 2619          psinfo_t psinfo;
2623 2620          int fd, ret;
2624 2621          zsd_proc_t *proc, *pproc, *tmp, *next;
2625 2622          list_t pplist, plist;
2626 2623          zsd_zone_t *zone, *prev_zone;
2627 2624          zsd_pset_t *pset, *prev_pset;
2628 2625          psetid_t psetid, prev_psetid;
2629 2626          zoneid_t zoneid, prev_zoneid;
2630 2627          zsd_pset_usage_t *usage, *prev_usage;
2631 2628          char path[MAXPATHLEN];
2632 2629  
2633 2630          ea_object_t object;
2634 2631          ea_object_t pobject;
2635 2632          boolean_t hrtime_expired = B_FALSE;
2636 2633          struct timeval interval_end;
2637 2634  
2638 2635          timestruc_t delta, d1, d2;
2639 2636          uint_t sched = 0;
2640 2637  
2641 2638          /*
2642 2639           * Get the current accounting file.  The current accounting file
2643 2640           * may be different than the file in use, as the accounting file
2644 2641           * may have been rolled, or manually changed by an admin.
2645 2642           */
2646 2643          ret = zsd_open_exacct(ctl, init);
2647 2644          if (ret != 0) {
2648 2645                  zsd_warn(gettext("Unable to track process accounting"));
2649 2646                  return;
2650 2647          }
2651 2648  
2652 2649          /*
2653 2650           * Mark the current time as the interval end time.  Don't track
2654 2651           * processes that exit after this time.
2655 2652           */
2656 2653          (void) gettimeofday(&interval_end, NULL);
2657 2654  
2658 2655          dir = opendir("/proc");
2659 2656          if (dir == NULL) {
2660 2657                  zsd_warn(gettext("Unable to open /proc"));
2661 2658                  return;
2662 2659          }
2663 2660  
2664 2661          dent = ctl->zsctl_procfs_dent;
2665 2662  
2666 2663          (void) memset(dent, 0, ctl->zsctl_procfs_dent_size);
2667 2664  
2668 2665          /* Walk all processes and compute each zone's usage on each pset. */
2669 2666          while (readdir_r(dir, dent) != 0) {
2670 2667  
2671 2668                  if (strcmp(dent->d_name, ".") == 0 ||
2672 2669                      strcmp(dent->d_name, "..") == 0)
2673 2670                          continue;
2674 2671  
2675 2672                  (void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2676 2673                      dent->d_name);
2677 2674  
2678 2675                  fd = open(path, O_RDONLY);
2679 2676                  if (fd < 0)
2680 2677                          continue;
2681 2678  
2682 2679                  if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2683 2680                          (void) close(fd);
2684 2681                          continue;
2685 2682                  }
2686 2683                  (void) close(fd);
2687 2684  
2688 2685                  zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2689 2686                      &zoneid, &prev_zoneid, &delta, &sched);
2690 2687  
2691 2688                  d1.tv_sec = delta.tv_sec / 2;
2692 2689                  d1.tv_nsec = delta.tv_nsec / 2;
2693 2690                  d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2694 2691                  d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2695 2692  
2696 2693                  /* Get the zone and pset this process is running in */
2697 2694                  zone = zsd_lookup_zone_byid(ctl, zoneid);
2698 2695                  if (zone == NULL)
2699 2696                          continue;
2700 2697                  pset = zsd_lookup_pset_byid(ctl, psetid);
2701 2698                  if (pset == NULL)
2702 2699                          continue;
2703 2700                  usage = zsd_lookup_insert_usage(ctl, pset, zone);
2704 2701                  if (usage == NULL)
2705 2702                          continue;
2706 2703  
2707 2704                  /*
2708 2705                   * Get the usage of the previous zone and pset if they were
2709 2706                   * different.
2710 2707                   */
2711 2708                  if (zoneid != prev_zoneid)
2712 2709                          prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2713 2710                  else
2714 2711                          prev_zone = NULL;
2715 2712  
2716 2713                  if (psetid != prev_psetid)
2717 2714                          prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2718 2715                  else
2719 2716                          prev_pset = NULL;
2720 2717  
2721 2718                  prev_usage = NULL;
2722 2719                  if (prev_zone != NULL || prev_pset != NULL) {
2723 2720                          if (prev_zone == NULL)
2724 2721                                  prev_zone = zone;
2725 2722                          if (prev_pset == NULL)
2726 2723                                  prev_pset = pset;
2727 2724  
2728 2725                          prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2729 2726                              prev_zone);
2730 2727                  }
2731 2728  
2732 2729                  /* Update the usage with the processes info */
2733 2730                  if (prev_usage == NULL) {
2734 2731                          zsd_mark_pset_usage_found(usage, sched);
2735 2732                  } else {
2736 2733                          zsd_mark_pset_usage_found(usage, sched);
2737 2734                          zsd_mark_pset_usage_found(prev_usage, sched);
2738 2735                  }
2739 2736  
2740 2737                  /*
2741 2738                   * First time around is just to get a starting point.  All
2742 2739                   * usages will be zero.
2743 2740                   */
2744 2741                  if (init == B_TRUE)
2745 2742                          continue;
2746 2743  
2747 2744                  if (prev_usage == NULL) {
2748 2745                          zsd_add_usage(ctl, usage, &delta);
2749 2746                  } else {
2750 2747                          zsd_add_usage(ctl, usage, &d1);
2751 2748                          zsd_add_usage(ctl, prev_usage, &d2);
2752 2749                  }
2753 2750          }
2754 2751          (void) closedir(dir);
2755 2752  
2756 2753          /*
2757 2754           * No need to collect exited proc data on initialization.  Just
2758 2755           * caching the usage of the known processes to get a zero starting
2759 2756           * point.
2760 2757           */
2761 2758          if (init == B_TRUE)
2762 2759                  return;
2763 2760  
2764 2761          /*
2765 2762           * Add accounting records to account for processes which have
2766 2763           * exited.
2767 2764           */
2768 2765          list_create(&plist, sizeof (zsd_proc_t),
2769 2766              offsetof(zsd_proc_t, zspr_next));
2770 2767          list_create(&pplist, sizeof (zsd_proc_t),
2771 2768              offsetof(zsd_proc_t, zspr_next));
2772 2769  
2773 2770          for (;;) {
2774 2771                  pid_t pid;
2775 2772                  pid_t ppid;
2776 2773                  timestruc_t user, sys, proc_usage;
2777 2774                  timestruc_t finish;
2778 2775                  int numfound = 0;
2779 2776  
2780 2777                  bzero(&object, sizeof (object));
2781 2778                  proc = NULL;
2782 2779                  zone = NULL;
2783 2780                  pset = NULL;
2784 2781                  usage = NULL;
2785 2782                  ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2786 2783                  if (ret == EO_ERROR) {
2787 2784                          if (ea_error() == EXR_EOF) {
2788 2785  
2789 2786                                  struct stat64 *stat;
2790 2787                                  struct stat64 *stat_next;
2791 2788  
2792 2789                                  /*
2793 2790                                   * See if the next accounting file is the
2794 2791                                   * same as the current accounting file.
2795 2792                                   */
2796 2793                                  stat = &(ctl->zsctl_proc_stat);
2797 2794                                  stat_next = &(ctl->zsctl_proc_stat_next);
2798 2795                                  if (stat->st_ino == stat_next->st_ino &&
2799 2796                                      stat->st_dev == stat_next->st_dev) {
2800 2797                                          /*
2801 2798                                           * End of current accounting file is
2802 2799                                           * reached, so finished.  Clear EOF
2803 2800                                           * bit for next time around.
2804 2801                                           */
2805 2802                                          ea_clear(&ctl->zsctl_proc_eaf);
2806 2803                                          break;
2807 2804                                  } else {
2808 2805                                          /*
2809 2806                                           * Accounting file has changed.  Move
2810 2807                                           * to current accounting file.
2811 2808                                           */
2812 2809                                          (void) ea_close(&ctl->zsctl_proc_eaf);
2813 2810  
2814 2811                                          ctl->zsctl_proc_fd =
2815 2812                                              ctl->zsctl_proc_fd_next;
2816 2813                                          ctl->zsctl_proc_eaf =
2817 2814                                              ctl->zsctl_proc_eaf_next;
2818 2815                                          ctl->zsctl_proc_stat =
2819 2816                                              ctl->zsctl_proc_stat_next;
2820 2817  
2821 2818                                          ctl->zsctl_proc_fd_next = -1;
2822 2819                                          ctl->zsctl_proc_open_next = 0;
2823 2820                                          continue;
2824 2821                                  }
2825 2822                          } else {
2826 2823                                  /*
2827 2824                                   * Other accounting error.  Give up on
2828 2825                                   * accounting.
2829 2826                                   */
2830 2827                                  goto ea_err;
2831 2828                          }
2832 2829                  }
2833 2830                  /* Skip if not a process group */
2834 2831                  if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2835 2832                      (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2836 2833                          (void) ea_free_item(&object, EUP_ALLOC);
2837 2834                          continue;
2838 2835                  }
2839 2836  
2840 2837                  /* The process group entry should be complete */
2841 2838                  while (numfound < 9) {
2842 2839                          bzero(&pobject, sizeof (pobject));
2843 2840                          ret = ea_get_object(&ctl->zsctl_proc_eaf,
2844 2841                              &pobject);
2845 2842                          if (ret < 0) {
2846 2843                                  (void) ea_free_item(&object, EUP_ALLOC);
2847 2844                                  zsd_warn(
2848 2845                                      "unable to get process accounting data");
2849 2846                                  goto ea_err;
2850 2847                          }
2851 2848                          /* Next entries should be process data */
2852 2849                          if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2853 2850                              EXT_GROUP) {
2854 2851                                  (void) ea_free_item(&object, EUP_ALLOC);
2855 2852                                  (void) ea_free_item(&pobject, EUP_ALLOC);
2856 2853                                  zsd_warn(
2857 2854                                      "process data of wrong type");
2858 2855                                  goto ea_err;
2859 2856                          }
2860 2857                          switch (pobject.eo_catalog & EXD_DATA_MASK) {
2861 2858                          case EXD_PROC_PID:
2862 2859                                  pid = pobject.eo_item.ei_uint32;
2863 2860                                  proc = &(ctl->zsctl_proc_array[pid]);
2864 2861                                  /*
2865 2862                                   * This process should not be currently in
2866 2863                                   * the list of processes to process.
2867 2864                                   */
2868 2865                                  assert(!list_link_active(&proc->zspr_next));
2869 2866                                  numfound++;
2870 2867                                  break;
2871 2868                          case EXD_PROC_ANCPID:
2872 2869                                  ppid = pobject.eo_item.ei_uint32;
2873 2870                                  pproc = &(ctl->zsctl_proc_array[ppid]);
2874 2871                                  numfound++;
2875 2872                                  break;
2876 2873                          case EXD_PROC_ZONENAME:
2877 2874                                  zone = zsd_lookup_zone(ctl,
2878 2875                                      pobject.eo_item.ei_string, -1);
2879 2876                                  numfound++;
2880 2877                                  break;
2881 2878                          case EXD_PROC_CPU_USER_SEC:
2882 2879                                  user.tv_sec =
2883 2880                                      pobject.eo_item.ei_uint64;
2884 2881                                  numfound++;
2885 2882                                  break;
2886 2883                          case EXD_PROC_CPU_USER_NSEC:
2887 2884                                  user.tv_nsec =
2888 2885                                      pobject.eo_item.ei_uint64;
2889 2886                                  numfound++;
2890 2887                                  break;
2891 2888                          case EXD_PROC_CPU_SYS_SEC:
2892 2889                                  sys.tv_sec =
2893 2890                                      pobject.eo_item.ei_uint64;
2894 2891                                  numfound++;
2895 2892                                  break;
2896 2893                          case EXD_PROC_CPU_SYS_NSEC:
2897 2894                                  sys.tv_nsec =
2898 2895                                      pobject.eo_item.ei_uint64;
2899 2896                                  numfound++;
2900 2897                                  break;
2901 2898                          case EXD_PROC_FINISH_SEC:
2902 2899                                  finish.tv_sec =
2903 2900                                      pobject.eo_item.ei_uint64;
2904 2901                                  numfound++;
2905 2902                                  break;
2906 2903                          case EXD_PROC_FINISH_NSEC:
2907 2904                                  finish.tv_nsec =
2908 2905                                      pobject.eo_item.ei_uint64;
2909 2906                                  numfound++;
2910 2907                                  break;
2911 2908                          }
2912 2909                          (void) ea_free_item(&pobject, EUP_ALLOC);
2913 2910                  }
2914 2911                  (void) ea_free_item(&object, EUP_ALLOC);
2915 2912                  if (numfound != 9) {
2916 2913                          zsd_warn(gettext(
2917 2914                              "Malformed process accounting entry found"));
2918 2915                          goto proc_done;
2919 2916                  }
2920 2917  
2921 2918                  if (finish.tv_sec > interval_end.tv_sec ||
2922 2919                      (finish.tv_sec == interval_end.tv_sec &&
2923 2920                      finish.tv_nsec > (interval_end.tv_usec * 1000)))
2924 2921                          hrtime_expired = B_TRUE;
2925 2922  
2926 2923                  /*
2927 2924                   * Try to identify the zone and pset to which this
2928 2925                   * exited process belongs.
2929 2926                   */
2930 2927                  if (zone == NULL)
2931 2928                          goto proc_done;
2932 2929  
2933 2930                  /* Save proc info */
2934 2931                  proc->zspr_ppid = ppid;
2935 2932                  proc->zspr_zoneid = zone->zsz_id;
2936 2933  
2937 2934                  prev_psetid = ZS_PSET_ERROR;
2938 2935                  sched = 0;
2939 2936  
2940 2937                  /*
2941 2938                   * The following tries to deduce the processes pset.
2942 2939                   *
2943 2940                   * First choose pset and sched using cached value from the
2944 2941                   * most recent time the process has been seen.
2945 2942                   *
2946 2943                   * pset and sched can change across zone_enter, so make sure
2947 2944                   * most recent sighting of this process was in the same
2948 2945                   * zone before using most recent known value.
2949 2946                   *
2950 2947                   * If there is no known value, use value of processes
2951 2948                   * parent.  If parent is unknown, walk parents until a known
2952 2949                   * parent is found.
2953 2950                   *
2954 2951                   * If no parent in the zone is found, use the zone's default
2955 2952                   * pset and scheduling class.
2956 2953                   */
2957 2954                  if (proc->zspr_psetid != ZS_PSET_ERROR) {
2958 2955                          prev_psetid = proc->zspr_psetid;
2959 2956                          pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2960 2957                          sched = proc->zspr_sched;
2961 2958                  } else if (pproc->zspr_zoneid == zone->zsz_id &&
2962 2959                      pproc->zspr_psetid != ZS_PSET_ERROR) {
2963 2960                          prev_psetid = pproc->zspr_psetid;
2964 2961                          pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2965 2962                          sched = pproc->zspr_sched;
2966 2963                  }
2967 2964  
2968 2965                  if (pset == NULL) {
2969 2966                          /*
2970 2967                           * Process or processes parent has never been seen.
2971 2968                           * Save to deduce a known parent later.
2972 2969                           */
2973 2970                          proc_usage = sys;
2974 2971                          TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2975 2972                          TIMESTRUC_DELTA(delta, proc_usage,
2976 2973                              proc->zspr_usage);
2977 2974                          proc->zspr_usage = delta;
2978 2975                          list_insert_tail(&plist, proc);
2979 2976                          continue;
2980 2977                  }
2981 2978  
2982 2979                  /* Add the zone's usage to the pset */
2983 2980                  usage = zsd_lookup_insert_usage(ctl, pset, zone);
2984 2981                  if (usage == NULL)
2985 2982                          goto proc_done;
2986 2983  
2987 2984                  zsd_mark_pset_usage_found(usage, sched);
2988 2985  
2989 2986                  /* compute the usage to add for the exited proc */
2990 2987                  proc_usage = sys;
2991 2988                  TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2992 2989                  TIMESTRUC_DELTA(delta, proc_usage,
2993 2990                      proc->zspr_usage);
2994 2991  
2995 2992                  zsd_add_usage(ctl, usage, &delta);
2996 2993  proc_done:
2997 2994                  zsd_flush_proc_info(proc);
2998 2995  
2999 2996                  if (hrtime_expired == B_TRUE)
3000 2997                          break;
3001 2998          }
3002 2999          /*
3003 3000           * close next accounting file.
3004 3001           */
3005 3002          if (ctl->zsctl_proc_open_next) {
3006 3003                  (void) ea_close(
3007 3004                      &ctl->zsctl_proc_eaf_next);
3008 3005                  ctl->zsctl_proc_open_next = 0;
3009 3006                  ctl->zsctl_proc_fd_next = -1;
3010 3007          }
3011 3008  
3012 3009          /* For the remaining processes, use pset and sched of a known parent */
3013 3010          proc = list_head(&plist);
3014 3011          while (proc != NULL) {
3015 3012                  next = proc;
3016 3013                  for (;;) {
3017 3014                          if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3018 3015                                  /*
3019 3016                                   * Kernel process, or parent is unknown, skip
3020 3017                                   * process, remove from process list.
3021 3018                                   */
3022 3019                                  tmp = proc;
3023 3020                                  proc = list_next(&plist, proc);
3024 3021                                  list_link_init(&tmp->zspr_next);
3025 3022                                  break;
3026 3023                          }
3027 3024                          pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3028 3025                          if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3029 3026                                  /*
3030 3027                                   * Parent in different zone.  Save process and
3031 3028                                   * use zone's default pset and sched below
3032 3029                                   */
3033 3030                                  tmp = proc;
3034 3031                                  proc = list_next(&plist, proc);
3035 3032                                  list_remove(&plist, tmp);
3036 3033                                  list_insert_tail(&pplist, tmp);
3037 3034                                  break;
3038 3035                          }
3039 3036                          /* Parent has unknown pset, Search parent's parent  */
3040 3037                          if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3041 3038                                  next = pproc;
3042 3039                                  continue;
3043 3040                          }
3044 3041                          /* Found parent with known pset.  Use its info */
3045 3042                          proc->zspr_psetid = pproc->zspr_psetid;
3046 3043                          proc->zspr_sched = pproc->zspr_sched;
3047 3044                          next->zspr_psetid = pproc->zspr_psetid;
3048 3045                          next->zspr_sched = pproc->zspr_sched;
3049 3046                          zone = zsd_lookup_zone_byid(ctl,
3050 3047                              proc->zspr_zoneid);
3051 3048                          if (zone == NULL) {
3052 3049                                  tmp = proc;
3053 3050                                  proc = list_next(&plist, proc);
3054 3051                                  list_remove(&plist, tmp);
3055 3052                                  list_link_init(&tmp->zspr_next);
3056 3053                                  break;
3057 3054                          }
3058 3055                          pset = zsd_lookup_pset_byid(ctl,
3059 3056                              proc->zspr_psetid);
3060 3057                          if (pset == NULL) {
3061 3058                                  tmp = proc;
3062 3059                                  proc = list_next(&plist, proc);
3063 3060                                  list_remove(&plist, tmp);
3064 3061                                  list_link_init(&tmp->zspr_next);
3065 3062                                  break;
3066 3063                          }
3067 3064                          /* Add the zone's usage to the pset */
3068 3065                          usage = zsd_lookup_insert_usage(ctl, pset, zone);
3069 3066                          if (usage == NULL) {
3070 3067                                  tmp = proc;
3071 3068                                  proc = list_next(&plist, proc);
3072 3069                                  list_remove(&plist, tmp);
3073 3070                                  list_link_init(&tmp->zspr_next);
3074 3071                                  break;
3075 3072                          }
3076 3073                          zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3077 3074                          zsd_add_usage(ctl, usage, &proc->zspr_usage);
3078 3075                          zsd_flush_proc_info(proc);
3079 3076                          tmp = proc;
3080 3077                          proc = list_next(&plist, proc);
3081 3078                          list_remove(&plist, tmp);
3082 3079                          list_link_init(&tmp->zspr_next);
3083 3080                          break;
3084 3081                  }
3085 3082          }
3086 3083          /*
3087 3084           * Process has never been seen.  Using zone info to
3088 3085           * determine pset and scheduling class.
3089 3086           */
3090 3087          proc = list_head(&pplist);
3091 3088          while (proc != NULL) {
3092 3089  
3093 3090                  zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3094 3091                  if (zone == NULL)
3095 3092                          goto next;
3096 3093                  if (zone->zsz_psetid != ZS_PSET_ERROR &&
3097 3094                      zone->zsz_psetid != ZS_PSET_MULTI) {
3098 3095                          prev_psetid = zone->zsz_psetid;
3099 3096                          pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3100 3097                  } else {
3101 3098                          pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3102 3099                          if (pset != NULL)
3103 3100                                  prev_psetid = pset->zsp_id;
3104 3101                  }
3105 3102                  if (pset == NULL)
3106 3103                          goto next;
3107 3104  
3108 3105                  sched = zone->zsz_scheds;
3109 3106                  /*
3110 3107                   * Ignore FX high scheduling class if it is not the
3111 3108                   * only scheduling class in the zone.
3112 3109                   */
3113 3110                  if (sched != ZS_SCHED_FX_60)
3114 3111                          sched &= (~ZS_SCHED_FX_60);
3115 3112                  /*
3116 3113                   * If more than one scheduling class has been found
3117 3114                   * in the zone, use zone's default scheduling class for
3118 3115                   * this process.
3119 3116                   */
3120 3117                  if ((sched & (sched - 1)) != 0)
3121 3118                          sched = zone->zsz_default_sched;
3122 3119  
3123 3120                  /* Add the zone's usage to the pset */
3124 3121                  usage = zsd_lookup_insert_usage(ctl, pset, zone);
3125 3122                  if (usage == NULL)
3126 3123                          goto next;
3127 3124  
3128 3125                  zsd_mark_pset_usage_found(usage, sched);
3129 3126                  zsd_add_usage(ctl, usage, &proc->zspr_usage);
3130 3127  next:
3131 3128                  tmp = proc;
3132 3129                  proc = list_next(&pplist, proc);
3133 3130                  zsd_flush_proc_info(tmp);
3134 3131                  list_link_init(&tmp->zspr_next);
3135 3132          }
3136 3133          return;
3137 3134  ea_err:
3138 3135          /*
3139 3136           * Close the next accounting file if we have not transitioned to it
3140 3137           * yet.
3141 3138           */
3142 3139          if (ctl->zsctl_proc_open_next) {
3143 3140                  (void) ea_close(&ctl->zsctl_proc_eaf_next);
3144 3141                  ctl->zsctl_proc_open_next = 0;
3145 3142                  ctl->zsctl_proc_fd_next = -1;
3146 3143          }
3147 3144  }
3148 3145  
3149 3146  /*
3150 3147   * getvmusage(2) uses size_t's in the passwd data structure, which differ
3151 3148   * in size for 32bit and 64 bit kernels.  Since this is a contracted interface,
3152 3149   * and zonestatd does not necessarily match the kernel's bitness, marshal
3153 3150   * results appropriately.
3154 3151   */
3155 3152  static int
3156 3153  zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3157 3154      uint64_t *nres)
3158 3155  {
3159 3156          zsd_vmusage32_t *vmu32;
3160 3157          zsd_vmusage64_t *vmu64;
3161 3158          uint32_t nres32;
3162 3159          int i;
3163 3160          int ret;
3164 3161  
3165 3162          if (ctl->zsctl_kern_bits == 32)  {
3166 3163                  nres32 = *nres;
3167 3164                  ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3168 3165                      flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3169 3166                  *nres = nres32;
3170 3167                  if (ret == 0 && buf != NULL) {
3171 3168                          /*
3172 3169                           * An array of vmusage32_t's has been returned.
3173 3170                           * Convert it to an array of vmusage64_t's.
3174 3171                           */
3175 3172                          vmu32 = (zsd_vmusage32_t *)buf;
3176 3173                          vmu64 = (zsd_vmusage64_t *)buf;
3177 3174                          for (i = nres32 - 1; i >= 0; i--) {
3178 3175  
3179 3176                                  vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3180 3177                                  vmu64[i].vmu_type = vmu32[i].vmu_type;
3181 3178                                  vmu64[i].vmu_type = vmu32[i].vmu_type;
3182 3179                                  vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3183 3180                                  vmu64[i].vmu_rss_private =
3184 3181                                      vmu32[i].vmu_rss_private;
3185 3182                                  vmu64[i].vmu_rss_shared =
3186 3183                                      vmu32[i].vmu_rss_shared;
3187 3184                                  vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3188 3185                                  vmu64[i].vmu_swap_private =
3189 3186                                      vmu32[i].vmu_swap_private;
3190 3187                                  vmu64[i].vmu_swap_shared =
3191 3188                                      vmu32[i].vmu_swap_shared;
3192 3189                          }
3193 3190                  }
3194 3191                  return (ret);
3195 3192          } else {
3196 3193                  /*
3197 3194                   * kernel is 64 bit, so use 64 bit structures as zonestat
3198 3195                   * expects.
3199 3196                   */
3200 3197                  return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3201 3198                      flags, age, (uintptr_t)buf, (uintptr_t)nres));
3202 3199  
3203 3200          }
3204 3201  }
3205 3202  
3206 3203  /*
3207 3204   * Update the current physical, virtual, and locked memory usage of the
3208 3205   * running zones.
3209 3206   */
3210 3207  static void
3211 3208  zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3212 3209  {
3213 3210  
3214 3211          uint64_t phys_total;
3215 3212          uint64_t phys_used;
3216 3213          uint64_t phys_zones;
3217 3214          uint64_t phys_zones_overcount;
3218 3215          uint64_t phys_zones_extra;
3219 3216          uint64_t phys_zones_credit;
3220 3217  
3221 3218          uint64_t vm_free;
3222 3219          uint64_t vm_used;
3223 3220  
3224 3221          uint64_t disk_swap_total;
3225 3222          uint64_t disk_swap_used;        /* disk swap with contents */
3226 3223  
3227 3224          uint64_t physmem;
3228 3225          uint64_t pp_kernel;
3229 3226          uint64_t arc_size = 0;
3230 3227          struct anoninfo ani;
3231 3228  
3232 3229          int num_swap_devices;
3233 3230          struct swaptable *swt;
3234 3231          struct swapent *swent;
3235 3232          size_t swt_size;
3236 3233          char *path;
3237 3234  
3238 3235          zsd_vmusage64_t *vmusage;
3239 3236          uint64_t num_vmusage;
3240 3237  
3241 3238          int i, ret;
3242 3239  
3243 3240          zsd_system_t *sys;
3244 3241          zsd_zone_t *zone;
3245 3242          int vmu_nzones;
3246 3243  
3247 3244          kstat_t *kstat;
3248 3245          char kstat_name[KSTAT_STRLEN];
3249 3246          kstat_named_t *knp;
3250 3247          kid_t kid;
3251 3248  
3252 3249          if (init)
3253 3250                  return;
3254 3251  
3255 3252          sys = ctl->zsctl_system;
3256 3253  
3257 3254          /* interrogate swap devices to find the amount of disk swap */
3258 3255  disk_swap_again:
3259 3256          num_swap_devices = swapctl(SC_GETNSWP, NULL);
3260 3257  
3261 3258          if (num_swap_devices == 0) {
3262 3259                  sys->zss_swap_total = disk_swap_total = 0;
3263 3260                  sys->zss_swap_used = disk_swap_used = 0;
3264 3261                  /* No disk swap */
3265 3262                  goto disk_swap_done;
3266 3263          }
3267 3264          /* see if swap table needs to be larger */
3268 3265          if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3269 3266                  swt_size = sizeof (int) +
3270 3267                      (num_swap_devices * sizeof (struct swapent)) +
3271 3268                      (num_swap_devices * MAXPATHLEN);
3272 3269                  if (ctl->zsctl_swap_cache != NULL)
3273 3270                          free(ctl->zsctl_swap_cache);
3274 3271  
3275 3272                  swt = (struct swaptable *)malloc(swt_size);
3276 3273                  if (swt == NULL) {
3277 3274                          /*
3278 3275                           * Could not allocate to get list of swap devices.
3279 3276                           * Just use data from the most recent read, which will
3280 3277                           * be zero if this is the first read.
3281 3278                           */
3282 3279                          zsd_warn(gettext("Unable to allocate to determine "
3283 3280                              "virtual memory"));
3284 3281                          disk_swap_total = sys->zss_swap_total;
3285 3282                          disk_swap_used = sys->zss_swap_used;
3286 3283                          goto disk_swap_done;
3287 3284                  }
3288 3285                  swent = swt->swt_ent;
3289 3286                  path = (char *)swt + (sizeof (int) +
3290 3287                      num_swap_devices * sizeof (swapent_t));
3291 3288                  for (i = 0; i < num_swap_devices; i++, swent++) {
3292 3289                          swent->ste_path = path;
3293 3290                          path += MAXPATHLEN;
3294 3291                  }
3295 3292                  swt->swt_n = num_swap_devices;
3296 3293                  ctl->zsctl_swap_cache = swt;
3297 3294                  ctl->zsctl_swap_cache_size = swt_size;
3298 3295                  ctl->zsctl_swap_cache_num = num_swap_devices;
3299 3296          }
3300 3297          num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3301 3298          if (num_swap_devices < 0) {
3302 3299                  /* More swap devices have arrived */
3303 3300                  if (errno == ENOMEM)
3304 3301                          goto disk_swap_again;
3305 3302  
3306 3303                  zsd_warn(gettext("Unable to determine disk swap devices"));
3307 3304                  /* Unexpected error.  Use existing data */
3308 3305                  disk_swap_total = sys->zss_swap_total;
3309 3306                  disk_swap_used = sys->zss_swap_used;
3310 3307                  goto disk_swap_done;
3311 3308          }
3312 3309  
3313 3310          /* add up the disk swap */
3314 3311          disk_swap_total = 0;
3315 3312          disk_swap_used = 0;
3316 3313          swent = ctl->zsctl_swap_cache->swt_ent;
3317 3314          for (i = 0; i < num_swap_devices; i++, swent++) {
3318 3315                  disk_swap_total += swent->ste_pages;
3319 3316                  disk_swap_used += (swent->ste_pages - swent->ste_free);
3320 3317          }
3321 3318          disk_swap_total *= ctl->zsctl_pagesize;
3322 3319          disk_swap_used *= ctl->zsctl_pagesize;
3323 3320  
3324 3321          sys->zss_swap_total = disk_swap_total;
3325 3322          sys->zss_swap_used = disk_swap_used;
3326 3323  
3327 3324  disk_swap_done:
3328 3325  
3329 3326          /* get system pages kstat */
3330 3327          kid = -1;
3331 3328          kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3332 3329          if (kstat == NULL)
3333 3330                  zsd_warn(gettext("Unable to lookup system pages kstat"));
3334 3331          else
3335 3332                  kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3336 3333  
3337 3334          if (kid == -1) {
3338 3335                  zsd_warn(gettext("Unable to read system pages kstat"));
3339 3336                  return;
3340 3337          } else {
3341 3338                  knp = kstat_data_lookup(kstat, "physmem");
3342 3339                  if (knp == NULL) {
3343 3340                          zsd_warn(gettext("Unable to read physmem"));
3344 3341                  } else {
3345 3342                          if (knp->data_type == KSTAT_DATA_UINT64)
3346 3343                                  physmem = knp->value.ui64;
3347 3344                          else if (knp->data_type == KSTAT_DATA_UINT32)
3348 3345                                  physmem = knp->value.ui32;
3349 3346                          else
3350 3347                                  return;
3351 3348                  }
3352 3349                  knp = kstat_data_lookup(kstat, "pp_kernel");
3353 3350                  if (knp == NULL) {
3354 3351                          zsd_warn(gettext("Unable to read pp_kernel"));
3355 3352                  } else {
3356 3353                          if (knp->data_type == KSTAT_DATA_UINT64)
3357 3354                                  pp_kernel = knp->value.ui64;
3358 3355                          else if (knp->data_type == KSTAT_DATA_UINT32)
3359 3356                                  pp_kernel = knp->value.ui32;
3360 3357                          else
3361 3358                                  return;
3362 3359                  }
3363 3360          }
3364 3361          physmem *= ctl->zsctl_pagesize;
3365 3362          pp_kernel *= ctl->zsctl_pagesize;
3366 3363  
3367 3364          /* get the zfs arc size if available */
3368 3365          arc_size = 0;
3369 3366          kid = -1;
3370 3367          kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3371 3368          if (kstat != NULL)
3372 3369                  kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3373 3370          if (kid != -1) {
3374 3371                  knp = kstat_data_lookup(kstat, "size");
3375 3372                  if (knp != NULL)
3376 3373                          if (knp->data_type == KSTAT_DATA_UINT64)
3377 3374                                  arc_size = knp->value.ui64;
3378 3375          }
3379 3376  
3380 3377          /* Try to get swap information */
3381 3378          if (swapctl(SC_AINFO, &ani) < 0) {
3382 3379                  zsd_warn(gettext("Unable to get swap info"));
3383 3380                  return;
3384 3381          }
3385 3382  
3386 3383  vmusage_again:
3387 3384          /* getvmusage to get physical memory usage */
3388 3385          vmusage = ctl->zsctl_vmusage_cache;
3389 3386          num_vmusage = ctl->zsctl_vmusage_cache_num;
3390 3387  
3391 3388          ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3392 3389              vmusage, &num_vmusage);
3393 3390  
3394 3391          if (ret != 0) {
3395 3392                  /* Unexpected error.  Use existing data */
3396 3393                  if (errno != EOVERFLOW) {
3397 3394                          zsd_warn(gettext(
3398 3395                              "Unable to read physical memory usage"));
3399 3396                          phys_zones = sys->zss_ram_zones;
3400 3397                          goto vmusage_done;
3401 3398                  }
3402 3399          }
3403 3400          /* vmusage results cache too small */
3404 3401          if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3405 3402  
3406 3403                  size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3407 3404  
3408 3405                  if (ctl->zsctl_vmusage_cache != NULL)
3409 3406                          free(ctl->zsctl_vmusage_cache);
3410 3407                  vmusage = (zsd_vmusage64_t *)malloc(size);
3411 3408                  if (vmusage == NULL) {
3412 3409                          zsd_warn(gettext("Unable to alloc to determine "
3413 3410                              "physical memory usage"));
3414 3411                          phys_zones = sys->zss_ram_zones;
3415 3412                          goto vmusage_done;
3416 3413                  }
3417 3414                  ctl->zsctl_vmusage_cache = vmusage;
3418 3415                  ctl->zsctl_vmusage_cache_num = num_vmusage;
3419 3416                  goto vmusage_again;
3420 3417          }
3421 3418  
3422 3419          phys_zones_overcount = 0;
3423 3420          vmu_nzones = 0;
3424 3421          for (i = 0; i < num_vmusage; i++) {
3425 3422                  switch (vmusage[i].vmu_type) {
3426 3423                  case VMUSAGE_SYSTEM:
3427 3424                          /* total pages backing user process mappings */
3428 3425                          phys_zones = sys->zss_ram_zones =
3429 3426                              vmusage[i].vmu_rss_all;
3430 3427                          break;
3431 3428                  case VMUSAGE_ZONE:
3432 3429                          vmu_nzones++;
3433 3430                          phys_zones_overcount += vmusage[i].vmu_rss_all;
3434 3431                          zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3435 3432                          if (zone != NULL)
3436 3433                                  zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3437 3434                          break;
3438 3435                  default:
3439 3436                          break;
3440 3437                  }
3441 3438          }
3442 3439          /*
3443 3440           * Figure how much memory was double counted due to text sharing
3444 3441           * between zones.  Credit this back so that the sum of the zones
3445 3442           * equals the total zone ram usage;
3446 3443           */
3447 3444          phys_zones_extra = phys_zones_overcount - phys_zones;
3448 3445          phys_zones_credit = phys_zones_extra / vmu_nzones;
3449 3446  
3450 3447  vmusage_done:
3451 3448  
3452 3449          /* walk the zones to get swap and locked kstats.  Fetch ram cap. */
3453 3450          sys->zss_locked_zones = 0;
3454 3451          sys->zss_vm_zones = 0;
3455 3452          for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3456 3453              zone = list_next(&ctl->zsctl_zones, zone)) {
3457 3454  
3458 3455                  /* If zone halted during interval, show memory usage as none */
3459 3456                  if (zone->zsz_active == B_FALSE ||
3460 3457                      zone->zsz_deleted == B_TRUE) {
3461 3458                          zone->zsz_usage_ram = 0;
3462 3459                          zone->zsz_usage_vm = 0;
3463 3460                          zone->zsz_usage_locked = 0;
3464 3461                          continue;
3465 3462                  }
3466 3463  
3467 3464                  if (phys_zones_credit > 0) {
3468 3465                          if (zone->zsz_usage_ram > phys_zones_credit) {
3469 3466                                  zone->zsz_usage_ram -= phys_zones_credit;
3470 3467                          }
3471 3468                  }
3472 3469                  /*
3473 3470                   * Get zone's swap usage.  Since zone could have halted,
3474 3471                   * treats as zero if cannot read
3475 3472                   */
3476 3473                  zone->zsz_usage_vm = 0;
3477 3474                  (void) snprintf(kstat_name, sizeof (kstat_name),
3478 3475                      "swapresv_zone_%d", zone->zsz_id);
3479 3476                  kid = -1;
3480 3477                  kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3481 3478                      zone->zsz_id, kstat_name);
3482 3479                  if (kstat != NULL)
3483 3480                          kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3484 3481                  if (kid != -1) {
3485 3482                          knp = kstat_data_lookup(kstat, "usage");
3486 3483                          if (knp != NULL &&
3487 3484                              knp->data_type == KSTAT_DATA_UINT64) {
3488 3485                                  zone->zsz_usage_vm = knp->value.ui64;
3489 3486                                  sys->zss_vm_zones += knp->value.ui64;
3490 3487                          }
3491 3488                  }
3492 3489                  /*
3493 3490                   * Get zone's locked usage.  Since zone could have halted,
3494 3491                   * treats as zero if cannot read
3495 3492                   */
3496 3493                  zone->zsz_usage_locked = 0;
3497 3494                  (void) snprintf(kstat_name, sizeof (kstat_name),
3498 3495                      "lockedmem_zone_%d", zone->zsz_id);
3499 3496                  kid = -1;
3500 3497                  kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3501 3498                      zone->zsz_id, kstat_name);
3502 3499                  if (kstat != NULL)
3503 3500                          kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3504 3501                  if (kid != -1) {
3505 3502                          knp = kstat_data_lookup(kstat, "usage");
3506 3503                          if (knp != NULL &&
3507 3504                              knp->data_type == KSTAT_DATA_UINT64) {
3508 3505                                  zone->zsz_usage_locked = knp->value.ui64;
3509 3506                                  /*
3510 3507                                   * Since locked memory accounting for zones
3511 3508                                   * can double count ddi locked memory, cap each
3512 3509                                   * zone's locked usage at its ram usage.
3513 3510                                   */
3514 3511                                  if (zone->zsz_usage_locked >
3515 3512                                      zone->zsz_usage_ram)
3516 3513                                          zone->zsz_usage_locked =
3517 3514                                              zone->zsz_usage_ram;
3518 3515                                  sys->zss_locked_zones +=
3519 3516                                      zone->zsz_usage_locked;
3520 3517                          }
3521 3518                  }
3522 3519          }
3523 3520  
3524 3521          phys_total =
3525 3522              sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3526 3523  
3527 3524          phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3528 3525              * ctl->zsctl_pagesize;
3529 3526  
3530 3527          /* Compute remaining statistics */
3531 3528          sys->zss_ram_total = phys_total;
3532 3529          sys->zss_ram_zones = phys_zones;
3533 3530          sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3534 3531  
3535 3532          /*
3536 3533           * The total for kernel locked memory should include
3537 3534           * segkp locked pages, but oh well.  The arc size is subtracted,
3538 3535           * as that physical memory is reclaimable.
3539 3536           */
3540 3537          sys->zss_locked_kern = pp_kernel - arc_size;
3541 3538          /* Add memory used by kernel startup and obp to kernel locked */
3542 3539          if ((phys_total - physmem) > 0)
3543 3540                  sys->zss_locked_kern += phys_total - physmem;
3544 3541  
3545 3542          /*
3546 3543           * Add in the portion of (RAM+DISK) that is not available as swap,
3547 3544           * and consider it swap used by the kernel.
3548 3545           */
3549 3546          sys->zss_vm_total = phys_total + disk_swap_total;
3550 3547          vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3551 3548          vm_used = sys->zss_vm_total - vm_free;
3552 3549          sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3553 3550  }
3554 3551  
3555 3552  /*
3556 3553   * Charge each cpu's usage to its processor sets.  Also add the cpu's total
3557 3554   * time to each zone using the processor set.  This tracks the maximum
3558 3555   * amount of cpu time that a zone could have used.
3559 3556   */
3560 3557  static void
3561 3558  zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
3562 3559  {
3563 3560          zsd_system_t *sys;
3564 3561          zsd_zone_t *zone;
3565 3562          zsd_pset_usage_t *usage;
3566 3563          zsd_cpu_t *cpu;
3567 3564          zsd_cpu_t *cpu_next;
3568 3565          zsd_pset_t *pset;
3569 3566          timestruc_t ts;
3570 3567          uint64_t hrtime;
3571 3568          timestruc_t delta;
3572 3569  
3573 3570          /* Update the per-cpu kstat data */
3574 3571          cpu_next = list_head(&ctl->zsctl_cpus);
3575 3572          while (cpu_next != NULL) {
3576 3573                  cpu = cpu_next;
3577 3574                  cpu_next = list_next(&ctl->zsctl_cpus, cpu);
3578 3575                  zsd_update_cpu_stats(ctl, cpu);
3579 3576          }
3580 3577          /* Update the elapsed real time */
3581 3578          hrtime = gethrtime();
3582 3579          if (init) {
3583 3580                  /* first time around, store hrtime for future comparision */
3584 3581                  ctl->zsctl_hrtime = hrtime;
3585 3582                  ctl->zsctl_hrtime_prev = hrtime;
3586 3583  
3587 3584          } else {
3588 3585                  /* Compute increase in hrtime since the most recent read */
3589 3586                  ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
3590 3587                  ctl->zsctl_hrtime = hrtime;
3591 3588                  if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
3592 3589                          TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
3593 3590          }
3594 3591  
3595 3592          /* On initialization, all psets have zero time  */
3596 3593          if (init)
3597 3594                  return;
3598 3595  
3599 3596          for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
3600 3597              pset = list_next(&ctl->zsctl_psets, pset)) {
3601 3598  
3602 3599                  if (pset->zsp_active == B_FALSE) {
3603 3600                          zsd_warn(gettext("Internal error,inactive pset found"));
3604 3601                          continue;
3605 3602                  }
3606 3603  
3607 3604                  /* sum total used time for pset */
3608 3605                  ts.tv_sec = 0;
3609 3606                  ts.tv_nsec = 0;
3610 3607                  TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
3611 3608                  TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
3612 3609                  TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
3613 3610                  /* kernel time in pset is total time minus zone time */
3614 3611                  TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
3615 3612                      pset->zsp_usage_zones);
3616 3613                  if (pset->zsp_usage_kern.tv_sec < 0 ||
3617 3614                      pset->zsp_usage_kern.tv_nsec < 0) {
3618 3615                          pset->zsp_usage_kern.tv_sec = 0;
3619 3616                          pset->zsp_usage_kern.tv_nsec = 0;
3620 3617                  }
3621 3618                  /* Total pset elapsed time is used time plus idle time */
3622 3619                  TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);
3623 3620  
3624 3621                  TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);
3625 3622  
3626 3623                  for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
3627 3624                      usage = list_next(&pset->zsp_usage_list, usage)) {
3628 3625  
3629 3626                          zone = usage->zsu_zone;
3630 3627                          if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
3631 3628                              usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
3632 3629                              usage->zsu_cpu_shares != 0) {
3633 3630                                  /*
3634 3631                                   * Figure out how many nanoseconds of share time
3635 3632                                   * to give to the zone
3636 3633                                   */
3637 3634                                  hrtime = delta.tv_sec;
3638 3635                                  hrtime *= NANOSEC;
3639 3636                                  hrtime += delta.tv_nsec;
3640 3637                                  hrtime *= usage->zsu_cpu_shares;
3641 3638                                  hrtime /= pset->zsp_cpu_shares;
3642 3639                                  TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
3643 3640                                      hrtime);
3644 3641                          }
3645 3642                          /* Add pset time to each zone using pset */
3646 3643                          TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);
3647 3644  
3648 3645                          zone->zsz_cpus_online += pset->zsp_online;
3649 3646                  }
3650 3647                  pset->zsp_total_time = ts;
3651 3648          }
3652 3649  
3653 3650          for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3654 3651              zone = list_next(&ctl->zsctl_zones, zone)) {
3655 3652  
3656 3653                  /* update cpu cap tracking if the zone has a cpu cap */
3657 3654                  if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
3658 3655                          uint64_t elapsed;
3659 3656  
3660 3657                          elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
3661 3658                          elapsed *= zone->zsz_cpu_cap;
3662 3659                          elapsed = elapsed / 100;
3663 3660                          TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
3664 3661                  }
3665 3662          }
3666 3663          sys = ctl->zsctl_system;
3667 3664          ts.tv_sec = 0;
3668 3665          ts.tv_nsec = 0;
3669 3666          TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
3670 3667          TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
3671 3668          TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);
3672 3669  
3673 3670          /* kernel time in pset is total time minus zone time */
3674 3671          TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
3675 3672              sys->zss_cpu_usage_zones);
3676 3673          if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
3677 3674              sys->zss_cpu_usage_kern.tv_nsec < 0) {
3678 3675                  sys->zss_cpu_usage_kern.tv_sec = 0;
3679 3676                  sys->zss_cpu_usage_kern.tv_nsec = 0;
3680 3677          }
3681 3678          /* Total pset elapsed time is used time plus idle time */
3682 3679          TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
3683 3680          sys->zss_cpu_total_time = ts;
3684 3681  }
3685 3682  
3686 3683  /*
3687 3684   * Saves current usage data to a cache that is read by libzonestat when
3688 3685   * calling zs_usage_read().
3689 3686   *
3690 3687   * All pointers in the cached data structure are set to NULL.  When
3691 3688   * libzonestat reads the cached data, it will set the pointers relative to
3692 3689   * its address space.
3693 3690   */
3694 3691  static void
3695 3692  zsd_usage_cache_update(zsd_ctl_t *ctl)
3696 3693  {
3697 3694          zs_usage_cache_t *cache;
3698 3695          zs_usage_cache_t *old;
3699 3696          zs_usage_t *usage;
3700 3697  
3701 3698          zs_system_t *sys;
3702 3699          zsd_system_t *dsys;
3703 3700          zs_zone_t *zone = NULL;
3704 3701          zsd_zone_t *dzone;
3705 3702          zs_pset_t *pset = NULL;
3706 3703          zsd_pset_t *dpset;
3707 3704          zs_pset_zone_t *pusage;
3708 3705          zsd_pset_usage_t *dpusage;
3709 3706  
3710 3707          char *next;
3711 3708          uint_t size, i, j;
3712 3709  
3713 3710          size =
3714 3711              sizeof (zs_usage_cache_t) +
3715 3712              sizeof (zs_usage_t) +
3716 3713              sizeof (zs_system_t) +
3717 3714              sizeof (zs_zone_t) * ctl->zsctl_nzones +
3718 3715              sizeof (zs_pset_t) *  ctl->zsctl_npsets +
3719 3716              sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3720 3717  
3721 3718          cache = (zs_usage_cache_t *)malloc(size);
3722 3719          if (cache == NULL) {
3723 3720                  zsd_warn(gettext("Unable to allocate usage cache\n"));
3724 3721                  return;
3725 3722          }
3726 3723  
3727 3724          next = (char *)cache;
3728 3725          cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3729 3726          next += sizeof (zs_usage_cache_t);
3730 3727  
3731 3728          /* LINTED */
3732 3729          usage = cache->zsuc_usage = (zs_usage_t *)next;
3733 3730          next += sizeof (zs_usage_t);
3734 3731          usage->zsu_start = g_start;
3735 3732          usage->zsu_hrstart = g_hrstart;
3736 3733          usage->zsu_time = g_now;
3737 3734          usage->zsu_hrtime = g_hrnow;
3738 3735          usage->zsu_nzones = ctl->zsctl_nzones;
3739 3736          usage->zsu_npsets = ctl->zsctl_npsets;
3740 3737          usage->zsu_system = NULL;
3741 3738  
3742 3739          /* LINTED */
3743 3740          sys = (zs_system_t *)next;
3744 3741          next += sizeof (zs_system_t);
3745 3742          dsys = ctl->zsctl_system;
3746 3743          sys->zss_ram_total = dsys->zss_ram_total;
3747 3744          sys->zss_ram_kern = dsys->zss_ram_kern;
3748 3745          sys->zss_ram_zones = dsys->zss_ram_zones;
3749 3746          sys->zss_locked_kern = dsys->zss_locked_kern;
3750 3747          sys->zss_locked_zones = dsys->zss_locked_zones;
3751 3748          sys->zss_vm_total = dsys->zss_vm_total;
3752 3749          sys->zss_vm_kern = dsys->zss_vm_kern;
3753 3750          sys->zss_vm_zones = dsys->zss_vm_zones;
3754 3751          sys->zss_swap_total = dsys->zss_swap_total;
3755 3752          sys->zss_swap_used = dsys->zss_swap_used;
3756 3753          sys->zss_ncpus = dsys->zss_ncpus;
3757 3754          sys->zss_ncpus_online = dsys->zss_ncpus_online;
3758 3755  
3759 3756          sys->zss_processes_max = dsys->zss_maxpid;
3760 3757          sys->zss_lwps_max = dsys->zss_lwps_max;
3761 3758          sys->zss_shm_max = dsys->zss_shm_max;
3762 3759          sys->zss_shmids_max = dsys->zss_shmids_max;
3763 3760          sys->zss_semids_max = dsys->zss_semids_max;
3764 3761          sys->zss_msgids_max = dsys->zss_msgids_max;
3765 3762          sys->zss_lofi_max = dsys->zss_lofi_max;
3766 3763  
3767 3764          sys->zss_processes = dsys->zss_processes;
3768 3765          sys->zss_lwps = dsys->zss_lwps;
3769 3766          sys->zss_shm = dsys->zss_shm;
3770 3767          sys->zss_shmids = dsys->zss_shmids;
3771 3768          sys->zss_semids = dsys->zss_semids;
3772 3769          sys->zss_msgids = dsys->zss_msgids;
3773 3770          sys->zss_lofi = dsys->zss_lofi;
3774 3771  
3775 3772          sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3776 3773          sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3777 3774          sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
3778 3775  
3779 3776          for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3780 3777              i < ctl->zsctl_nzones;
3781 3778              i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3782 3779                  /* LINTED */
3783 3780                  zone = (zs_zone_t *)next;
3784 3781                  next += sizeof (zs_zone_t);
3785 3782                  list_link_init(&zone->zsz_next);
3786 3783                  zone->zsz_system = NULL;
3787 3784  
3788 3785                  (void) strlcpy(zone->zsz_name, dzone->zsz_name,
3789 3786                      sizeof (zone->zsz_name));
3790 3787                  (void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3791 3788                      sizeof (zone->zsz_pool));
3792 3789                  (void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3793 3790                      sizeof (zone->zsz_pset));
3794 3791                  zone->zsz_id = dzone->zsz_id;
3795 3792                  zone->zsz_cputype = dzone->zsz_cputype;
3796 3793                  zone->zsz_iptype = dzone->zsz_iptype;
3797 3794                  zone->zsz_start = dzone->zsz_start;
3798 3795                  zone->zsz_hrstart = dzone->zsz_hrstart;
3799 3796                  zone->zsz_scheds = dzone->zsz_scheds;
3800 3797                  zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3801 3798                  zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3802 3799                  zone->zsz_ram_cap = dzone->zsz_ram_cap;
3803 3800                  zone->zsz_vm_cap = dzone->zsz_vm_cap;
3804 3801                  zone->zsz_locked_cap = dzone->zsz_locked_cap;
3805 3802                  zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3806 3803                  zone->zsz_cpus_online = dzone->zsz_cpus_online;
3807 3804                  zone->zsz_pset_time = dzone->zsz_pset_time;
3808 3805                  zone->zsz_cap_time = dzone->zsz_cap_time;
3809 3806                  zone->zsz_share_time = dzone->zsz_share_time;
3810 3807                  zone->zsz_usage_ram = dzone->zsz_usage_ram;
3811 3808                  zone->zsz_usage_locked = dzone->zsz_usage_locked;
3812 3809                  zone->zsz_usage_vm = dzone->zsz_usage_vm;
3813 3810  
3814 3811                  zone->zsz_processes_cap = dzone->zsz_processes_cap;
3815 3812                  zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3816 3813                  zone->zsz_shm_cap = dzone->zsz_shm_cap;
3817 3814                  zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3818 3815                  zone->zsz_semids_cap = dzone->zsz_semids_cap;
3819 3816                  zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3820 3817                  zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3821 3818  
3822 3819                  zone->zsz_processes = dzone->zsz_processes;
3823 3820                  zone->zsz_lwps = dzone->zsz_lwps;
3824 3821                  zone->zsz_shm = dzone->zsz_shm;
3825 3822                  zone->zsz_shmids = dzone->zsz_shmids;
3826 3823                  zone->zsz_semids = dzone->zsz_semids;
3827 3824                  zone->zsz_msgids = dzone->zsz_msgids;
3828 3825                  zone->zsz_lofi = dzone->zsz_lofi;
3829 3826          }
3830 3827  
3831 3828          for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3832 3829              i < ctl->zsctl_npsets;
3833 3830              i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3834 3831                  /* LINTED */
3835 3832                  pset = (zs_pset_t *)next;
3836 3833                  next += sizeof (zs_pset_t);
3837 3834                  list_link_init(&pset->zsp_next);
3838 3835                  (void) strlcpy(pset->zsp_name, dpset->zsp_name,
3839 3836                      sizeof (pset->zsp_name));
3840 3837                  pset->zsp_id = dpset->zsp_id;
3841 3838                  pset->zsp_cputype = dpset->zsp_cputype;
3842 3839                  pset->zsp_start = dpset->zsp_start;
3843 3840                  pset->zsp_hrstart = dpset->zsp_hrstart;
3844 3841                  pset->zsp_online = dpset->zsp_online;
3845 3842                  pset->zsp_size = dpset->zsp_size;
3846 3843                  pset->zsp_min = dpset->zsp_min;
3847 3844                  pset->zsp_max = dpset->zsp_max;
3848 3845                  pset->zsp_importance = dpset->zsp_importance;
3849 3846                  pset->zsp_scheds = dpset->zsp_scheds;
3850 3847                  pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3851 3848                  pset->zsp_total_time = dpset->zsp_total_time;
3852 3849                  pset->zsp_usage_kern = dpset->zsp_usage_kern;
3853 3850                  pset->zsp_usage_zones = dpset->zsp_usage_zones;
3854 3851                  pset->zsp_nusage = dpset->zsp_nusage;
3855 3852                  /* Add pset usages for pset */
3856 3853                  for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3857 3854                      j < dpset->zsp_nusage;
3858 3855                      j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3859 3856                          /* LINTED */
3860 3857                          pusage = (zs_pset_zone_t *)next;
3861 3858                          next += sizeof (zs_pset_zone_t);
3862 3859                          /* pointers are computed by client */
3863 3860                          pusage->zspz_pset = NULL;
3864 3861                          pusage->zspz_zone = NULL;
3865 3862                          list_link_init(&pusage->zspz_next);
3866 3863                          pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3867 3864                          pusage->zspz_start = dpusage->zsu_start;
3868 3865                          pusage->zspz_hrstart = dpusage->zsu_hrstart;
3869 3866                          pusage->zspz_hrstart = dpusage->zsu_hrstart;
3870 3867                          pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3871 3868                          pusage->zspz_scheds = dpusage->zsu_scheds;
3872 3869                          pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
3873 3870                  }
3874 3871          }
3875 3872  
3876 3873          /* Update the current cache pointer */
3877 3874          (void) mutex_lock(&g_usage_cache_lock);
3878 3875                  old = g_usage_cache;
3879 3876                  cache->zsuc_ref = 1;
3880 3877                  cache->zsuc_gen = g_gen_next;
3881 3878                  usage->zsu_gen = g_gen_next;
3882 3879                  usage->zsu_size = size;
3883 3880                  g_usage_cache = cache;
3884 3881                  if (old != NULL) {
3885 3882                          old->zsuc_ref--;
3886 3883                          if (old->zsuc_ref == 0)
3887 3884                                  free(old);
3888 3885                  }
3889 3886                  g_gen_next++;
3890 3887          /* Wake up any clients that are waiting for this calculation */
3891 3888          if (g_usage_cache_kickers > 0) {
3892 3889                  (void) cond_broadcast(&g_usage_cache_wait);
3893 3890          }
3894 3891          (void) mutex_unlock(&g_usage_cache_lock);
3895 3892  }
3896 3893  
3897 3894  static zs_usage_cache_t *
3898 3895  zsd_usage_cache_hold_locked()
3899 3896  {
3900 3897          zs_usage_cache_t *ret;
3901 3898  
3902 3899          ret = g_usage_cache;
3903 3900          ret->zsuc_ref++;
3904 3901          return (ret);
3905 3902  }
3906 3903  
3907 3904  void
3908 3905  zsd_usage_cache_rele(zs_usage_cache_t *cache)
3909 3906  {
3910 3907          (void) mutex_lock(&g_usage_cache_lock);
3911 3908          cache->zsuc_ref--;
3912 3909          if (cache->zsuc_ref == 0)
3913 3910                  free(cache);
3914 3911          (void) mutex_unlock(&g_usage_cache_lock);
3915 3912  }
3916 3913  
3917 3914  /* Close the handles held by zsd_open() */
3918 3915  void
3919 3916  zsd_close(zsd_ctl_t *ctl)
3920 3917  {
3921 3918          zsd_zone_t *zone;
3922 3919          zsd_pset_t *pset;
3923 3920          zsd_pset_usage_t *usage;
3924 3921          zsd_cpu_t *cpu;
3925 3922          int id;
3926 3923  
3927 3924          if (ctl->zsctl_kstat_ctl) {
3928 3925                  (void) kstat_close(ctl->zsctl_kstat_ctl);
3929 3926                  ctl->zsctl_kstat_ctl = NULL;
3930 3927          }
3931 3928          if (ctl->zsctl_proc_open) {
3932 3929                  (void) ea_close(&ctl->zsctl_proc_eaf);
3933 3930                  ctl->zsctl_proc_open = 0;
3934 3931                  ctl->zsctl_proc_fd = -1;
3935 3932          }
3936 3933          if (ctl->zsctl_pool_conf) {
3937 3934                  if (ctl->zsctl_pool_status == POOL_ENABLED)
3938 3935                          (void) pool_conf_close(ctl->zsctl_pool_conf);
3939 3936                  ctl->zsctl_pool_status = POOL_DISABLED;
3940 3937          }
3941 3938  
3942 3939          while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3943 3940                  list_remove(&ctl->zsctl_zones, zone);
3944 3941                  free(zone);
3945 3942                  ctl->zsctl_nzones--;
3946 3943          }
3947 3944  
3948 3945          while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3949 3946                  while ((usage = list_head(&pset->zsp_usage_list))
3950 3947                      != NULL) {
3951 3948                          list_remove(&pset->zsp_usage_list, usage);
3952 3949                          ctl->zsctl_npset_usages--;
3953 3950                          free(usage);
3954 3951                  }
3955 3952                  list_remove(&ctl->zsctl_psets, pset);
3956 3953                  free(pset);
3957 3954                  ctl->zsctl_npsets--;
3958 3955          }
3959 3956  
3960 3957          /* Release all cpus being tracked */
3961 3958          while (cpu = list_head(&ctl->zsctl_cpus)) {
3962 3959                  list_remove(&ctl->zsctl_cpus, cpu);
3963 3960                  id = cpu->zsc_id;
3964 3961                  bzero(cpu, sizeof (zsd_cpu_t));
3965 3962                  cpu->zsc_id = id;
3966 3963                  cpu->zsc_allocated = B_FALSE;
3967 3964                  cpu->zsc_psetid = ZS_PSET_ERROR;
3968 3965                  cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3969 3966          }
3970 3967  
3971 3968          assert(ctl->zsctl_npset_usages == 0);
3972 3969          assert(ctl->zsctl_npsets == 0);
3973 3970          assert(ctl->zsctl_nzones == 0);
3974 3971          (void) zsd_disable_cpu_stats();
3975 3972  }
3976 3973  
3977 3974  
3978 3975  /*
3979 3976   * Update the utilization data for all zones and processor sets.
3980 3977   */
3981 3978  static int
3982 3979  zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
3983 3980  {
3984 3981          (void) kstat_chain_update(ctl->zsctl_kstat_ctl);
3985 3982          (void) gettimeofday(&(ctl->zsctl_timeofday), NULL);
3986 3983  
3987 3984          zsd_refresh_system(ctl);
3988 3985  
3989 3986          /*
3990 3987           * Memory calculation is expensive.  Only update it on sample
3991 3988           * intervals.
3992 3989           */
3993 3990          if (do_memory == B_TRUE)
3994 3991                  zsd_refresh_memory(ctl, init);
3995 3992          zsd_refresh_zones(ctl);
3996 3993          zsd_refresh_psets(ctl);
3997 3994          zsd_refresh_procs(ctl, init);
3998 3995          zsd_refresh_cpu_stats(ctl, init);
3999 3996  
4000 3997          /*
4001 3998           * Delete objects that no longer exist.
4002 3999           * Pset usages must be deleted first as they point to zone and
4003 4000           * pset objects.
4004 4001           */
4005 4002          zsd_mark_pset_usages_end(ctl);
4006 4003          zsd_mark_psets_end(ctl);
4007 4004          zsd_mark_cpus_end(ctl);
4008 4005          zsd_mark_zones_end(ctl);
4009 4006  
4010 4007          /*
4011 4008           * Save results for clients.
4012 4009           */
4013 4010          zsd_usage_cache_update(ctl);
4014 4011  
4015 4012          /*
4016 4013           * Roll process accounting file.
4017 4014           */
4018 4015          (void) zsd_roll_exacct();
4019 4016          return (0);
4020 4017  }
4021 4018  
4022 4019  /*
4023 4020   * Get the system rctl, which is the upper most limit
4024 4021   */
4025 4022  static uint64_t
4026 4023  zsd_get_system_rctl(char *name)
4027 4024  {
4028 4025          rctlblk_t *rblk, *rblk_last;
4029 4026  
4030 4027          rblk = (rctlblk_t *)alloca(rctlblk_size());
4031 4028          rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4032 4029  
4033 4030          if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4034 4031                  return (ZS_LIMIT_NONE);
4035 4032  
4036 4033          while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4037 4034                  (void) bcopy(rblk, rblk_last, rctlblk_size());
4038 4035  
4039 4036          return (rctlblk_get_value(rblk_last));
4040 4037  }
4041 4038  
4042 4039  /*
4043 4040   * Open any necessary subsystems for collecting utilization data,
4044 4041   * allocate and initialize data structures, and get initial utilization.
4045 4042   *
4046 4043   * Errors:
4047 4044   *      ENOMEM  out of memory
4048 4045   *      EINVAL  other error
4049 4046   */
4050 4047  static zsd_ctl_t *
4051 4048  zsd_open(zsd_ctl_t *ctl)
4052 4049  {
4053 4050          zsd_system_t *system;
4054 4051  
4055 4052          char path[MAXPATHLEN];
4056 4053          long pathmax;
4057 4054          struct statvfs svfs;
4058 4055          int ret;
4059 4056          int i;
4060 4057          size_t size;
4061 4058          int err;
4062 4059  
4063 4060          if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4064 4061              sizeof (zsd_ctl_t))) == NULL) {
4065 4062                          zsd_warn(gettext("Out of Memory"));
4066 4063                          errno = ENOMEM;
4067 4064                          goto err;
4068 4065          }
4069 4066          ctl->zsctl_proc_fd = -1;
4070 4067  
4071 4068          /* open kstats */
4072 4069          if (ctl->zsctl_kstat_ctl == NULL &&
4073 4070              (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4074 4071                  err = errno;
4075 4072                  zsd_warn(gettext("Unable to open kstats"));
4076 4073                  errno = err;
4077 4074                  if (errno != ENOMEM)
4078 4075                          errno = EAGAIN;
4079 4076                  goto err;
4080 4077          }
4081 4078  
4082 4079          /*
4083 4080           * These are set when the accounting file is opened by
4084 4081           * zsd_update_procs()
4085 4082           */
4086 4083          ctl->zsctl_proc_fd = -1;
4087 4084          ctl->zsctl_proc_fd_next = -1;
4088 4085          ctl->zsctl_proc_open = 0;
4089 4086          ctl->zsctl_proc_open_next = 0;
4090 4087  
4091 4088  check_exacct:
4092 4089          (void) zsd_enable_cpu_stats();
4093 4090  
4094 4091          /* Create structures to track usage */
4095 4092          if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4096 4093              calloc(1, sizeof (zsd_system_t))) == NULL) {
4097 4094                  ret = -1;
4098 4095                  zsd_warn(gettext("Out of Memory"));
4099 4096                  errno = ENOMEM;
4100 4097                  goto err;
4101 4098          }
4102 4099          system = ctl->zsctl_system;
4103 4100          /* get the kernel bitness to know structure layout for getvmusage */
4104 4101          ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4105 4102          if (ret < 0)
4106 4103                  ctl->zsctl_kern_bits = 32;
4107 4104          else
4108 4105                  ctl->zsctl_kern_bits = 64;
4109 4106          ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4110 4107  
4111 4108          size = sysconf(_SC_CPUID_MAX);
4112 4109          ctl->zsctl_maxcpuid = size;
4113 4110          if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4114 4111              (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4115 4112                  zsd_warn(gettext("Out of Memory"));
4116 4113                  errno = ENOMEM;
4117 4114                  goto err;
4118 4115          }
4119 4116          for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4120 4117                  ctl->zsctl_cpu_array[i].zsc_id = i;
4121 4118                  ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4122 4119                  ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4123 4120                  ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4124 4121          }
4125 4122          if (statvfs("/proc", &svfs) != 0 ||
4126 4123              strcmp("/proc", svfs.f_fstr) != 0) {
4127 4124                  zsd_warn(gettext("/proc not a procfs filesystem"));
4128 4125                  errno = EINVAL;
4129 4126                  goto err;
4130 4127          }
4131 4128  
4132 4129          size = sysconf(_SC_MAXPID) + 1;
4133 4130          ctl->zsctl_maxproc = size;
4134 4131          if (ctl->zsctl_proc_array == NULL &&
4135 4132              (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4136 4133              sizeof (zsd_proc_t))) == NULL) {
4137 4134                  zsd_warn(gettext("Out of Memory"));
4138 4135                  errno = ENOMEM;
4139 4136                  goto err;
4140 4137          }
4141 4138          for (i = 0; i <= ctl->zsctl_maxproc; i++) {
4142 4139                  list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4143 4140                  ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4144 4141                  ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4145 4142                  ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4146 4143                  ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4147 4144                  ctl->zsctl_proc_array[i].zspr_ppid = -1;
4148 4145          }
4149 4146  
4150 4147          list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4151 4148              offsetof(zsd_zone_t, zsz_next));
4152 4149  
4153 4150          list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4154 4151              offsetof(zsd_pset_t, zsp_next));
4155 4152  
4156 4153          list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4157 4154              offsetof(zsd_cpu_t, zsc_next));
4158 4155  
4159 4156          pathmax = pathconf("/proc", _PC_NAME_MAX);
4160 4157          if (pathmax < 0) {
4161 4158                  zsd_warn(gettext("Unable to determine max path of /proc"));
4162 4159                  errno = EINVAL;
4163 4160                  goto err;
4164 4161          }
4165 4162          size = sizeof (struct dirent) + pathmax + 1;
4166 4163  
4167 4164          ctl->zsctl_procfs_dent_size = size;
4168 4165          if (ctl->zsctl_procfs_dent == NULL &&
4169 4166              (ctl->zsctl_procfs_dent = (struct dirent *)calloc(1, size))
4170 4167              == NULL) {
4171 4168                  zsd_warn(gettext("Out of Memory"));
4172 4169                  errno = ENOMEM;
4173 4170                  goto err;
4174 4171          }
4175 4172  
4176 4173          if (ctl->zsctl_pool_conf == NULL &&
4177 4174              (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4178 4175                  zsd_warn(gettext("Out of Memory"));
4179 4176                  errno = ENOMEM;
4180 4177                  goto err;
4181 4178          }
4182 4179          ctl->zsctl_pool_status = POOL_DISABLED;
4183 4180          ctl->zsctl_pool_changed = 0;
4184 4181  
4185 4182          if (ctl->zsctl_pool_vals[0] == NULL &&
4186 4183              (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4187 4184                  zsd_warn(gettext("Out of Memory"));
4188 4185                  errno = ENOMEM;
4189 4186                  goto err;
4190 4187          }
4191 4188          if (ctl->zsctl_pool_vals[1] == NULL &&
4192 4189              (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4193 4190                  zsd_warn(gettext("Out of Memory"));
4194 4191                  errno = ENOMEM;
4195 4192                  goto err;
4196 4193          }
4197 4194          ctl->zsctl_pool_vals[2] = NULL;
4198 4195  
4199 4196          /*
4200 4197           * get system limits
4201 4198           */
4202 4199          system->zss_maxpid = size = sysconf(_SC_MAXPID);
4203 4200          system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4204 4201          system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4205 4202          system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4206 4203          system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4207 4204          system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4208 4205          system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4209 4206          system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4210 4207  
4211 4208          g_gen_next = 1;
4212 4209  
4213 4210          if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4214 4211                  zsd_warn(gettext("Reading zone statistics failed"));
4215 4212  
4216 4213          return (ctl);
4217 4214  err:
4218 4215          if (ctl)
4219 4216                  zsd_close(ctl);
4220 4217  
4221 4218          return (NULL);
4222 4219  }
4223 4220  
4224 4221  /* Copy utilization data to buffer, filtering data if non-global zone. */
4225 4222  static void
4226 4223  zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
4227 4224      boolean_t is_gz)
4228 4225  {
4229 4226          zs_usage_t *cusage;
4230 4227          zs_system_t *sys, *csys;
4231 4228          zs_zone_t *zone, *czone;
4232 4229          zs_pset_t *pset, *cpset;
4233 4230          zs_pset_zone_t *pz, *cpz, *foundpz;
4234 4231          size_t size = 0, csize = 0;
4235 4232          char *start, *cstart;
4236 4233          int i, j;
4237 4234          timestruc_t delta;
4238 4235  
4239 4236          /* Privileged users in the global zone get everything */
4240 4237          if (is_gz) {
4241 4238                  cusage = cache->zsuc_usage;
4242 4239                  (void) bcopy(cusage, usage, cusage->zsu_size);
4243 4240                  return;
4244 4241          }
4245 4242  
4246 4243          /* Zones just get their own usage */
4247 4244          cusage = cache->zsuc_usage;
4248 4245  
4249 4246          start = (char *)usage;
4250 4247          cstart = (char *)cusage;
4251 4248          size += sizeof (zs_usage_t);
4252 4249          csize += sizeof (zs_usage_t);
4253 4250  
4254 4251          usage->zsu_start = cusage->zsu_start;
4255 4252          usage->zsu_hrstart = cusage->zsu_hrstart;
4256 4253          usage->zsu_time = cusage->zsu_time;
4257 4254          usage->zsu_hrtime = cusage->zsu_hrtime;
4258 4255          usage->zsu_gen = cusage->zsu_gen;
4259 4256          usage->zsu_nzones = 1;
4260 4257          usage->zsu_npsets = 0;
4261 4258  
4262 4259          /* LINTED */
4263 4260          sys = (zs_system_t *)(start + size);
4264 4261          /* LINTED */
4265 4262          csys = (zs_system_t *)(cstart + csize);
4266 4263          size += sizeof (zs_system_t);
4267 4264          csize += sizeof (zs_system_t);
4268 4265  
4269 4266          /* Save system limits but not usage */
4270 4267          *sys = *csys;
4271 4268          sys->zss_ncpus = 0;
4272 4269          sys->zss_ncpus_online = 0;
4273 4270  
4274 4271          /* LINTED */
4275 4272          zone = (zs_zone_t *)(start + size);
4276 4273          /* LINTED */
4277 4274          czone = (zs_zone_t *)(cstart + csize);
4278 4275          /* Find the matching zone */
4279 4276          for (i = 0; i < cusage->zsu_nzones; i++) {
4280 4277                  if (czone->zsz_id == zid) {
4281 4278                          *zone = *czone;
4282 4279                          size += sizeof (zs_zone_t);
4283 4280                  }
4284 4281                  csize += sizeof (zs_zone_t);
4285 4282                  /* LINTED */
4286 4283                  czone = (zs_zone_t *)(cstart + csize);
4287 4284          }
4288 4285          sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
4289 4286          sys->zss_ram_zones = zone->zsz_usage_ram;
4290 4287  
4291 4288          sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
4292 4289          sys->zss_vm_zones = zone->zsz_usage_vm;
4293 4290  
4294 4291          sys->zss_locked_kern += (sys->zss_locked_zones -
4295 4292              zone->zsz_usage_locked);
4296 4293          sys->zss_locked_zones = zone->zsz_usage_locked;
4297 4294  
4298 4295          TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
4299 4296          TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
4300 4297          sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;
4301 4298  
4302 4299          /* LINTED */
4303 4300          pset = (zs_pset_t *)(start + size);
4304 4301          /* LINTED */
4305 4302          cpset = (zs_pset_t *)(cstart + csize);
4306 4303          for (i = 0; i < cusage->zsu_npsets; i++) {
4307 4304                  csize += sizeof (zs_pset_t);
4308 4305                  /* LINTED */
4309 4306                  cpz = (zs_pset_zone_t *)(csize + cstart);
4310 4307                  foundpz = NULL;
4311 4308                  for (j = 0; j < cpset->zsp_nusage; j++) {
4312 4309                          if (cpz->zspz_zoneid == zid)
4313 4310                                  foundpz = cpz;
4314 4311  
4315 4312                          csize += sizeof (zs_pset_zone_t);
4316 4313                          /* LINTED */
4317 4314                          cpz = (zs_pset_zone_t *)(csize + cstart);
4318 4315                  }
4319 4316                  if (foundpz != NULL) {
4320 4317                          size += sizeof (zs_pset_t);
4321 4318                          /* LINTED */
4322 4319                          pz = (zs_pset_zone_t *)(start + size);
4323 4320                          size += sizeof (zs_pset_zone_t);
4324 4321  
4325 4322                          *pset = *cpset;
4326 4323                          *pz = *foundpz;
4327 4324  
4328 4325                          TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
4329 4326                              pz->zspz_cpu_usage);
4330 4327                          TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
4331 4328                          pset->zsp_usage_zones = pz->zspz_cpu_usage;
4332 4329                          pset->zsp_nusage = 1;
4333 4330                          usage->zsu_npsets++;
4334 4331                          sys->zss_ncpus += pset->zsp_size;
4335 4332                          sys->zss_ncpus_online += pset->zsp_online;
4336 4333                  }
4337 4334                  /* LINTED */
4338 4335                  cpset = (zs_pset_t *)(cstart + csize);
4339 4336          }
4340 4337          usage->zsu_size = size;
4341 4338  }
4342 4339  
4343 4340  /*
4344 4341   * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
4345 4342   * which reports new zones.
4346 4343   */
4347 4344  /* ARGSUSED */
4348 4345  static void
4349 4346  zsd_server(void *cookie, char *argp, size_t arg_size,
4350 4347      door_desc_t *dp, uint_t n_desc)
4351 4348  {
4352 4349          int *args, cmd;
4353 4350          door_desc_t door;
4354 4351          ucred_t *ucred;
4355 4352          const priv_set_t *eset;
4356 4353  
4357 4354          if (argp == DOOR_UNREF_DATA) {
4358 4355                  (void) door_return(NULL, 0, NULL, 0);
4359 4356                  thr_exit(NULL);
4360 4357          }
4361 4358  
4362 4359          if (arg_size != sizeof (cmd) * 2) {
4363 4360                  (void) door_return(NULL, 0, NULL, 0);
4364 4361                  thr_exit(NULL);
4365 4362          }
4366 4363  
4367 4364          /* LINTED */
4368 4365          args = (int *)argp;
4369 4366          cmd = args[0];
4370 4367  
4371 4368          /* If connection, return door to stat server */
4372 4369          if (cmd == ZSD_CMD_CONNECT) {
4373 4370  
4374 4371                  /* Verify client compilation version */
4375 4372                  if (args[1] != ZS_VERSION) {
4376 4373                          args[1] = ZSD_STATUS_VERSION_MISMATCH;
4377 4374                          (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4378 4375                          thr_exit(NULL);
4379 4376                  }
4380 4377                  ucred = alloca(ucred_size());
4381 4378                  /* Verify client permission */
4382 4379                  if (door_ucred(&ucred) != 0) {
4383 4380                          args[1] = ZSD_STATUS_INTERNAL_ERROR;
4384 4381                          (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4385 4382                          thr_exit(NULL);
4386 4383                  }
4387 4384  
4388 4385                  eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4389 4386                  if (eset == NULL) {
4390 4387                          args[1] = ZSD_STATUS_INTERNAL_ERROR;
4391 4388                          (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4392 4389                          thr_exit(NULL);
4393 4390                  }
4394 4391                  if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4395 4392                          args[1] = ZSD_STATUS_PERMISSION;
4396 4393                          (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4397 4394                          thr_exit(NULL);
4398 4395                  }
4399 4396  
4400 4397                  /* Return stat server door */
4401 4398                  args[1] = ZSD_STATUS_OK;
4402 4399                  door.d_attributes = DOOR_DESCRIPTOR;
4403 4400                  door.d_data.d_desc.d_descriptor = g_stat_door;
4404 4401                  (void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4405 4402                  thr_exit(NULL);
4406 4403          }
4407 4404  
4408 4405          /* Respond to zoneadmd informing zonestatd of a new zone */
4409 4406          if (cmd == ZSD_CMD_NEW_ZONE) {
4410 4407                  zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4411 4408                  (void) door_return(NULL, 0, NULL, 0);
4412 4409                  thr_exit(NULL);
4413 4410          }
4414 4411  
4415 4412          args[1] = ZSD_STATUS_INTERNAL_ERROR;
4416 4413          (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4417 4414          thr_exit(NULL);
4418 4415  }
4419 4416  
4420 4417  /*
4421 4418   * Respond to libzonestat.so clients with the current utlilzation data.
4422 4419   */
4423 4420  /* ARGSUSED */
4424 4421  static void
4425 4422  zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4426 4423      door_desc_t *dp, uint_t n_desc)
4427 4424  {
4428 4425          uint64_t *args, cmd;
4429 4426          zs_usage_cache_t *cache;
4430 4427          int ret;
4431 4428          char *rvalp;
4432 4429          size_t rvals;
4433 4430          zs_usage_t *usage;
4434 4431          ucred_t *ucred;
4435 4432          zoneid_t zoneid;
4436 4433          const priv_set_t *eset;
4437 4434          boolean_t is_gz = B_FALSE;
4438 4435  
4439 4436          /* Tell stat thread there are no more clients */
4440 4437          if (argp == DOOR_UNREF_DATA) {
4441 4438                  (void) mutex_lock(&g_usage_cache_lock);
4442 4439                  g_hasclient = B_FALSE;
4443 4440                  (void) cond_signal(&g_usage_cache_kick);
4444 4441                  (void) mutex_unlock(&g_usage_cache_lock);
4445 4442                  (void) door_return(NULL, 0, NULL, 0);
4446 4443                  thr_exit(NULL);
4447 4444          }
4448 4445          if (arg_size != sizeof (cmd) * 2) {
4449 4446                  (void) door_return(NULL, 0, NULL, 0);
4450 4447                  thr_exit(NULL);
4451 4448          }
4452 4449          /* LINTED */
4453 4450          args = (uint64_t *)argp;
4454 4451          cmd = args[0];
4455 4452          if (cmd != ZSD_CMD_READ) {
4456 4453                  (void) door_return(NULL, 0, NULL, 0);
4457 4454                  thr_exit(NULL);
4458 4455          }
4459 4456          ucred = alloca(ucred_size());
4460 4457          if (door_ucred(&ucred) != 0) {
4461 4458                  (void) door_return(NULL, 0, NULL, 0);
4462 4459                  thr_exit(NULL);
4463 4460          }
4464 4461          zoneid = ucred_getzoneid(ucred);
4465 4462  
4466 4463          if (zoneid == GLOBAL_ZONEID)
4467 4464                  is_gz = B_TRUE;
4468 4465  
4469 4466          eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4470 4467          if (eset == NULL) {
4471 4468                  (void) door_return(NULL, 0, NULL, 0);
4472 4469                  thr_exit(NULL);
4473 4470          }
4474 4471          if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4475 4472                  (void) door_return(NULL, 0, NULL, 0);
4476 4473                  thr_exit(NULL);
4477 4474          }
4478 4475          (void) mutex_lock(&g_usage_cache_lock);
4479 4476          g_hasclient = B_TRUE;
4480 4477  
4481 4478          /*
4482 4479           * Force a new cpu calculation for client.  This will force a
4483 4480           * new memory calculation if the memory data is older than the
4484 4481           * sample period.
4485 4482           */
4486 4483          g_usage_cache_kickers++;
4487 4484          (void) cond_signal(&g_usage_cache_kick);
4488 4485          ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4489 4486          g_usage_cache_kickers--;
4490 4487          if (ret != 0 && errno == EINTR) {
4491 4488                  (void) mutex_unlock(&g_usage_cache_lock);
4492 4489                  zsd_warn(gettext(
4493 4490                      "Interrupted before writing usage size to client\n"));
4494 4491                  (void) door_return(NULL, 0, NULL, 0);
4495 4492                  thr_exit(NULL);
4496 4493          }
4497 4494          cache = zsd_usage_cache_hold_locked();
4498 4495          if (cache == NULL) {
4499 4496                  zsd_warn(gettext("Usage cache empty.\n"));
4500 4497                  (void) door_return(NULL, 0, NULL, 0);
4501 4498                  thr_exit(NULL);
4502 4499          }
4503 4500          (void) mutex_unlock(&g_usage_cache_lock);
4504 4501  
4505 4502          /* Copy current usage data to stack to send to client */
4506 4503          usage = (zs_usage_t *)alloca(cache->zsuc_size);
4507 4504  
4508 4505          /* Filter out results if caller is non-global zone */
4509 4506          zsd_usage_filter(zoneid, cache, usage, is_gz);
4510 4507  
4511 4508          rvalp = (void *)usage;
4512 4509          rvals = usage->zsu_size;
4513 4510          zsd_usage_cache_rele(cache);
4514 4511  
4515 4512          (void) door_return(rvalp, rvals, 0, NULL);
4516 4513          thr_exit(NULL);
4517 4514  }
4518 4515  
4519 4516  static volatile boolean_t g_quit;
4520 4517  
4521 4518  /* ARGSUSED */
4522 4519  static void
4523 4520  zonestat_quithandler(int sig)
4524 4521  {
4525 4522          g_quit = B_TRUE;
4526 4523  }
4527 4524  
4528 4525  /*
4529 4526   * The stat thread generates new utilization data when clients request
4530 4527   * it.  It also manages opening and closing the subsystems used to gather
4531 4528   * data depending on if clients exist.
4532 4529   */
4533 4530  /* ARGSUSED */
4534 4531  void *
4535 4532  stat_thread(void *arg)
4536 4533  {
4537 4534          time_t start;
4538 4535          time_t now;
4539 4536          time_t next_memory;
4540 4537          boolean_t do_memory;
4541 4538          boolean_t do_read;
4542 4539          boolean_t do_close;
4543 4540  
4544 4541          start = time(NULL);
4545 4542          if (start < 0) {
4546 4543                  if (g_quit == B_TRUE)
4547 4544                          goto quit;
4548 4545                  zsd_warn(gettext("Unable to fetch current time"));
4549 4546                  g_quit = B_TRUE;
4550 4547                  goto quit;
4551 4548          }
4552 4549  
4553 4550          next_memory = start;
4554 4551          while (g_quit == B_FALSE) {
4555 4552                  for (;;) {
4556 4553                          /*
4557 4554                           * These are used to decide if the most recent memory
4558 4555                           * calculation was within a sample interval,
4559 4556                           * and weather or not the usage collection needs to
4560 4557                           * be opened or closed.
4561 4558                           */
4562 4559                          do_memory = B_FALSE;
4563 4560                          do_read = B_FALSE;
4564 4561                          do_close = B_FALSE;
4565 4562  
4566 4563                          /*
4567 4564                           * If all clients have gone, close usage collecting
4568 4565                           */
4569 4566                          (void) mutex_lock(&g_usage_cache_lock);
4570 4567                          if (!g_hasclient && g_open == B_TRUE) {
4571 4568                                  do_close = B_TRUE;
4572 4569                                  (void) mutex_unlock(&g_usage_cache_lock);
4573 4570                                  break;
4574 4571                          }
4575 4572                          if (g_quit == B_TRUE) {
4576 4573                                  (void) mutex_unlock(
4577 4574                                      &g_usage_cache_lock);
4578 4575                                  break;
4579 4576                          }
4580 4577                          /*
4581 4578                           * Wait for a usage data request
4582 4579                           */
4583 4580                          if (g_usage_cache_kickers == 0) {
4584 4581                                  (void) cond_wait(&g_usage_cache_kick,
4585 4582                                      &g_usage_cache_lock);
4586 4583                          }
4587 4584                          now = time(NULL);
4588 4585                          if (now < 0) {
4589 4586                                  if (g_quit == B_TRUE) {
4590 4587                                          (void) mutex_unlock(
4591 4588                                              &g_usage_cache_lock);
4592 4589                                          goto quit;
4593 4590                                  }
4594 4591                                  g_quit = B_TRUE;
4595 4592                                  (void) mutex_unlock(&g_usage_cache_lock);
4596 4593                                  zsd_warn(gettext(
4597 4594                                      "Unable to fetch current time"));
4598 4595                                  goto quit;
4599 4596                          }
4600 4597                          if (g_hasclient) {
4601 4598                                  do_read = B_TRUE;
4602 4599                                  if (now >= next_memory) {
4603 4600                                          do_memory = B_TRUE;
4604 4601                                          next_memory = now + g_interval;
4605 4602                                  }
4606 4603                          } else {
4607 4604                                  do_close = B_TRUE;
4608 4605                          }
4609 4606                          (void) mutex_unlock(&g_usage_cache_lock);
4610 4607                          if (do_read || do_close)
4611 4608                                  break;
4612 4609                  }
4613 4610                  g_now = now;
4614 4611                  g_hrnow = gethrtime();
4615 4612                  if (g_hasclient && g_open == B_FALSE) {
4616 4613                          g_start = g_now;
4617 4614                          g_hrstart = g_hrnow;
4618 4615                          g_ctl = zsd_open(g_ctl);
4619 4616                          if (g_ctl == NULL)
4620 4617                                  zsd_warn(gettext(
4621 4618                                      "Unable to open zone statistics"));
4622 4619                          else
4623 4620                                  g_open = B_TRUE;
4624 4621                  }
4625 4622                  if (do_read && g_ctl) {
4626 4623                          if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4627 4624                                  zsd_warn(gettext(
4628 4625                                      "Unable to read zone statistics"));
4629 4626                                  g_quit = B_TRUE;
4630 4627                                  return (NULL);
4631 4628                          }
4632 4629                  }
4633 4630                  (void) mutex_lock(&g_usage_cache_lock);
4634 4631                  if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4635 4632                          (void) mutex_unlock(&g_usage_cache_lock);
4636 4633                          zsd_close(g_ctl);
4637 4634                          g_open = B_FALSE;
4638 4635                  } else {
4639 4636                          (void) mutex_unlock(&g_usage_cache_lock);
4640 4637                  }
4641 4638          }
4642 4639  quit:
4643 4640          if (g_open)
4644 4641                  zsd_close(g_ctl);
4645 4642  
4646 4643          (void) thr_kill(g_main, SIGINT);
4647 4644          thr_exit(NULL);
4648 4645          return (NULL);
4649 4646  }
4650 4647  
4651 4648  void
4652 4649  zsd_set_fx()
4653 4650  {
4654 4651          pcinfo_t pcinfo;
4655 4652          pcparms_t pcparms;
4656 4653  
4657 4654          (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4658 4655          if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4659 4656                  zsd_warn(gettext("cannot get FX class parameters"));
4660 4657                  return;
4661 4658          }
4662 4659          pcparms.pc_cid = pcinfo.pc_cid;
4663 4660          ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4664 4661          ((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4665 4662          ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4666 4663          ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4667 4664          if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4668 4665                  zsd_warn(gettext("cannot enter the FX class"));
4669 4666  }
4670 4667  
/* Write end of the startup pipe; assigned in the child by daemonize_start() */
static int pipe_fd;

/*
 * Report the startup result to the waiting parent: write the single status
 * byte to the startup pipe and then close it.
 */
static void
daemonize_ready(char status)
{
	char code = status;

	(void) write(pipe_fd, &code, sizeof (code));
	(void) close(pipe_fd);
}
4682 4679  
4683 4680  static int
4684 4681  daemonize_start(void)
4685 4682  {
4686 4683          char data;
4687 4684          int status;
4688 4685  
4689 4686          int filedes[2];
4690 4687          pid_t pid;
4691 4688  
4692 4689          (void) close(0);
4693 4690          (void) dup2(2, 1);
4694 4691  
4695 4692          if (pipe(filedes) < 0)
4696 4693                  return (-1);
4697 4694  
4698 4695          (void) fflush(NULL);
4699 4696  
4700 4697          if ((pid = fork1()) < 0)
4701 4698                  return (-1);
4702 4699  
4703 4700          if (pid != 0) {
4704 4701                  /*
4705 4702                   * parent
4706 4703                   */
4707 4704                  struct sigaction act;
4708 4705  
4709 4706                  act.sa_sigaction = SIG_DFL;
4710 4707                  (void) sigemptyset(&act.sa_mask);
4711 4708                  act.sa_flags = 0;
4712 4709  
4713 4710                  (void) sigaction(SIGPIPE, &act, NULL);  /* ignore SIGPIPE */
4714 4711  
4715 4712                  (void) close(filedes[1]);
4716 4713                  if (read(filedes[0], &data, 1) == 1) {
4717 4714                          /* forward ready code via exit status */
4718 4715                          exit(data);
4719 4716                  }
4720 4717                  status = -1;
4721 4718                  (void) wait4(pid, &status, 0, NULL);
4722 4719                  /* daemon process exited before becoming ready */
4723 4720                  if (WIFEXITED(status)) {
4724 4721                          /* assume daemon process printed useful message */
4725 4722                          exit(WEXITSTATUS(status));
4726 4723                  } else {
4727 4724                          zsd_warn(gettext("daemon process killed or died"));
4728 4725                          exit(1);
4729 4726                  }
4730 4727          }
4731 4728  
4732 4729          /*
4733 4730           * child
4734 4731           */
4735 4732          pipe_fd = filedes[1];
4736 4733          (void) close(filedes[0]);
4737 4734  
4738 4735          /*
4739 4736           * generic Unix setup
4740 4737           */
4741 4738          (void) setsid();
4742 4739          (void) umask(0000);
4743 4740  
4744 4741          return (0);
4745 4742  }
4746 4743  
4747 4744  static void
4748 4745  fattach_all_zones(boolean_t detach_only)
4749 4746  {
4750 4747          zoneid_t *zids;
4751 4748          uint_t nzids, nzids_last;
4752 4749          int i;
4753 4750  
4754 4751  again:
4755 4752          (void) zone_list(NULL, &nzids);
4756 4753          nzids_last = nzids;
4757 4754          zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4758 4755          if (zids == NULL)
4759 4756                  zsd_error(gettext("Out of memory"));
4760 4757  
4761 4758          (void) zone_list(zids, &nzids);
4762 4759          if (nzids > nzids_last) {
4763 4760                  free(zids);
4764 4761                  goto again;
4765 4762          }
4766 4763          for (i = 0; i < nzids; i++)
4767 4764                  zsd_fattach_zone(zids[i], g_server_door, detach_only);
4768 4765  
4769 4766          free(zids);
4770 4767  }
4771 4768  
4772 4769  int
4773 4770  main(int argc, char *argv[])
4774 4771  {
4775 4772  
4776 4773          int arg;
4777 4774          thread_t tid;
4778 4775          scf_simple_prop_t *prop;
4779 4776          uint64_t *intervalp;
4780 4777          boolean_t opt_cleanup = B_FALSE;
4781 4778  
4782 4779          g_main = thr_self();
4783 4780          g_quit = B_FALSE;
4784 4781          (void) signal(SIGINT, zonestat_quithandler);
4785 4782          (void) signal(SIGTERM, zonestat_quithandler);
4786 4783          (void) signal(SIGHUP, zonestat_quithandler);
4787 4784  /*      (void) sigignore(SIGCHLD); */
4788 4785          (void) sigignore(SIGPIPE);
4789 4786  
4790 4787          if (getzoneid() != GLOBAL_ZONEID)
4791 4788                  zsd_error(gettext("Must be run from global zone only"));
4792 4789  
4793 4790          while ((arg = getopt(argc, argv, "c"))
4794 4791              != EOF) {
4795 4792                  switch (arg) {
4796 4793                  case 'c':
4797 4794                          opt_cleanup = B_TRUE;
4798 4795                          break;
4799 4796                  default:
4800 4797                          zsd_error(gettext("Invalid option"));
4801 4798                  }
4802 4799          }
4803 4800  
4804 4801          if (opt_cleanup) {
4805 4802                  if (zsd_disable_cpu_stats() != 0)
4806 4803                          exit(1);
4807 4804                  else
4808 4805                          exit(0);
4809 4806          }
4810 4807  
4811 4808          /* Get the configured sample interval */
4812 4809          prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
4813 4810              "config", "sample_interval");
4814 4811          if (prop == NULL)
4815 4812                  zsd_error(gettext("Unable to fetch SMF property "
4816 4813                      "\"config/sample_interval\""));
4817 4814  
4818 4815          if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
4819 4816                  zsd_error(gettext("Malformed SMF property "
4820 4817                      "\"config/sample_interval\".  Must be of type \"count\""));
4821 4818  
4822 4819          intervalp = scf_simple_prop_next_count(prop);
4823 4820          g_interval = *intervalp;
4824 4821          if (g_interval == 0)
4825 4822                  zsd_error(gettext("Malformed SMF property "
4826 4823                      "\"config/sample_interval\".  Must be greater than zero"));
4827 4824  
4828 4825          scf_simple_prop_free(prop);
4829 4826  
4830 4827          if (daemonize_start() < 0)
4831 4828                  zsd_error(gettext("Unable to start daemon\n"));
4832 4829  
4833 4830          /* Run at high priority */
4834 4831          zsd_set_fx();
4835 4832  
4836 4833          (void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
4837 4834          (void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
4838 4835          (void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);
4839 4836  
4840 4837          g_server_door = door_create(zsd_server, NULL,
4841 4838              DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4842 4839          if (g_server_door < 0)
4843 4840                  zsd_error(gettext("Unable to create server door\n"));
4844 4841  
4845 4842  
4846 4843          g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
4847 4844              DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4848 4845          if (g_stat_door < 0)
4849 4846                  zsd_error(gettext("Unable to create statistics door\n"));
4850 4847  
4851 4848          fattach_all_zones(B_FALSE);
4852 4849  
4853 4850          if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
4854 4851                  zsd_error(gettext("Unable to create statistics thread\n"));
4855 4852  
4856 4853          daemonize_ready(0);
4857 4854  
4858 4855          /* Wait for signal to quit */
4859 4856          while (g_quit == B_FALSE)
4860 4857                  (void) pause();
4861 4858  
4862 4859          /* detach doors */
4863 4860          fattach_all_zones(B_TRUE);
4864 4861  
4865 4862          (void) door_revoke(g_server_door);
4866 4863          (void) door_revoke(g_stat_door);
4867 4864  
4868 4865          /* kick stat thread and wait for it to close the statistics */
4869 4866          (void) mutex_lock(&g_usage_cache_lock);
4870 4867          g_quit = B_TRUE;
4871 4868          (void) cond_signal(&g_usage_cache_kick);
4872 4869          (void) mutex_unlock(&g_usage_cache_lock);
4873 4870  end:
4874 4871          (void) thr_join(tid, NULL, NULL);
4875 4872          return (0);
4876 4873  }
  
    | 
      ↓ open down ↓ | 
    2522 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX