1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 #include <alloca.h>
  26 #include <assert.h>
  27 #include <dirent.h>
  28 #include <dlfcn.h>
  29 #include <door.h>
  30 #include <errno.h>
  31 #include <exacct.h>
  32 #include <ctype.h>
  33 #include <fcntl.h>
  34 #include <kstat.h>
  35 #include <libcontract.h>
  36 #include <libintl.h>
  37 #include <libscf.h>
  38 #include <zonestat.h>
  39 #include <zonestat_impl.h>
  40 #include <limits.h>
  41 #include <pool.h>
  42 #include <procfs.h>
  43 #include <rctl.h>
  44 #include <thread.h>
  45 #include <signal.h>
  46 #include <stdarg.h>
  47 #include <stddef.h>
  48 #include <stdio.h>
  49 #include <stdlib.h>
  50 #include <strings.h>
  51 #include <synch.h>
  52 #include <sys/acctctl.h>
  53 #include <sys/contract/process.h>
  54 #include <sys/ctfs.h>
  55 #include <sys/fork.h>
  56 #include <sys/param.h>
  57 #include <sys/priocntl.h>
  58 #include <sys/fxpriocntl.h>
  59 #include <sys/processor.h>
  60 #include <sys/pset.h>
  61 #include <sys/socket.h>
  62 #include <sys/stat.h>
  63 #include <sys/statvfs.h>
  64 #include <sys/swap.h>
  65 #include <sys/systeminfo.h>
  66 #include <thread.h>
  67 #include <sys/list.h>
  68 #include <sys/time.h>
  69 #include <sys/types.h>
  70 #include <sys/vm_usage.h>
  71 #include <sys/wait.h>
  72 #include <sys/zone.h>
  73 #include <time.h>
  74 #include <ucred.h>
  75 #include <unistd.h>
  76 #include <vm/anon.h>
  77 #include <zone.h>
  78 #include <zonestat.h>
  79 
  80 #define MAX_PSET_NAME   1024    /* Taken from PV_NAME_MAX_LEN */
  81 #define ZSD_PSET_UNLIMITED      UINT16_MAX
  82 #define ZONESTAT_EXACCT_FILE    "/var/adm/exacct/zonestat-process"
  83 
  84 /*
  85  * zonestatd implements gathering cpu and memory utilization data for
  86  * running zones.  It has these components:
  87  *
  88  * zsd_server:
  89  *      Door server to respond to client connections.  Each client
  90  *      will connect using libzonestat.so, which will open and
  91  *      call /var/tmp/.zonestat_door.  Each connecting client is given
  92  *      a file descriptor to the stat server.
  93  *
  94  *      The zsd_server also responds to zoneadmd, which reports when a
  95  *      new zone is booted.  This is used to fattach the zsd_server door
  96  *      into the new zone.
  97  *
  98  * zsd_stat_server:
  99  *      Receives client requests for the current utilization data.  Each
 100  *      client request will cause zonestatd to update the current utilization
 101  *      data by kicking the stat_thread.
 102  *
 103  *      If the client is in a non-global zone, the utilization data will
 104  *      be filtered to only show the given zone.  The usage by all other zones
 105  *      will be added to the system utilization.
 106  *
 107  * stat_thread:
 108  *      The stat thread implements querying the system to determine the
 109  *      current utilization data for each running zone.  This includes
 110  *      inspecting the system's processor set configuration, as well as details
 111  *      of each zone, such as their configured limits, and which processor
 112  *      sets they are running in.
 113  *
 114  *      The stat_thread will only update memory utilization data as often as
 115  *      the configured config/sample_interval on the zones-monitoring service.
 116  */
 117 
 118 /*
 119  * The private vmusage structure unfortunately uses size_t types, and assumes
 120  * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
 121  * system call is contracted, and zonestatd is 32 bit, the following structures
 122  * are used to interact with a 32bit or 64 bit kernel.
 123  */
/*
 * vmusage record layout with 32-bit usage counters, for exchanging data with
 * a 32-bit kernel.  Fixed-width members are used instead of size_t so the
 * layout does not depend on how zonestatd itself is compiled.
 */
typedef struct zsd_vmusage32 {
        id_t vmu_zoneid;
        uint_t vmu_type;
        id_t vmu_id;

        /* Resident set and swap figures: total, private and shared */
        uint32_t vmu_rss_all;
        uint32_t vmu_rss_private;
        uint32_t vmu_rss_shared;
        uint32_t vmu_swap_all;
        uint32_t vmu_swap_private;
        uint32_t vmu_swap_shared;
} zsd_vmusage32_t;
 136 
/*
 * vmusage record layout with 64-bit usage counters, for exchanging data with
 * a 64-bit kernel.  An explicit padding member keeps the 64-bit counters at
 * the offsets a 64-bit kernel uses even when this file is built 32-bit.
 */
typedef struct zsd_vmusage64 {
        id_t vmu_zoneid;
        uint_t vmu_type;
        id_t vmu_id;
        /*
         * An amd64 kernel will align the following uint64_t members, but a
         * 32bit i386 process will not without help.
         */
        int vmu_align_next_members_on_8_bytes;
        /* Resident set and swap figures: total, private and shared */
        uint64_t vmu_rss_all;
        uint64_t vmu_rss_private;
        uint64_t vmu_rss_shared;
        uint64_t vmu_swap_all;
        uint64_t vmu_swap_private;
        uint64_t vmu_swap_shared;
} zsd_vmusage64_t;
 153 
 154 struct zsd_zone;
 155 
/*
 * Used to store a zone's usage of a pset.  One record ties a single zone
 * (zsu_zone) to a single pset (zsu_pset).
 */
typedef struct zsd_pset_usage {
        struct zsd_zone *zsu_zone;
        struct zsd_pset *zsu_pset;

        /* List linkage; presumably on the pset's zsp_usage_list -- confirm */
        list_node_t     zsu_next;

        zoneid_t        zsu_zoneid;
        boolean_t       zsu_found;      /* zone bound at end of interval */
        boolean_t       zsu_active;     /* zone was bound during interval */
        boolean_t       zsu_new;        /* zone newly bound in this interval */
        boolean_t       zsu_deleted;    /* zone was unbound in this interval */
        boolean_t       zsu_empty;      /* no procs in pset in this interval */
        time_t          zsu_start;      /* time when zone was found in pset */
        hrtime_t        zsu_hrstart;    /* hrtime when zone was found in pset */
        uint64_t        zsu_cpu_shares;
        uint_t          zsu_scheds;     /* schedulers found in this pass */
        timestruc_t     zsu_cpu_usage;  /* cpu time used */
} zsd_pset_usage_t;
 175 
/*
 * Used to store a pset's utilization: its configuration for the interval,
 * the per-zone usage records hanging off it, and cpu times summed from the
 * kstats of its member cpus.
 */
typedef struct zsd_pset {
        psetid_t        zsp_id;
        list_node_t     zsp_next;
        char            zsp_name[ZS_PSETNAME_MAX];

        uint_t          zsp_cputype;    /* default, dedicated or shared */
        boolean_t       zsp_found;      /* pset found at end of interval */
        boolean_t       zsp_new;        /* pset new in this interval */
        boolean_t       zsp_deleted;    /* pset deleted in this interval */
        boolean_t       zsp_active;     /* pset existed during interval */
        boolean_t       zsp_empty;      /* no processes in pset */
        time_t          zsp_start;
        hrtime_t        zsp_hrstart;

        uint64_t        zsp_online;     /* online cpus in interval */
        uint64_t        zsp_size;       /* size in this interval */
        uint64_t        zsp_min;        /* configured min in this interval */
        uint64_t        zsp_max;        /* configured max in this interval */
        int64_t         zsp_importance; /* configured importance in interval */

        uint_t          zsp_scheds;     /* scheds of processes found in pset */
        uint64_t        zsp_cpu_shares; /* total shares in this interval */

        timestruc_t     zsp_total_time;
        timestruc_t     zsp_usage_kern;
        timestruc_t     zsp_usage_zones;

        /* Individual zone usages of pset */
        list_t          zsp_usage_list;
        int             zsp_nusage;

        /* Summed kstat values from individual cpus in pset */
        timestruc_t     zsp_idle;
        timestruc_t     zsp_intr;
        timestruc_t     zsp_kern;
        timestruc_t     zsp_user;

} zsd_pset_t;
 215 
/*
 * Used to track an individual cpu's utilization as reported by kstats.
 * Both the current and the previous raw kstat readings are kept, together
 * with running totals of the increases.
 */
typedef struct zsd_cpu {
        processorid_t   zsc_id;
        list_node_t     zsc_next;
        psetid_t        zsc_psetid;
        psetid_t        zsc_psetid_prev;
        zsd_pset_t      *zsc_pset;

        boolean_t       zsc_found;      /* cpu online in this interval */
        boolean_t       zsc_onlined;    /* cpu onlined during this interval */
        boolean_t       zsc_offlined;   /* cpu offlined during this interval */
        boolean_t       zsc_active;     /* cpu online during this interval */
        boolean_t       zsc_allocated;  /* True if cpu has ever been found */

        /* kstats this interval */
        uint64_t        zsc_nsec_idle;
        uint64_t        zsc_nsec_intr;
        uint64_t        zsc_nsec_kern;
        uint64_t        zsc_nsec_user;

        /* kstats in most recent interval */
        uint64_t        zsc_nsec_idle_prev;
        uint64_t        zsc_nsec_intr_prev;
        uint64_t        zsc_nsec_kern_prev;
        uint64_t        zsc_nsec_user_prev;

        /* Total kstat increases since zonestatd started reading kstats */
        timestruc_t     zsc_idle;
        timestruc_t     zsc_intr;
        timestruc_t     zsc_kern;
        timestruc_t     zsc_user;

} zsd_cpu_t;
 249 
/*
 * Used to describe an individual zone and its utilization: identity,
 * per-interval state flags, configured caps, and current usage counters.
 */
typedef struct zsd_zone {
        zoneid_t        zsz_id;
        list_node_t     zsz_next;
        char            zsz_name[ZS_ZONENAME_MAX];
        uint_t          zsz_cputype;
        uint_t          zsz_iptype;
        time_t          zsz_start;
        hrtime_t        zsz_hrstart;

        char            zsz_pool[ZS_POOLNAME_MAX];
        char            zsz_pset[ZS_PSETNAME_MAX];
        int             zsz_default_sched;
        /* These are deduced by inspecting processes */
        psetid_t        zsz_psetid;
        uint_t          zsz_scheds;

        boolean_t       zsz_new;        /* zone booted during this interval */
        boolean_t       zsz_deleted;    /* halted during this interval */
        boolean_t       zsz_active;     /* running in this interval */
        boolean_t       zsz_empty;      /* no processes in this interval */
        boolean_t       zsz_gone;       /* not installed in this interval */
        boolean_t       zsz_found;      /* Running at end of this interval */

        /* Configured limits */
        uint64_t        zsz_cpu_shares;
        uint64_t        zsz_cpu_cap;
        uint64_t        zsz_ram_cap;
        uint64_t        zsz_locked_cap;
        uint64_t        zsz_vm_cap;

        uint64_t        zsz_cpus_online;
        timestruc_t     zsz_cpu_usage;  /* cpu time used (TODO confirm) */
        timestruc_t     zsz_cap_time;   /* cpu time of cpu cap */
        timestruc_t     zsz_share_time; /* cpu time of share of cpu */
        timestruc_t     zsz_pset_time;  /* time of all psets zone is bound to */

        uint64_t        zsz_usage_ram;
        uint64_t        zsz_usage_locked;
        uint64_t        zsz_usage_vm;

        /* Configured limits on the counters below */
        uint64_t        zsz_processes_cap;
        uint64_t        zsz_lwps_cap;
        uint64_t        zsz_shm_cap;
        uint64_t        zsz_shmids_cap;
        uint64_t        zsz_semids_cap;
        uint64_t        zsz_msgids_cap;
        uint64_t        zsz_lofi_cap;

        /* Counters matching the *_cap members above */
        uint64_t        zsz_processes;
        uint64_t        zsz_lwps;
        uint64_t        zsz_shm;
        uint64_t        zsz_shmids;
        uint64_t        zsz_semids;
        uint64_t        zsz_msgids;
        uint64_t        zsz_lofi;

} zsd_zone_t;
 307 
 308 /*
 * Used to track the cpu usage of an individual process.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes
 * to their zone.  As processes exit, their extended accounting records are
 313  * read and the difference of their total and known usage is charged to their
 314  * zone.
 315  *
 316  * If a process is never seen in /proc, the total usage on its extended
 317  * accounting record will be charged to its zone.
 318  */
/* Per-process record: last known binding and cpu usage of one process. */
typedef struct zsd_proc {
        list_node_t     zspr_next;
        pid_t           zspr_ppid;      /* parent pid */
        psetid_t        zspr_psetid;    /* pset the process was seen in */
        zoneid_t        zspr_zoneid;    /* zone the process was seen in */
        int             zspr_sched;     /* scheduler of the process */
        timestruc_t     zspr_usage;     /* known cpu usage of the process */
} zsd_proc_t;
 327 
/*
 * Used to track the overall resource usage of the system.  Members ending
 * in _kern and _zones split a figure between the kernel and the zones;
 * members ending in _max are the system-wide limits for the counters of the
 * same name.
 */
typedef struct zsd_system {

        uint64_t zss_ram_total;
        uint64_t zss_ram_kern;
        uint64_t zss_ram_zones;

        uint64_t zss_locked_kern;
        uint64_t zss_locked_zones;

        uint64_t zss_vm_total;
        uint64_t zss_vm_kern;
        uint64_t zss_vm_zones;

        uint64_t zss_swap_total;
        uint64_t zss_swap_used;

        /* NOTE(review): presumably system-wide sums of the cpu kstats */
        timestruc_t zss_idle;
        timestruc_t zss_intr;
        timestruc_t zss_kern;
        timestruc_t zss_user;

        timestruc_t zss_cpu_total_time;
        timestruc_t zss_cpu_usage_kern;
        timestruc_t zss_cpu_usage_zones;

        uint64_t zss_maxpid;
        uint64_t zss_processes_max;
        uint64_t zss_lwps_max;
        uint64_t zss_shm_max;
        uint64_t zss_shmids_max;
        uint64_t zss_semids_max;
        uint64_t zss_msgids_max;
        uint64_t zss_lofi_max;

        uint64_t zss_processes;
        uint64_t zss_lwps;
        uint64_t zss_shm;
        uint64_t zss_shmids;
        uint64_t zss_semids;
        uint64_t zss_msgids;
        uint64_t zss_lofi;

        uint64_t zss_ncpus;
        uint64_t zss_ncpus_online;

} zsd_system_t;
 375 
/*
 * A dumping ground for various information and structures used to compute
 * utilization.
 *
 * This structure is used to track the system while clients are connected.
 * When the first client connects, a zsd_ctl is allocated and configured by
 * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
 */
typedef struct zsd_ctl {
        kstat_ctl_t     *zsctl_kstat_ctl;

        /* To track extended accounting */
        int             zsctl_proc_fd;          /* Log currently being used */
        ea_file_t       zsctl_proc_eaf;
        struct stat64   zsctl_proc_stat;
        int             zsctl_proc_open;
        int             zsctl_proc_fd_next;     /* Log file to use next */
        ea_file_t       zsctl_proc_eaf_next;
        struct stat64   zsctl_proc_stat_next;
        int             zsctl_proc_open_next;

        /* pool configuration handle */
        pool_conf_t     *zsctl_pool_conf;
        int             zsctl_pool_status;
        int             zsctl_pool_changed;

        /* The usage tracking structures defined above */
        zsd_system_t    *zsctl_system;
        list_t          zsctl_zones;
        list_t          zsctl_psets;
        list_t          zsctl_cpus;
        zsd_cpu_t       *zsctl_cpu_array;
        zsd_proc_t      *zsctl_proc_array;

        /* Various system info */
        uint64_t        zsctl_maxcpuid;
        uint64_t        zsctl_maxproc;
        uint64_t        zsctl_kern_bits;
        uint64_t        zsctl_pagesize;

        /* Used to track time available under a cpu cap. */
        uint64_t        zsctl_hrtime;
        uint64_t        zsctl_hrtime_prev;
        timestruc_t     zsctl_hrtime_total;

        struct timeval  zsctl_timeofday;

        /* Caches for arrays allocated for use by various system calls */
        psetid_t        *zsctl_pset_cache;
        uint_t          zsctl_pset_ncache;
        processorid_t   *zsctl_cpu_cache;
        uint_t          zsctl_cpu_ncache;
        zoneid_t        *zsctl_zone_cache;
        uint_t          zsctl_zone_ncache;
        struct swaptable *zsctl_swap_cache;
        uint64_t        zsctl_swap_cache_size;
        uint64_t        zsctl_swap_cache_num;
        zsd_vmusage64_t *zsctl_vmusage_cache;
        uint64_t        zsctl_vmusage_cache_num;

        /* Info about procfs for scanning /proc */
        struct dirent   *zsctl_procfs_dent;
        long            zsctl_procfs_dent_size;
        /* NOTE(review): pool value handles; their use is not shown here */
        pool_value_t    *zsctl_pool_vals[3];

        /* Counts on tracked entities */
        uint_t          zsctl_nzones;
        uint_t          zsctl_npsets;
        uint_t          zsctl_npset_usages;
} zsd_ctl_t;
 446 
/* Daemon-wide state shared by the server and stat threads. */
zsd_ctl_t               *g_ctl;         /* The one zsd_ctl tracking the system */
boolean_t               g_open;         /* True if g_ctl is open */
int                     g_hasclient;    /* True if any clients are connected */

/*
 * The usage cache is updated by the stat_thread, and copied to clients by
 * the zsd_stat_server.  Mutex and cond are to synchronize between the
 * stat_thread and the stat_server.
 */
zs_usage_cache_t        *g_usage_cache;
mutex_t                 g_usage_cache_lock;
cond_t                  g_usage_cache_kick;
uint_t                  g_usage_cache_kickers;
cond_t                  g_usage_cache_wait;
char                    *g_usage_cache_buf;
uint_t                  g_usage_cache_bufsz;
/* NOTE(review): presumably the next usage generation number -- confirm */
uint64_t                g_gen_next;

/* fds of door servers */
int                     g_server_door;
int                     g_stat_door;

/*
 * Starting and current time.  Used to throttle memory calculation, and to
 * mark new zones and psets with their boot and creation time.
 */
time_t                  g_now;
time_t                  g_start;
hrtime_t                g_hrnow;
hrtime_t                g_hrstart;
uint64_t                g_interval;

/*
 * main() thread.
 */
thread_t                g_main;
 483 
 484 /* PRINTFLIKE1 */
 485 static void
 486 zsd_warn(const char *fmt, ...)
 487 {
 488         va_list alist;
 489 
 490         va_start(alist, fmt);
 491 
 492         (void) fprintf(stderr, gettext("zonestat: Warning: "));
 493         (void) vfprintf(stderr, fmt, alist);
 494         (void) fprintf(stderr, "\n");
 495         va_end(alist);
 496 }
 497 
 498 /* PRINTFLIKE1 */
 499 static void
 500 zsd_error(const char *fmt, ...)
 501 {
 502         va_list alist;
 503 
 504         va_start(alist, fmt);
 505 
 506         (void) fprintf(stderr, gettext("zonestat: Error: "));
 507         (void) vfprintf(stderr, fmt, alist);
 508         (void) fprintf(stderr, "\n");
 509         va_end(alist);
 510         exit(1);
 511 }
 512 
 513 /* Turns on extended accounting if not configured externally */
 514 int
 515 zsd_enable_cpu_stats()
 516 {
 517         char *path = ZONESTAT_EXACCT_FILE;
 518         char oldfile[MAXPATHLEN];
 519         int ret, state = AC_ON;
 520         ac_res_t res[6];
 521 
 522         /*
 523          * Start a new accounting file  if accounting not configured
 524          * externally.
 525          */
 526 
 527         res[0].ar_id = AC_PROC_PID;
 528         res[0].ar_state = AC_ON;
 529         res[1].ar_id = AC_PROC_ANCPID;
 530         res[1].ar_state = AC_ON;
 531         res[2].ar_id = AC_PROC_CPU;
 532         res[2].ar_state = AC_ON;
 533         res[3].ar_id = AC_PROC_TIME;
 534         res[3].ar_state = AC_ON;
 535         res[4].ar_id = AC_PROC_ZONENAME;
 536         res[4].ar_state = AC_ON;
 537         res[5].ar_id = AC_NONE;
 538         res[5].ar_state = AC_ON;
 539         if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
 540                 zsd_warn(gettext("Unable to set accounting resources"));
 541                 return (-1);
 542         }
 543         /* Only set accounting file if none is configured */
 544         ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 545         if (ret < 0) {
 546 
 547                 (void) unlink(path);
 548                 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
 549                     == -1) {
 550                         zsd_warn(gettext("Unable to set accounting file"));
 551                         return (-1);
 552                 }
 553         }
 554         if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
 555                 zsd_warn(gettext("Unable to enable accounting"));
 556                 return (-1);
 557         }
 558         return (0);
 559 }
 560 
 561 /* Turns off extended accounting if not configured externally */
 562 int
 563 zsd_disable_cpu_stats()
 564 {
 565         char *path = ZONESTAT_EXACCT_FILE;
 566         int ret, state = AC_OFF;
 567         ac_res_t res[6];
 568         char oldfile[MAXPATHLEN];
 569 
 570         /* If accounting file is externally configured, leave it alone */
 571         ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 572         if (ret == 0 && strcmp(oldfile, path) != 0)
 573                 return (0);
 574 
 575         res[0].ar_id = AC_PROC_PID;
 576         res[0].ar_state = AC_OFF;
 577         res[1].ar_id = AC_PROC_ANCPID;
 578         res[1].ar_state = AC_OFF;
 579         res[2].ar_id = AC_PROC_CPU;
 580         res[2].ar_state = AC_OFF;
 581         res[3].ar_id = AC_PROC_TIME;
 582         res[3].ar_state = AC_OFF;
 583         res[4].ar_id = AC_PROC_ZONENAME;
 584         res[4].ar_state = AC_OFF;
 585         res[5].ar_id = AC_NONE;
 586         res[5].ar_state = AC_OFF;
 587         if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
 588                 zsd_warn(gettext("Unable to clear accounting resources"));
 589                 return (-1);
 590         }
 591         if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
 592                 zsd_warn(gettext("Unable to clear accounting file"));
 593                 return (-1);
 594         }
 595         if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
 596                 zsd_warn(gettext("Unable to diable accounting"));
 597                 return (-1);
 598         }
 599 
 600         (void) unlink(path);
 601         return (0);
 602 }
 603 
 604 /*
 605  * If not configured externally, deletes the current extended accounting file
 606  * and starts a new one.
 607  *
 608  * Since the stat_thread holds an open handle to the accounting file, it will
 609  * read all remaining entries from the old file before switching to
 610  * read the new one.
 611  */
 612 int
 613 zsd_roll_exacct(void)
 614 {
 615         int ret;
 616         char *path = ZONESTAT_EXACCT_FILE;
 617         char oldfile[MAXPATHLEN];
 618 
 619         /* If accounting file is externally configured, leave it alone */
 620         ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 621         if (ret == 0 && strcmp(oldfile, path) != 0)
 622                 return (0);
 623 
 624         if (unlink(path) != 0)
 625                 /* Roll it next time */
 626                 return (0);
 627 
 628         if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
 629                 zsd_warn(gettext("Unable to set accounting file"));
 630                 return (-1);
 631         }
 632         return (0);
 633 }
 634 
 635 /* Contract stuff for zone_enter() */
 636 int
 637 init_template(void)
 638 {
 639         int fd;
 640         int err = 0;
 641 
 642         fd = open64(CTFS_ROOT "/process/template", O_RDWR);
 643         if (fd == -1)
 644                 return (-1);
 645 
 646         /*
 647          * For now, zoneadmd doesn't do anything with the contract.
 648          * Deliver no events, don't inherit, and allow it to be orphaned.
 649          */
 650         err |= ct_tmpl_set_critical(fd, 0);
 651         err |= ct_tmpl_set_informative(fd, 0);
 652         err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
 653         err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
 654         if (err || ct_tmpl_activate(fd)) {
 655                 (void) close(fd);
 656                 return (-1);
 657         }
 658 
 659         return (fd);
 660 }
 661 
 662 /*
 663  * Contract stuff for zone_enter()
 664  */
 665 int
 666 contract_latest(ctid_t *id)
 667 {
 668         int cfd, r;
 669         ct_stathdl_t st;
 670         ctid_t result;
 671 
 672         if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
 673                 return (errno);
 674 
 675         if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
 676                 (void) close(cfd);
 677                 return (r);
 678         }
 679 
 680         result = ct_status_get_id(st);
 681         ct_status_free(st);
 682         (void) close(cfd);
 683 
 684         *id = result;
 685         return (0);
 686 }
 687 
 688 static int
 689 close_on_exec(int fd)
 690 {
 691         int flags = fcntl(fd, F_GETFD, 0);
 692         if ((flags != -1) && (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1))
 693                 return (0);
 694         return (-1);
 695 }
 696 
 697 int
 698 contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
 699 {
 700         char path[PATH_MAX];
 701         int n, fd;
 702 
 703         if (type == NULL)
 704                 type = "all";
 705 
 706         n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
 707         if (n >= sizeof (path)) {
 708                 errno = ENAMETOOLONG;
 709                 return (-1);
 710         }
 711 
 712         fd = open64(path, oflag);
 713         if (fd != -1) {
 714                 if (close_on_exec(fd) == -1) {
 715                         int err = errno;
 716                         (void) close(fd);
 717                         errno = err;
 718                         return (-1);
 719                 }
 720         }
 721         return (fd);
 722 }
 723 
 724 int
 725 contract_abandon_id(ctid_t ctid)
 726 {
 727         int fd, err;
 728 
 729         fd = contract_open(ctid, "all", "ctl", O_WRONLY);
 730         if (fd == -1)
 731                 return (errno);
 732 
 733         err = ct_ctl_abandon(fd);
 734         (void) close(fd);
 735 
 736         return (err);
 737 }
 738 /*
 739  * Attach the zsd_server to a zone.  Called for each zone when zonestatd
 740  * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
 741  *
 742  * Zone_enter is used to avoid reaching into zone to fattach door.
 743  */
 744 static void
 745 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
 746 {
 747         char *path = ZS_DOOR_PATH;
 748         int fd, pid, stat, tmpl_fd;
 749         ctid_t ct;
 750 
 751         if ((tmpl_fd = init_template()) == -1) {
 752                 zsd_warn("Unable to init template");
 753                 return;
 754         }
 755 
 756         pid = forkx(0);
 757         if (pid < 0) {
 758                 (void) ct_tmpl_clear(tmpl_fd);
 759                 zsd_warn(gettext(
 760                     "Unable to fork to add zonestat to zoneid %d\n"), zid);
 761                 return;
 762         }
 763 
 764         if (pid == 0) {
 765                 (void) ct_tmpl_clear(tmpl_fd);
 766                 (void) close(tmpl_fd);
 767                 if (zid != 0 && zone_enter(zid) != 0) {
 768                         if (errno == EINVAL) {
 769                                 _exit(0);
 770                         }
 771                         _exit(1);
 772                 }
 773                 (void) fdetach(path);
 774                 (void) unlink(path);
 775                 if (detach_only)
 776                         _exit(0);
 777                 fd = open(path, O_CREAT|O_RDWR, 0644);
 778                 if (fd < 0)
 779                         _exit(2);
 780                 if (fattach(door, path) != 0)
 781                         _exit(3);
 782                 _exit(0);
 783         }
 784         if (contract_latest(&ct) == -1)
 785                 ct = -1;
 786         (void) ct_tmpl_clear(tmpl_fd);
 787         (void) close(tmpl_fd);
 788         (void) contract_abandon_id(ct);
 789         while (waitpid(pid, &stat, 0) != pid)
 790                 ;
 791         if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
 792                 return;
 793 
 794         zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
 795 
 796         if (WEXITSTATUS(stat) == 1)
 797                 zsd_warn(gettext("Cannot entering zone"));
 798         else if (WEXITSTATUS(stat) == 2)
 799                 zsd_warn(gettext("Unable to create door file: %s"), path);
 800         else if (WEXITSTATUS(stat) == 3)
 801                 zsd_warn(gettext("Unable to fattach file: %s"), path);
 802 
 803         zsd_warn(gettext("Internal error entering zone: %d"), zid);
 804 }
 805 
 806 /*
 807  * Zone lookup and allocation functions to manage list of currently running
 808  * zones.
 809  */
 810 static zsd_zone_t *
 811 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 812 {
 813         zsd_zone_t *zone;
 814 
 815         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 816             zone = list_next(&ctl->zsctl_zones, zone)) {
 817                 if (strcmp(zone->zsz_name, zonename) == 0) {
 818                         if (zoneid != -1)
 819                                 zone->zsz_id = zoneid;
 820                         return (zone);
 821                 }
 822         }
 823         return (NULL);
 824 }
 825 
 826 static zsd_zone_t *
 827 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
 828 {
 829         zsd_zone_t *zone;
 830 
 831         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 832             zone = list_next(&ctl->zsctl_zones, zone)) {
 833                 if (zone->zsz_id == zoneid)
 834                         return (zone);
 835         }
 836         return (NULL);
 837 }
 838 
 839 static zsd_zone_t *
 840 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 841 {
 842         zsd_zone_t *zone;
 843 
 844         if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
 845                 return (NULL);
 846 
 847         (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
 848         zone->zsz_id = zoneid;
 849         zone->zsz_found = B_FALSE;
 850 
 851         /*
 852          * Allocate as deleted so if not found in first pass, zone is deleted
 853          * from list.  This can happen if zone is returned by zone_list, but
 854          * exits before first attempt to fetch zone details.
 855          */
 856         zone->zsz_start = g_now;
 857         zone->zsz_hrstart = g_hrnow;
 858         zone->zsz_deleted = B_TRUE;
 859 
 860         zone->zsz_cpu_shares = ZS_LIMIT_NONE;
 861         zone->zsz_cpu_cap = ZS_LIMIT_NONE;
 862         zone->zsz_ram_cap = ZS_LIMIT_NONE;
 863         zone->zsz_locked_cap = ZS_LIMIT_NONE;
 864         zone->zsz_vm_cap = ZS_LIMIT_NONE;
 865 
 866         zone->zsz_processes_cap = ZS_LIMIT_NONE;
 867         zone->zsz_lwps_cap = ZS_LIMIT_NONE;
 868         zone->zsz_shm_cap = ZS_LIMIT_NONE;
 869         zone->zsz_shmids_cap = ZS_LIMIT_NONE;
 870         zone->zsz_semids_cap = ZS_LIMIT_NONE;
 871         zone->zsz_msgids_cap = ZS_LIMIT_NONE;
 872         zone->zsz_lofi_cap = ZS_LIMIT_NONE;
 873 
 874         ctl->zsctl_nzones++;
 875 
 876         return (zone);
 877 }
 878 
 879 static zsd_zone_t *
 880 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 881 {
 882         zsd_zone_t *zone, *tmp;
 883 
 884         if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
 885                 return (zone);
 886 
 887         if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
 888                 return (NULL);
 889 
 890         /* Insert sorted by zonename */
 891         tmp = list_head(&ctl->zsctl_zones);
 892         while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
 893                 tmp = list_next(&ctl->zsctl_zones, tmp);
 894 
 895         list_insert_before(&ctl->zsctl_zones, tmp, zone);
 896         return (zone);
 897 }
 898 
 899 /*
 900  * Mark all zones as not existing.  As zones are found, they will
 901  * be marked as existing.  If a zone is not found, then it must have
 902  * halted.
 903  */
 904 static void
 905 zsd_mark_zones_start(zsd_ctl_t *ctl)
 906 {
 907 
 908         zsd_zone_t *zone;
 909 
 910         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 911             zone = list_next(&ctl->zsctl_zones, zone)) {
 912                 zone->zsz_found = B_FALSE;
 913         }
 914 }
 915 
 916 /*
 917  * Mark each zone as not using pset.  If processes are found using the
 918  * pset, the zone will remain bound to the pset.  If none of a zones
 919  * processes are bound to the pset, the zone's usage of the pset will
 920  * be deleted.
 921  *
 922  */
 923 static void
 924 zsd_mark_pset_usage_start(zsd_pset_t *pset)
 925 {
 926         zsd_pset_usage_t *usage;
 927 
 928         for (usage = list_head(&pset->zsp_usage_list);
 929             usage != NULL;
 930             usage = list_next(&pset->zsp_usage_list, usage)) {
 931                 usage->zsu_found = B_FALSE;
 932                 usage->zsu_empty = B_TRUE;
 933         }
 934 }
 935 
 936 /*
 937  * Mark each pset as not existing.  If a pset is found, it will be marked
 938  * as existing.  If a pset is not found, it wil be deleted.
 939  */
 940 static void
 941 zsd_mark_psets_start(zsd_ctl_t *ctl)
 942 {
 943         zsd_pset_t *pset;
 944 
 945         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
 946             pset = list_next(&ctl->zsctl_psets, pset)) {
 947                 pset->zsp_found = B_FALSE;
 948                 zsd_mark_pset_usage_start(pset);
 949         }
 950 }
 951 
 952 /*
 953  * A pset was found.  Update its information
 954  */
 955 static void
 956 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
 957     uint64_t size, uint64_t min, uint64_t max, int64_t importance)
 958 {
 959         pset->zsp_empty = B_TRUE;
 960         pset->zsp_deleted = B_FALSE;
 961 
 962         assert(pset->zsp_found == B_FALSE);
 963 
 964         /* update pset flags */
 965         if (pset->zsp_active == B_FALSE)
 966                 /* pset not seen on previous interval.  It is new. */
 967                 pset->zsp_new = B_TRUE;
 968         else
 969                 pset->zsp_new = B_FALSE;
 970 
 971         pset->zsp_found = B_TRUE;
 972         pset->zsp_cputype = type;
 973         pset->zsp_online = online;
 974         pset->zsp_size = size;
 975         pset->zsp_min = min;
 976         pset->zsp_max = max;
 977         pset->zsp_importance = importance;
 978         pset->zsp_cpu_shares = 0;
 979         pset->zsp_scheds = 0;
 980         pset->zsp_active = B_TRUE;
 981 }
 982 
 983 /*
 984  * A zone's process was found using a pset. Charge the process to the pset and
 985  * the per-zone data for the pset.
 986  */
 987 static void
 988 zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
 989 {
 990         zsd_zone_t *zone = usage->zsu_zone;
 991         zsd_pset_t *pset = usage->zsu_pset;
 992 
 993         /* Nothing to do if already found */
 994         if (usage->zsu_found == B_TRUE)
 995                 goto add_stats;
 996 
 997         usage->zsu_found = B_TRUE;
 998         usage->zsu_empty = B_FALSE;
 999 
1000         usage->zsu_deleted = B_FALSE;
1001         /* update usage flags */
1002         if (usage->zsu_active == B_FALSE)
1003                 usage->zsu_new = B_TRUE;
1004         else
1005                 usage->zsu_new = B_FALSE;
1006 
1007         usage->zsu_scheds = 0;
1008         usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1009         usage->zsu_active = B_TRUE;
1010         pset->zsp_empty = B_FALSE;
1011         zone->zsz_empty = B_FALSE;
1012 
1013 add_stats:
1014         /* Detect zone's pset id, and if it is bound to multiple psets */
1015         if (zone->zsz_psetid == ZS_PSET_ERROR)
1016                 zone->zsz_psetid = pset->zsp_id;
1017         else if (zone->zsz_psetid != pset->zsp_id)
1018                 zone->zsz_psetid = ZS_PSET_MULTI;
1019 
1020         usage->zsu_scheds |= sched;
1021         pset->zsp_scheds |= sched;
1022         zone->zsz_scheds |= sched;
1023 
1024         /* Record if FSS is co-habitating with conflicting scheduler */
1025         if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1026             usage->zsu_scheds & (
1027             ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1028                 usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1029 
1030                 pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1031         }
1032 
1033 }
1034 
1035 /* Add cpu time for a process to a pset, zone, and system totals */
1036 static void
1037 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1038 {
1039         zsd_system_t *system = ctl->zsctl_system;
1040         zsd_zone_t *zone = usage->zsu_zone;
1041         zsd_pset_t *pset = usage->zsu_pset;
1042 
1043         TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1044         TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1045         TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1046         TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1047 }
1048 
1049 /* Determine which processor sets have been deleted */
1050 static void
1051 zsd_mark_psets_end(zsd_ctl_t *ctl)
1052 {
1053         zsd_pset_t *pset, *tmp;
1054 
1055         /*
1056          * Mark pset as not exists, and deleted if it existed
1057          * previous interval.
1058          */
1059         pset = list_head(&ctl->zsctl_psets);
1060         while (pset != NULL) {
1061                 if (pset->zsp_found == B_FALSE) {
1062                         pset->zsp_empty = B_TRUE;
1063                         if (pset->zsp_deleted == B_TRUE) {
1064                                 tmp = pset;
1065                                 pset = list_next(&ctl->zsctl_psets, pset);
1066                                 list_remove(&ctl->zsctl_psets, tmp);
1067                                 free(tmp);
1068                                 ctl->zsctl_npsets--;
1069                                 continue;
1070                         } else {
1071                                 /* Pset vanished during this interval */
1072                                 pset->zsp_new = B_FALSE;
1073                                 pset->zsp_deleted = B_TRUE;
1074                                 pset->zsp_active = B_TRUE;
1075                         }
1076                 }
1077                 pset = list_next(&ctl->zsctl_psets, pset);
1078         }
1079 }
1080 
1081 /* Determine which zones are no longer bound to processor sets */
1082 static void
1083 zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1084 {
1085         zsd_pset_t *pset;
1086         zsd_zone_t *zone;
1087         zsd_pset_usage_t *usage, *tmp;
1088 
1089         /*
1090          * Mark pset as not exists, and deleted if it existed previous
1091          * interval.
1092          */
1093         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1094             pset = list_next(&ctl->zsctl_psets, pset)) {
1095                 usage = list_head(&pset->zsp_usage_list);
1096                 while (usage != NULL) {
1097                         /*
1098                          * Mark pset as not exists, and deleted if it existed
1099                          * previous interval.
1100                          */
1101                         if (usage->zsu_found == B_FALSE ||
1102                             usage->zsu_zone->zsz_deleted == B_TRUE ||
1103                             usage->zsu_pset->zsp_deleted == B_TRUE) {
1104                                 tmp = usage;
1105                                 usage = list_next(&pset->zsp_usage_list,
1106                                     usage);
1107                                 list_remove(&pset->zsp_usage_list, tmp);
1108                                 free(tmp);
1109                                 pset->zsp_nusage--;
1110                                 ctl->zsctl_npset_usages--;
1111                                 continue;
1112                         } else {
1113                                 usage->zsu_new = B_FALSE;
1114                                 usage->zsu_deleted = B_TRUE;
1115                                 usage->zsu_active = B_TRUE;
1116                         }
1117                         /* Add cpu shares for usages that are in FSS */
1118                         zone = usage->zsu_zone;
1119                         if (usage->zsu_scheds & ZS_SCHED_FSS &&
1120                             zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1121                             zone->zsz_cpu_shares != 0) {
1122                                 zone = usage->zsu_zone;
1123                                 usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1124                                 pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1125                         }
1126                         usage = list_next(&pset->zsp_usage_list,
1127                             usage);
1128                 }
1129         }
1130 }
1131 
1132 /* A zone has been found.  Update its information */
1133 static void
1134 zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1135     uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1136     uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1137     uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1138     uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1139     uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1140     uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1141     uint_t iptype)
1142 {
1143         zsd_system_t *sys = ctl->zsctl_system;
1144 
1145         assert(zone->zsz_found == B_FALSE);
1146 
1147         /*
1148          * Mark zone as exists, and new if it did not exist in previous
1149          * interval.
1150          */
1151         zone->zsz_found = B_TRUE;
1152         zone->zsz_empty = B_TRUE;
1153         zone->zsz_deleted = B_FALSE;
1154 
1155         /*
1156          * Zone is new.  Assume zone's properties are the same over entire
1157          * interval.
1158          */
1159         if (zone->zsz_active == B_FALSE)
1160                 zone->zsz_new = B_TRUE;
1161         else
1162                 zone->zsz_new = B_FALSE;
1163 
1164         (void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1165         (void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1166         zone->zsz_default_sched = sched;
1167 
1168         /* Schedulers updated later as processes are found */
1169         zone->zsz_scheds = 0;
1170 
1171         /* Cpus updated later as psets bound are identified */
1172         zone->zsz_cpus_online = 0;
1173 
1174         zone->zsz_cputype = cputype;
1175         zone->zsz_iptype = iptype;
1176         zone->zsz_psetid = ZS_PSET_ERROR;
1177         zone->zsz_cpu_cap = cpu_cap;
1178         zone->zsz_cpu_shares = cpu_shares;
1179         zone->zsz_ram_cap = ram_cap;
1180         zone->zsz_locked_cap = locked_cap;
1181         zone->zsz_vm_cap = vm_cap;
1182         zone->zsz_processes_cap = processes_cap;
1183         zone->zsz_processes = processes;
1184         zone->zsz_lwps_cap = lwps_cap;
1185         zone->zsz_lwps = lwps;
1186         zone->zsz_shm_cap = shm_cap;
1187         zone->zsz_shm = shm;
1188         zone->zsz_shmids_cap = shmids_cap;
1189         zone->zsz_shmids = shmids;
1190         zone->zsz_semids_cap = semids_cap;
1191         zone->zsz_semids = semids;
1192         zone->zsz_msgids_cap = msgids_cap;
1193         zone->zsz_msgids = msgids;
1194         zone->zsz_lofi_cap = lofi_cap;
1195         zone->zsz_lofi = lofi;
1196 
1197         sys->zss_processes += processes;
1198         sys->zss_lwps += lwps;
1199         sys->zss_shm += shm;
1200         sys->zss_shmids += shmids;
1201         sys->zss_semids += semids;
1202         sys->zss_msgids += msgids;
1203         sys->zss_lofi += lofi;
1204         zone->zsz_active = B_TRUE;
1205 }
1206 
1207 
1208 /* Determine which zones have halted */
1209 static void
1210 zsd_mark_zones_end(zsd_ctl_t *ctl)
1211 {
1212         zsd_zone_t *zone, *tmp;
1213 
1214         /*
1215          * Mark zone as not existing, or delete if it did not exist in
1216          * previous interval.
1217          */
1218         zone = list_head(&ctl->zsctl_zones);
1219         while (zone != NULL) {
1220                 if (zone->zsz_found == B_FALSE) {
1221                         zone->zsz_empty = B_TRUE;
1222                         if (zone->zsz_deleted == B_TRUE) {
1223                                 /*
1224                                  * Zone deleted in prior interval,
1225                                  * so it no longer exists.
1226                                  */
1227                                 tmp = zone;
1228                                 zone = list_next(&ctl->zsctl_zones, zone);
1229                                 list_remove(&ctl->zsctl_zones, tmp);
1230                                 free(tmp);
1231                                 ctl->zsctl_nzones--;
1232                                 continue;
1233                         } else {
1234                                 zone->zsz_new = B_FALSE;
1235                                 zone->zsz_deleted = B_TRUE;
1236                                 zone->zsz_active = B_TRUE;
1237                         }
1238                 }
1239                 zone = list_next(&ctl->zsctl_zones, zone);
1240         }
1241 }
1242 
1243 /*
1244  * Mark cpus as not existing.  If a cpu is found, it will be updated.  If
1245  * a cpu is not found, then it must have gone offline, so it will be
1246  * deleted.
1247  *
1248  * The kstat tracking data is rolled so that the usage since the previous
1249  * interval can be determined.
1250  */
1251 static void
1252 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1253 {
1254         zsd_cpu_t *cpu;
1255 
1256         /*
1257          * Mark all cpus as not existing.  As cpus are found, they will
1258          * be marked as existing.
1259          */
1260         for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1261             cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1262                 cpu->zsc_found = B_FALSE;
1263                 if (cpu->zsc_active == B_TRUE && roll) {
1264                         cpu->zsc_psetid_prev = cpu->zsc_psetid;
1265                         cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1266                         cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1267                         cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1268                         cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1269                 }
1270         }
1271 }
1272 
1273 /*
1274  * An array the size of the maximum number of cpus is kept.  Within this array
1275  * a list of the online cpus is maintained.
1276  */
1277 zsd_cpu_t *
1278 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1279 {
1280         zsd_cpu_t *cpu;
1281 
1282         assert(cpuid < ctl->zsctl_maxcpuid);
1283         cpu = &(ctl->zsctl_cpu_array[cpuid]);
1284         assert(cpuid == cpu->zsc_id);
1285 
1286         if (cpu->zsc_allocated == B_FALSE) {
1287                 cpu->zsc_allocated = B_TRUE;
1288                 list_insert_tail(&ctl->zsctl_cpus, cpu);
1289         }
1290         return (cpu);
1291 }
1292 
1293 /* A cpu has been found.  Update its information */
1294 static void
1295 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1296 {
1297         /*
1298          * legacy processor sets, the cpu may move while zonestatd is
1299          * inspecting, causing it to be found twice.  In this case, just
1300          * leave cpu in the first processor set in which it was found.
1301          */
1302         if (cpu->zsc_found == B_TRUE)
1303                 return;
1304 
1305         /* Mark cpu as online */
1306         cpu->zsc_found = B_TRUE;
1307         cpu->zsc_offlined = B_FALSE;
1308         cpu->zsc_pset = pset;
1309         /*
1310          * cpu is newly online.
1311          */
1312         if (cpu->zsc_active == B_FALSE) {
1313                 /*
1314                  * Cpu is newly online.
1315                  */
1316                 cpu->zsc_onlined = B_TRUE;
1317                 cpu->zsc_psetid = psetid;
1318                 cpu->zsc_psetid_prev = psetid;
1319         } else {
1320                 /*
1321                  * cpu online during previous interval.  Save properties at
1322                  * start of interval
1323                  */
1324                 cpu->zsc_onlined = B_FALSE;
1325                 cpu->zsc_psetid = psetid;
1326 
1327         }
1328         cpu->zsc_active = B_TRUE;
1329 }
1330 
1331 /* Remove all offlined cpus from the list of tracked cpus */
1332 static void
1333 zsd_mark_cpus_end(zsd_ctl_t *ctl)
1334 {
1335         zsd_cpu_t *cpu, *tmp;
1336         int id;
1337 
1338         /* Mark cpu as online or offline */
1339         cpu = list_head(&ctl->zsctl_cpus);
1340         while (cpu != NULL) {
1341                 if (cpu->zsc_found == B_FALSE) {
1342                         if (cpu->zsc_offlined == B_TRUE) {
1343                                 /*
1344                                  * cpu offlined in prior interval. It is gone.
1345                                  */
1346                                 tmp = cpu;
1347                                 cpu = list_next(&ctl->zsctl_cpus, cpu);
1348                                 list_remove(&ctl->zsctl_cpus, tmp);
1349                                 /* Clear structure for future use */
1350                                 id = tmp->zsc_id;
1351                                 bzero(tmp, sizeof (zsd_cpu_t));
1352                                 tmp->zsc_id = id;
1353                                 tmp->zsc_allocated = B_FALSE;
1354                                 tmp->zsc_psetid = ZS_PSET_ERROR;
1355                                 tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1356 
1357                         } else {
1358                                 /*
1359                                  * cpu online at start of interval.  Treat
1360                                  * as still online, since it was online for
1361                                  * some portion of the interval.
1362                                  */
1363                                 cpu->zsc_offlined = B_TRUE;
1364                                 cpu->zsc_onlined = B_FALSE;
1365                                 cpu->zsc_active = B_TRUE;
1366                                 cpu->zsc_psetid = cpu->zsc_psetid_prev;
1367                                 cpu->zsc_pset = NULL;
1368                         }
1369                 }
1370                 cpu = list_next(&ctl->zsctl_cpus, cpu);
1371         }
1372 }
1373 
1374 /* Some utility functions for managing the list of processor sets */
1375 static zsd_pset_t *
1376 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1377 {
1378         zsd_pset_t *pset;
1379 
1380         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1381             pset = list_next(&ctl->zsctl_psets, pset)) {
1382                 if (pset->zsp_id == psetid)
1383                         return (pset);
1384         }
1385         return (NULL);
1386 }
1387 
1388 static zsd_pset_t *
1389 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1390 {
1391         zsd_pset_t *pset;
1392 
1393         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1394             pset = list_next(&ctl->zsctl_psets, pset)) {
1395                 if (strcmp(pset->zsp_name, psetname) == 0) {
1396                         if (psetid != -1)
1397                                 pset->zsp_id = psetid;
1398                         return (pset);
1399                 }
1400         }
1401         return (NULL);
1402 }
1403 
1404 static zsd_pset_t *
1405 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1406 {
1407         zsd_pset_t *pset;
1408 
1409         if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1410                 return (NULL);
1411 
1412         (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1413         pset->zsp_id = psetid;
1414         pset->zsp_found = B_FALSE;
1415         /*
1416          * Allocate as deleted so if not found in first pass, pset is deleted
1417          * from list.  This can happen if pset is returned by pset_list, but
1418          * is destroyed before first attempt to fetch pset details.
1419          */
1420         list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1421             offsetof(zsd_pset_usage_t, zsu_next));
1422 
1423         pset->zsp_hrstart = g_hrnow;
1424         pset->zsp_deleted = B_TRUE;
1425         pset->zsp_empty = B_TRUE;
1426         ctl->zsctl_npsets++;
1427 
1428         return (pset);
1429 }
1430 
1431 static zsd_pset_t *
1432 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1433 {
1434         zsd_pset_t *pset, *tmp;
1435 
1436         if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1437                 return (pset);
1438 
1439         if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1440                 return (NULL);
1441 
1442         /* Insert sorted by psetname */
1443         tmp = list_head(&ctl->zsctl_psets);
1444         while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1445                 tmp = list_next(&ctl->zsctl_psets, tmp);
1446 
1447         list_insert_before(&ctl->zsctl_psets, tmp, pset);
1448         return (pset);
1449 }
1450 
1451 /* Some utility functions for managing the list of zones using each pset */
1452 static zsd_pset_usage_t *
1453 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1454 {
1455         zsd_pset_usage_t *usage;
1456 
1457         for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1458             usage = list_next(&pset->zsp_usage_list, usage))
1459                 if (usage->zsu_zone == zone)
1460                         return (usage);
1461 
1462         return (NULL);
1463 }
1464 
1465 static zsd_pset_usage_t *
1466 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1467 {
1468         zsd_pset_usage_t *usage;
1469 
1470         if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1471             == NULL)
1472                 return (NULL);
1473 
1474         list_link_init(&usage->zsu_next);
1475         usage->zsu_zone = zone;
1476         usage->zsu_zoneid = zone->zsz_id;
1477         usage->zsu_pset = pset;
1478         usage->zsu_found = B_FALSE;
1479         usage->zsu_active = B_FALSE;
1480         usage->zsu_new = B_FALSE;
1481         /*
1482          * Allocate as not deleted.  If a process is found in a pset for
1483          * a zone, the usage will not be deleted until at least the next
1484          * interval.
1485          */
1486         usage->zsu_start = g_now;
1487         usage->zsu_hrstart = g_hrnow;
1488         usage->zsu_deleted = B_FALSE;
1489         usage->zsu_empty = B_TRUE;
1490         usage->zsu_scheds = 0;
1491         usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1492 
1493         ctl->zsctl_npset_usages++;
1494         pset->zsp_nusage++;
1495 
1496         return (usage);
1497 }
1498 
1499 static zsd_pset_usage_t *
1500 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1501 {
1502         zsd_pset_usage_t *usage, *tmp;
1503 
1504         if ((usage = zsd_lookup_usage(pset, zone))
1505             != NULL)
1506                 return (usage);
1507 
1508         if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1509                 return (NULL);
1510 
1511         tmp = list_head(&pset->zsp_usage_list);
1512         while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1513             > 0)
1514                 tmp = list_next(&pset->zsp_usage_list, tmp);
1515 
1516         list_insert_before(&pset->zsp_usage_list, tmp, usage);
1517         return (usage);
1518 }
1519 
1520 static void
1521 zsd_refresh_system(zsd_ctl_t *ctl)
1522 {
1523         zsd_system_t *system = ctl->zsctl_system;
1524 
1525         /* Re-count these values each interval */
1526         system->zss_processes = 0;
1527         system->zss_lwps = 0;
1528         system->zss_shm = 0;
1529         system->zss_shmids = 0;
1530         system->zss_semids = 0;
1531         system->zss_msgids = 0;
1532         system->zss_lofi = 0;
1533 }
1534 
1535 
1536 /* Reads each cpu's kstats, and adds the usage to the cpu's pset */
1537 static void
1538 zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
1539 {
1540         zsd_system_t *sys;
1541         processorid_t cpuid;
1542         zsd_pset_t *pset_prev;
1543         zsd_pset_t *pset;
1544         kstat_t *kstat;
1545         kstat_named_t *knp;
1546         kid_t kid;
1547         uint64_t idle, intr, kern, user;
1548 
1549         sys = ctl->zsctl_system;
1550         pset = cpu->zsc_pset;
1551         knp = NULL;
1552         kid = -1;
1553         cpuid = cpu->zsc_id;
1554 
1555         /* Get the cpu time totals for this cpu */
1556         kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
1557         if (kstat == NULL)
1558                 return;
1559 
1560         kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
1561         if (kid == -1)
1562                 return;
1563 
1564         knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
1565         if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1566                 return;
1567 
1568         idle = knp->value.ui64;
1569 
1570         knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
1571         if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1572                 return;
1573 
1574         kern = knp->value.ui64;
1575 
1576         knp = kstat_data_lookup(kstat, "cpu_nsec_user");
1577         if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1578                 return;
1579 
1580         user = knp->value.ui64;
1581 
1582         /*
1583          * Tracking intr time per cpu just exists for future enhancements.
1584          * The value is presently always zero.
1585          */
1586         intr = 0;
1587         cpu->zsc_nsec_idle = idle;
1588         cpu->zsc_nsec_intr = intr;
1589         cpu->zsc_nsec_kern = kern;
1590         cpu->zsc_nsec_user = user;
1591 
1592         if (cpu->zsc_onlined == B_TRUE) {
1593                 /*
1594                  * cpu is newly online.  There is no reference value,
1595                  * so just record its current stats for comparison
1596                  * on next stat read.
1597                  */
1598                 cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1599                 cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1600                 cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1601                 cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1602                 return;
1603         }
1604 
1605         /*
1606          * Calculate relative time since previous refresh.
1607          * Paranoia.  Don't let time  go backwards.
1608          */
1609         idle = intr = kern = user = 0;
1610         if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
1611                 idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;
1612 
1613         if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
1614                 intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;
1615 
1616         if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
1617                 kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;
1618 
1619         if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
1620                 user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;
1621 
1622         /* Update totals for cpu usage */
1623         TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
1624         TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
1625         TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
1626         TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);
1627 
1628         /*
1629          * Add cpu's stats to its pset if it is known to be in
1630          * the pset since previous read.
1631          */
1632         if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
1633             cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
1634             (pset_prev = zsd_lookup_pset_byid(ctl,
1635             cpu->zsc_psetid_prev)) == NULL) {
1636                 TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
1637                 TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
1638                 TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
1639                 TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
1640         } else {
1641                 /*
1642                  * Last pset was different than current pset.
1643                  * Best guess is to split usage between the two.
1644                  */
1645                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
1646                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
1647                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
1648                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);
1649 
1650                 TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
1651                     (idle / 2) + (idle % 2));
1652                 TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
1653                     (intr / 2) + (intr % 2));
1654                 TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
1655                     (kern / 2) + (kern % 2));
1656                 TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
1657                     (user / 2) + (user % 2));
1658         }
1659         TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
1660         TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
1661         TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
1662         TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
1663 }
1664 
1665 /* Determine the details of a processor set by pset_id */
1666 static int
1667 zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1668     size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1669     uint64_t *min, uint64_t *max, int64_t *importance)
1670 {
1671         uint_t old, num;
1672 
1673         pool_conf_t *conf = ctl->zsctl_pool_conf;
1674         pool_value_t **vals = ctl->zsctl_pool_vals;
1675         pool_resource_t **res_list = NULL;
1676         pool_resource_t *pset;
1677         pool_component_t **cpus = NULL;
1678         processorid_t *cache;
1679         const char *string;
1680         uint64_t uint64;
1681         int64_t int64;
1682         int i, ret, type;
1683 
1684         if (ctl->zsctl_pool_status == POOL_DISABLED) {
1685 
1686                 /*
1687                  * Inspect legacy psets
1688                  */
1689                 for (;;) {
1690                         old = num = ctl->zsctl_cpu_ncache;
1691                         ret = pset_info(psetid, &type, &num,
1692                             ctl->zsctl_cpu_cache);
1693                         if (ret < 0) {
1694                                 /* pset is gone.  Tell caller to retry */
1695                                 errno = EINTR;
1696                                 return (-1);
1697                         }
1698                         if (num <= old) {
1699                         /* Success */
1700                                 break;
1701                         }
1702                         if ((cache = (processorid_t *)realloc(
1703                             ctl->zsctl_cpu_cache, num *
1704                             sizeof (processorid_t))) != NULL) {
1705                                 ctl->zsctl_cpu_ncache = num;
1706                                 ctl->zsctl_cpu_cache = cache;
1707                         } else {
1708                                 /*
1709                                  * Could not allocate to get new cpu list.
1710                                  */
1711                                 zsd_warn(gettext(
1712                                     "Could not allocate for cpu list"));
1713                                 errno = ENOMEM;
1714                                 return (-1);
1715                         }
1716                 }
1717                 /*
1718                  * Old school pset.  Just make min and max equal
1719                  * to its size
1720                  */
1721                 if (psetid == ZS_PSET_DEFAULT) {
1722                         *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1723                         (void) strlcpy(psetname, "pset_default", namelen);
1724                 } else {
1725                         *cputype = ZS_CPUTYPE_PSRSET_PSET;
1726                         (void) snprintf(psetname, namelen,
1727                             "SUNWlegacy_pset_%d", psetid);
1728                 }
1729 
1730                 /*
1731                  * Just treat legacy pset as a simple pool pset
1732                  */
1733                 *online = num;
1734                 *size = num;
1735                 *min = num;
1736                 *max = num;
1737                 *importance = 1;
1738 
1739                 return (0);
1740         }
1741 
1742         /* Look up the pool pset using the pset id */
1743         res_list = NULL;
1744         pool_value_set_int64(vals[1], psetid);
1745         if (pool_value_set_name(vals[1], "pset.sys_id")
1746             != PO_SUCCESS)
1747                 goto err;
1748 
1749         if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1750                 goto err;
1751         if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1752                 goto err;
1753         if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1754                 goto err;
1755         if (num != 1)
1756                 goto err;
1757         pset = res_list[0];
1758         free(res_list);
1759         res_list = NULL;
1760         if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1761             "pset.name", vals[0]) != POC_STRING ||
1762             pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1763                 goto err;
1764 
1765         (void) strlcpy(psetname, string, namelen);
1766         if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1767                 *cputype = ZS_CPUTYPE_DEDICATED;
1768         else if (psetid == ZS_PSET_DEFAULT)
1769                 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1770         else
1771                 *cputype = ZS_CPUTYPE_POOL_PSET;
1772 
1773         /* Get size, min, max, and importance */
1774         if (pool_get_property(conf, pool_resource_to_elem(conf,
1775             pset), "pset.size", vals[0]) == POC_UINT &&
1776             pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1777                 *size = uint64;
1778         else
1779                 *size = 0;
1780 
1781                 /* Get size, min, max, and importance */
1782         if (pool_get_property(conf, pool_resource_to_elem(conf,
1783             pset), "pset.min", vals[0]) == POC_UINT &&
1784             pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1785                 *min = uint64;
1786         else
1787                 *min = 0;
1788         if (*min >= ZSD_PSET_UNLIMITED)
1789                 *min = ZS_LIMIT_NONE;
1790 
1791         if (pool_get_property(conf, pool_resource_to_elem(conf,
1792             pset), "pset.max", vals[0]) == POC_UINT &&
1793             pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1794                 *max = uint64;
1795         else
1796                 *max = ZS_LIMIT_NONE;
1797 
1798         if (*max >= ZSD_PSET_UNLIMITED)
1799                 *max = ZS_LIMIT_NONE;
1800 
1801         if (pool_get_property(conf, pool_resource_to_elem(conf,
1802             pset), "pset.importance", vals[0]) == POC_INT &&
1803             pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1804                 *importance = int64;
1805         else
1806                 *importance = (uint64_t)1;
1807 
1808         *online = 0;
1809         if (*size == 0)
1810                 return (0);
1811 
1812         /* get cpus */
1813         cpus = pool_query_resource_components(conf, pset, &num, NULL);
1814         if (cpus == NULL)
1815                 goto err;
1816 
1817         /* Make sure there is space for cpu id list */
1818         if (num > ctl->zsctl_cpu_ncache) {
1819                 if ((cache = (processorid_t *)realloc(
1820                     ctl->zsctl_cpu_cache, num *
1821                     sizeof (processorid_t))) != NULL) {
1822                         ctl->zsctl_cpu_ncache = num;
1823                         ctl->zsctl_cpu_cache = cache;
1824                 } else {
1825                         /*
1826                          * Could not allocate to get new cpu list.
1827                          */
1828                         zsd_warn(gettext(
1829                             "Could not allocate for cpu list"));
1830                         goto err;
1831                 }
1832         }
1833 
1834         /* count the online cpus */
1835         for (i = 0; i < num; i++) {
1836                 if (pool_get_property(conf, pool_component_to_elem(
1837                     conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1838                     pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1839                         goto err;
1840 
1841                 if (strcmp(string, "on-line") != 0 &&
1842                     strcmp(string, "no-intr") != 0)
1843                         continue;
1844 
1845                 if (pool_get_property(conf, pool_component_to_elem(
1846                     conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1847                     pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1848                         goto err;
1849 
1850                 (*online)++;
1851                 ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1852         }
1853         free(cpus);
1854         return (0);
1855 err:
1856         if (res_list != NULL)
1857                 free(res_list);
1858         if (cpus != NULL)
1859                 free(cpus);
1860 
1861         /*
1862          * The pools operations should succeed since the conf is a consistent
1863          * snapshot.  Tell caller there is no need to retry.
1864          */
1865         errno = EINVAL;
1866         return (-1);
1867 }
1868 
1869 /*
1870  * Update the current list of processor sets.
1871  * This also updates the list of online cpus, and each cpu's pset membership.
1872  */
1873 static void
1874 zsd_refresh_psets(zsd_ctl_t *ctl)
1875 {
1876         int i, j, ret, state;
1877         uint_t old, num;
1878         uint_t cputype;
1879         int64_t sys_id, importance;
1880         uint64_t online, size, min, max;
1881         zsd_system_t *system;
1882         zsd_pset_t *pset;
1883         zsd_cpu_t *cpu;
1884         psetid_t *cache;
1885         char psetname[ZS_PSETNAME_MAX];
1886         processorid_t cpuid;
1887         pool_value_t *pv_save = NULL;
1888         pool_resource_t **res_list = NULL;
1889         pool_resource_t *res;
1890         pool_value_t **vals;
1891         pool_conf_t *conf;
1892         boolean_t roll_cpus = B_TRUE;
1893 
1894         /* Zero cpu counters to recount them */
1895         system = ctl->zsctl_system;
1896         system->zss_ncpus = 0;
1897         system->zss_ncpus_online = 0;
1898 retry:
1899         ret = pool_get_status(&state);
1900         if (ret == 0 && state == POOL_ENABLED) {
1901 
1902                 conf = ctl->zsctl_pool_conf;
1903                 vals = ctl->zsctl_pool_vals;
1904                 pv_save = vals[1];
1905                 vals[1] = NULL;
1906 
1907                 if (ctl->zsctl_pool_status == POOL_DISABLED) {
1908                         if (pool_conf_open(ctl->zsctl_pool_conf,
1909                             pool_dynamic_location(), PO_RDONLY) == 0) {
1910                                 ctl->zsctl_pool_status = POOL_ENABLED;
1911                                 ctl->zsctl_pool_changed = POU_PSET;
1912                         }
1913                 } else {
1914                         ctl->zsctl_pool_changed = 0;
1915                         ret = pool_conf_update(ctl->zsctl_pool_conf,
1916                             &(ctl->zsctl_pool_changed));
1917                         if (ret < 0) {
1918                                 /* Pools must have become disabled */
1919                                 (void) pool_conf_close(ctl->zsctl_pool_conf);
1920                                 ctl->zsctl_pool_status = POOL_DISABLED;
1921                                 if (pool_error() == POE_SYSTEM && errno ==
1922                                     ENOTACTIVE)
1923                                         goto retry;
1924 
1925                                 zsd_warn(gettext(
1926                                     "Unable to update pool configuration"));
1927                                 /* Not able to get pool info.  Don't update. */
1928                                 goto err;
1929                         }
1930                 }
1931                 /* Get the list of psets using libpool */
1932                 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1933                         goto err;
1934 
1935                 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1936                         goto err;
1937                 if ((res_list = pool_query_resources(conf, &num, vals))
1938                     == NULL)
1939                         goto err;
1940 
1941                 if (num > ctl->zsctl_pset_ncache)  {
1942                         if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1943                             (num) * sizeof (psetid_t))) == NULL) {
1944                                 goto err;
1945                         }
1946                         ctl->zsctl_pset_ncache = num;
1947                         ctl->zsctl_pset_cache = cache;
1948                 }
1949                 /* Save the pset id of each pset */
1950                 for (i = 0; i < num; i++) {
1951                         res = res_list[i];
1952                         if (pool_get_property(conf, pool_resource_to_elem(conf,
1953                             res), "pset.sys_id", vals[0]) != POC_INT ||
1954                             pool_value_get_int64(vals[0], &sys_id)
1955                             != PO_SUCCESS)
1956                                 goto err;
1957                         ctl->zsctl_pset_cache[i] = (int)sys_id;
1958                 }
1959                 vals[1] = pv_save;
1960                 pv_save = NULL;
1961         } else {
1962                 if (ctl->zsctl_pool_status == POOL_ENABLED) {
1963                         (void) pool_conf_close(ctl->zsctl_pool_conf);
1964                         ctl->zsctl_pool_status = POOL_DISABLED;
1965                 }
1966                 /* Get the pset list using legacy psets */
1967                 for (;;) {
1968                         old = num = ctl->zsctl_pset_ncache;
1969                         (void) pset_list(ctl->zsctl_pset_cache, &num);
1970                         if ((num + 1) <= old) {
1971                                 break;
1972                         }
1973                         if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1974                             (num + 1) * sizeof (psetid_t))) != NULL) {
1975                                 ctl->zsctl_pset_ncache = num + 1;
1976                                 ctl->zsctl_pset_cache = cache;
1977                         } else {
1978                                 /*
1979                                  * Could not allocate to get new pset list.
1980                                  * Give up
1981                                  */
1982                                 return;
1983                         }
1984                 }
1985                 /* Add the default pset to list */
1986                 ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
1987                 ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
1988                 num++;
1989         }
1990 psets_changed:
1991         zsd_mark_cpus_start(ctl, roll_cpus);
1992         zsd_mark_psets_start(ctl);
1993         roll_cpus = B_FALSE;
1994 
1995         /* Refresh cpu membership of all psets */
1996         for (i = 0; i < num; i++) {
1997 
1998                 /* Get pool pset information */
1999                 sys_id = ctl->zsctl_pset_cache[i];
2000                 if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
2001                     &cputype, &online, &size, &min, &max, &importance)
2002                     != 0) {
2003                         if (errno == EINTR)
2004                                 goto psets_changed;
2005                         zsd_warn(gettext("Failed to get info for pset %d"),
2006                             sys_id);
2007                         continue;
2008                 }
2009 
2010                 system->zss_ncpus += size;
2011                 system->zss_ncpus_online += online;
2012 
2013                 pset = zsd_lookup_insert_pset(ctl, psetname,
2014                     ctl->zsctl_pset_cache[i]);
2015 
2016                 /* update pset info */
2017                 zsd_mark_pset_found(pset, cputype, online, size, min,
2018                     max, importance);
2019 
2020                 /* update each cpu in pset */
2021                 for (j = 0; j < pset->zsp_online; j++) {
2022                         cpuid = ctl->zsctl_cpu_cache[j];
2023                         cpu = zsd_lookup_insert_cpu(ctl, cpuid);
2024                         zsd_mark_cpu_found(cpu, pset, sys_id);
2025                 }
2026         }
2027 err:
2028         if (res_list != NULL)
2029                 free(res_list);
2030         if (pv_save != NULL)
2031                 vals[1] = pv_save;
2032 }
2033 
2034 
2035 
2036 /*
2037  * Fetch the current pool and pset name for the given zone.
2038  */
2039 static void
2040 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2041     char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2042 {
2043         poolid_t poolid;
2044         pool_t **pools = NULL;
2045         pool_resource_t **res_list = NULL;
2046         char poolname[ZS_POOLNAME_MAX];
2047         char psetname[ZS_PSETNAME_MAX];
2048         pool_conf_t *conf = ctl->zsctl_pool_conf;
2049         pool_value_t *pv_save = NULL;
2050         pool_value_t **vals = ctl->zsctl_pool_vals;
2051         const char *string;
2052         int ret;
2053         int64_t int64;
2054         uint_t num;
2055 
2056         ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2057             &poolid, sizeof (poolid));
2058         if (ret < 0)
2059                 goto lookup_done;
2060 
2061         pv_save = vals[1];
2062         vals[1] = NULL;
2063         pools = NULL;
2064         res_list = NULL;
2065 
2066         /* Default values if lookup fails */
2067         (void) strlcpy(poolname, "pool_default", sizeof (poolname));
2068         (void) strlcpy(psetname, "pset_default", sizeof (poolname));
2069         *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2070 
2071         /* no dedicated cpu if pools are disabled */
2072         if (ctl->zsctl_pool_status == POOL_DISABLED)
2073                 goto lookup_done;
2074 
2075         /* Get the pool name using the id */
2076         pool_value_set_int64(vals[0], poolid);
2077         if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2078                 goto lookup_done;
2079 
2080         if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2081                 goto lookup_done;
2082 
2083         if (num != 1)
2084                 goto lookup_done;
2085 
2086         if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2087             "pool.name", vals[0]) != POC_STRING ||
2088             pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2089                 goto lookup_done;
2090         (void) strlcpy(poolname, (char *)string, sizeof (poolname));
2091 
2092         /* Get the name of the pset for the pool */
2093         if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2094                 goto lookup_done;
2095 
2096         if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2097                 goto lookup_done;
2098 
2099         if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2100             == NULL)
2101                 goto lookup_done;
2102 
2103         if (num != 1)
2104                 goto lookup_done;
2105 
2106         if (pool_get_property(conf, pool_resource_to_elem(conf,
2107             res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2108             pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2109                 goto lookup_done;
2110 
2111         if (int64 == ZS_PSET_DEFAULT)
2112                 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2113 
2114         if (pool_get_property(conf, pool_resource_to_elem(conf,
2115             res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2116             pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2117                 goto lookup_done;
2118 
2119         (void) strlcpy(psetname, (char *)string, sizeof (psetname));
2120 
2121         if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2122                 *cputype = ZS_CPUTYPE_DEDICATED;
2123         if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2124                 *cputype = ZS_CPUTYPE_PSRSET_PSET;
2125         else
2126                 *cputype = ZS_CPUTYPE_POOL_PSET;
2127 
2128 lookup_done:
2129 
2130         if (pv_save != NULL)
2131                 vals[1] = pv_save;
2132 
2133         if (res_list)
2134                 free(res_list);
2135         if (pools)
2136                 free(pools);
2137 
2138         (void) strlcpy(pool, poolname, poollen);
2139         (void) strlcpy(pset, psetname, psetlen);
2140 }
2141 
2142 /* Convert scheduler names to ZS_* scheduler flags */
2143 static uint_t
2144 zsd_schedname2int(char *clname, int pri)
2145 {
2146         uint_t sched = 0;
2147 
2148         if (strcmp(clname, "TS") == 0) {
2149                 sched = ZS_SCHED_TS;
2150         } else if (strcmp(clname, "IA") == 0) {
2151                 sched = ZS_SCHED_IA;
2152         } else if (strcmp(clname, "FX") == 0) {
2153                 if (pri > 59) {
2154                         sched = ZS_SCHED_FX_60;
2155                 } else {
2156                         sched = ZS_SCHED_FX;
2157                 }
2158         } else if (strcmp(clname, "RT") == 0) {
2159                 sched = ZS_SCHED_RT;
2160 
2161         } else if (strcmp(clname, "FSS") == 0) {
2162                 sched = ZS_SCHED_FSS;
2163         }
2164         return (sched);
2165 }
2166 
2167 static uint64_t
2168 zsd_get_zone_rctl_limit(char *name)
2169 {
2170         rctlblk_t *rblk;
2171 
2172         rblk = (rctlblk_t *)alloca(rctlblk_size());
2173         if (getrctl(name, NULL, rblk, RCTL_FIRST)
2174             != 0) {
2175                 return (ZS_LIMIT_NONE);
2176         }
2177         return (rctlblk_get_value(rblk));
2178 }
2179 
2180 static uint64_t
2181 zsd_get_zone_rctl_usage(char *name)
2182 {
2183         rctlblk_t *rblk;
2184 
2185         rblk = (rctlblk_t *)alloca(rctlblk_size());
2186         if (getrctl(name, NULL, rblk, RCTL_USAGE)
2187             != 0) {
2188                 return (0);
2189         }
2190         return (rctlblk_get_value(rblk));
2191 }
2192 
2193 #define ZSD_NUM_RCTL_VALS 19
2194 
2195 /*
2196  * Fetch the limit information for a zone.  This uses zone_enter() as the
2197  * getrctl(2) system call only returns rctl information for the zone of
2198  * the caller.
2199  */
2200 static int
2201 zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2202     uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2203     uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2204     uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2205     uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2206     uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2207     uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2208 {
2209         int p[2], pid, tmpl_fd, ret;
2210         ctid_t ct;
2211         char class[PC_CLNMSZ];
2212         uint64_t vals[ZSD_NUM_RCTL_VALS];
2213         zsd_system_t *sys = ctl->zsctl_system;
2214         int i = 0;
2215         int res = 0;
2216 
2217         /* Treat all caps as no cap on error */
2218         *cpu_shares = ZS_LIMIT_NONE;
2219         *cpu_cap = ZS_LIMIT_NONE;
2220         *ram_cap = ZS_LIMIT_NONE;
2221         *locked_cap = ZS_LIMIT_NONE;
2222         *vm_cap = ZS_LIMIT_NONE;
2223 
2224         *processes_cap = ZS_LIMIT_NONE;
2225         *lwps_cap = ZS_LIMIT_NONE;
2226         *shm_cap = ZS_LIMIT_NONE;
2227         *shmids_cap = ZS_LIMIT_NONE;
2228         *semids_cap = ZS_LIMIT_NONE;
2229         *msgids_cap = ZS_LIMIT_NONE;
2230         *lofi_cap = ZS_LIMIT_NONE;
2231 
2232         *processes = 0;
2233         *lwps = 0;
2234         *shm = 0;
2235         *shmids = 0;
2236         *semids = 0;
2237         *msgids = 0;
2238         *lofi = 0;
2239 
2240         /* Get the ram cap first since it is a zone attr */
2241         ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
2242             ram_cap, sizeof (*ram_cap));
2243         if (ret < 0 || *ram_cap == 0)
2244                 *ram_cap = ZS_LIMIT_NONE;
2245 
2246         /* Get the zone's default scheduling class */
2247         ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2248             class, sizeof (class));
2249         if (ret < 0)
2250                 return (-1);
2251 
2252         *sched = zsd_schedname2int(class, 0);
2253 
2254         /* rctl caps must be fetched from within the zone */
2255         if (pipe(p) != 0)
2256                 return (-1);
2257 
2258         if ((tmpl_fd = init_template()) == -1) {
2259                 (void) close(p[0]);
2260                 (void) close(p[1]);
2261                 return (-1);
2262         }
2263         pid = forkx(0);
2264         if (pid < 0) {
2265                 (void) ct_tmpl_clear(tmpl_fd);
2266                 (void) close(p[0]);
2267                 (void) close(p[1]);
2268                 return (-1);
2269         }
2270         if (pid == 0) {
2271 
2272                 (void) ct_tmpl_clear(tmpl_fd);
2273                 (void) close(tmpl_fd);
2274                 (void) close(p[0]);
2275                 if (zone->zsz_id != getzoneid()) {
2276                         if (zone_enter(zone->zsz_id) < 0) {
2277                                 (void) close(p[1]);
2278                                 _exit(0);
2279                         }
2280                 }
2281 
2282                 /* Get caps for zone, and write them to zonestatd parent. */
2283                 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2284                 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2285                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2286                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2287                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2288                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2289                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2290                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
2291                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2292                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2293                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2294                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2295                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2296                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2297                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2298                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2299                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2300                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
2301 
2302                 if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2303                     ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2304                         (void) close(p[1]);
2305                         _exit(1);
2306                 }
2307 
2308                 (void) close(p[1]);
2309                 _exit(0);
2310         }
2311         if (contract_latest(&ct) == -1)
2312                 ct = -1;
2313 
2314         (void) ct_tmpl_clear(tmpl_fd);
2315         (void) close(tmpl_fd);
2316         (void) close(p[1]);
2317         while (waitpid(pid, NULL, 0) != pid)
2318                 ;
2319 
2320         /* Read cap from child in zone */
2321         if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2322             ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2323                 res = -1;
2324                 goto cleanup;
2325         }
2326         i = 0;
2327         *cpu_shares = vals[i++];
2328         *cpu_cap = vals[i++];
2329         *locked_cap = vals[i++];
2330         *vm_cap = vals[i++];
2331         *processes_cap = vals[i++];
2332         *processes = vals[i++];
2333         *lwps_cap = vals[i++];
2334         *lwps = vals[i++];
2335         *shm_cap = vals[i++];
2336         *shm = vals[i++];
2337         *shmids_cap = vals[i++];
2338         *shmids = vals[i++];
2339         *semids_cap = vals[i++];
2340         *semids = vals[i++];
2341         *msgids_cap = vals[i++];
2342         *msgids = vals[i++];
2343         *lofi_cap = vals[i++];
2344         *lofi = vals[i++];
2345 
2346         /* Interpret maximum values as no cap */
2347         if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2348                 *cpu_cap = ZS_LIMIT_NONE;
2349         if (*processes_cap == sys->zss_processes_max)
2350                 *processes_cap = ZS_LIMIT_NONE;
2351         if (*lwps_cap == sys->zss_lwps_max)
2352                 *lwps_cap = ZS_LIMIT_NONE;
2353         if (*shm_cap == sys->zss_shm_max)
2354                 *shm_cap = ZS_LIMIT_NONE;
2355         if (*shmids_cap == sys->zss_shmids_max)
2356                 *shmids_cap = ZS_LIMIT_NONE;
2357         if (*semids_cap == sys->zss_semids_max)
2358                 *semids_cap = ZS_LIMIT_NONE;
2359         if (*msgids_cap == sys->zss_msgids_max)
2360                 *msgids_cap = ZS_LIMIT_NONE;
2361         if (*lofi_cap == sys->zss_lofi_max)
2362                 *lofi_cap = ZS_LIMIT_NONE;
2363 
2364 
2365 cleanup:
2366         (void) close(p[0]);
2367         (void) ct_tmpl_clear(tmpl_fd);
2368         (void) close(tmpl_fd);
2369         (void) contract_abandon_id(ct);
2370 
2371         return (res);
2372 }
2373 
2374 /* Update the current list of running zones */
2375 static void
2376 zsd_refresh_zones(zsd_ctl_t *ctl)
2377 {
2378         zsd_zone_t *zone;
2379         uint_t old, num;
2380         ushort_t flags;
2381         int i, ret;
2382         zoneid_t *cache;
2383         uint64_t cpu_shares;
2384         uint64_t cpu_cap;
2385         uint64_t ram_cap;
2386         uint64_t locked_cap;
2387         uint64_t vm_cap;
2388         uint64_t processes_cap;
2389         uint64_t processes;
2390         uint64_t lwps_cap;
2391         uint64_t lwps;
2392         uint64_t shm_cap;
2393         uint64_t shm;
2394         uint64_t shmids_cap;
2395         uint64_t shmids;
2396         uint64_t semids_cap;
2397         uint64_t semids;
2398         uint64_t msgids_cap;
2399         uint64_t msgids;
2400         uint64_t lofi_cap;
2401         uint64_t lofi;
2402 
2403         char zonename[ZS_ZONENAME_MAX];
2404         char poolname[ZS_POOLNAME_MAX];
2405         char psetname[ZS_PSETNAME_MAX];
2406         uint_t sched;
2407         uint_t cputype;
2408         uint_t iptype;
2409 
2410         /* Get the current list of running zones */
2411         for (;;) {
2412                 old = num = ctl->zsctl_zone_ncache;
2413                 (void) zone_list(ctl->zsctl_zone_cache, &num);
2414                 if (num <= old)
2415                         break;
2416                 if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2417                     (num) * sizeof (zoneid_t))) != NULL) {
2418                         ctl->zsctl_zone_ncache = num;
2419                         ctl->zsctl_zone_cache = cache;
2420                 } else {
2421                         /* Could not allocate to get new zone list.  Give up */
2422                         return;
2423                 }
2424         }
2425 
2426         zsd_mark_zones_start(ctl);
2427 
2428         for (i = 0; i < num; i++) {
2429 
2430                 ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2431                     zonename, sizeof (zonename));
2432                 if (ret < 0)
2433                         continue;
2434 
2435                 zone = zsd_lookup_insert_zone(ctl, zonename,
2436                     ctl->zsctl_zone_cache[i]);
2437 
2438                 ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2439                     &flags, sizeof (flags));
2440                 if (ret < 0)
2441                         continue;
2442 
2443                 if (flags & ZF_NET_EXCL)
2444                         iptype = ZS_IPTYPE_EXCLUSIVE;
2445                 else
2446                         iptype = ZS_IPTYPE_SHARED;
2447 
2448                 zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2449                     psetname, sizeof (psetname), &cputype);
2450 
2451                 if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2452                     &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2453                     &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2454                     &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2455                     &lofi, &sched) != 0)
2456                         continue;
2457 
2458                 zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2459                     locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2460                     lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2461                     semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2462                     psetname, sched, cputype, iptype);
2463         }
2464 }
2465 
2466 /* Fetch the details of a process from its psinfo_t */
2467 static void
2468 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2469     psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2470     timestruc_t *delta, uint_t *sched)
2471 {
2472         timestruc_t d;
2473         zsd_proc_t *proc;
2474 
2475         /* Get cached data for proc */
2476         proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2477         *psetid = psinfo->pr_lwp.pr_bindpset;
2478 
2479         if (proc->zspr_psetid == ZS_PSET_ERROR)
2480                 *prev_psetid = *psetid;
2481         else
2482                 *prev_psetid = proc->zspr_psetid;
2483 
2484         *zoneid = psinfo->pr_zoneid;
2485         if (proc->zspr_zoneid == -1)
2486                 *prev_zoneid = *zoneid;
2487         else
2488                 *prev_zoneid = proc->zspr_zoneid;
2489 
2490         TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2491         *delta = d;
2492 
2493         *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2494             psinfo->pr_lwp.pr_pri);
2495 
2496         /* Update cached data for proc */
2497         proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2498         proc->zspr_zoneid = psinfo->pr_zoneid;
2499         proc->zspr_sched = *sched;
2500         proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2501         proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2502         proc->zspr_ppid = psinfo->pr_ppid;
2503 }
2504 
2505 /*
2506  * Reset the known cpu usage of a process. This is done after a process
2507  * exits so that if the pid is recycled, data from its previous life is
2508  * not reused
2509  */
2510 static void
2511 zsd_flush_proc_info(zsd_proc_t *proc)
2512 {
2513         proc->zspr_usage.tv_sec = 0;
2514         proc->zspr_usage.tv_nsec = 0;
2515 }
2516 
2517 /*
2518  * Open the current extended accounting file.  On initialization, open the
2519  * file as the current file to be used.  Otherwise, open the file as the
2520  * next file to use of the current file reaches EOF.
2521  */
2522 static int
2523 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2524 {
2525         int ret, oret, state, trys = 0, flags;
2526         int *fd, *open;
2527         ea_file_t *eaf;
2528         struct stat64 *stat;
2529         char path[MAXPATHLEN];
2530 
2531         /*
2532          * The accounting file is first opened at the tail.  Following
2533          * opens to new accounting files are opened at the head.
2534          */
2535         if (init == B_TRUE) {
2536                 flags = EO_NO_VALID_HDR | EO_TAIL;
2537                 fd = &ctl->zsctl_proc_fd;
2538                 eaf = &ctl->zsctl_proc_eaf;
2539                 stat = &ctl->zsctl_proc_stat;
2540                 open = &ctl->zsctl_proc_open;
2541         } else {
2542                 flags = EO_NO_VALID_HDR | EO_HEAD;
2543                 fd = &ctl->zsctl_proc_fd_next;
2544                 eaf = &ctl->zsctl_proc_eaf_next;
2545                 stat = &ctl->zsctl_proc_stat_next;
2546                 open = &ctl->zsctl_proc_open_next;
2547         }
2548 
2549         *fd = -1;
2550         *open = 0;
2551 retry:
2552         /* open accounting files for cpu consumption */
2553         ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2554         if (ret != 0) {
2555                 zsd_warn(gettext("Unable to get process accounting state"));
2556                 goto err;
2557         }
2558         if (state != AC_ON) {
2559                 if (trys > 0) {
2560                         zsd_warn(gettext(
2561                             "Unable to enable process accounting"));
2562                         goto err;
2563                 }
2564                 (void) zsd_enable_cpu_stats();
2565                 trys++;
2566                 goto retry;
2567         }
2568 
2569         ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2570         if (ret != 0) {
2571                 zsd_warn(gettext("Unable to get process accounting file"));
2572                 goto err;
2573         }
2574 
2575         if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2576             (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2577                 ret = fstat64(*fd, stat);
2578 
2579         if (*fd < 0 || oret < 0 || ret < 0) {
2580                 struct timespec ts;
2581 
2582                 /*
2583                  * It is possible the accounting file is momentarily unavailable
2584                  * because it is being rolled.  Try for up to half a second.
2585                  *
2586                  * If failure to open accounting file persists, give up.
2587                  */
2588                 if (oret == 0)
2589                         (void) ea_close(eaf);
2590                 else if (*fd >= 0)
2591                         (void) close(*fd);
2592                 if (trys > 500) {
2593                         zsd_warn(gettext(
2594                             "Unable to open process accounting file"));
2595                         goto err;
2596                 }
2597                 /* wait one millisecond */
2598                 ts.tv_sec = 0;
2599                 ts.tv_nsec = NANOSEC / 1000;
2600                 (void) nanosleep(&ts, NULL);
2601                 goto retry;
2602         }
2603         *open = 1;
2604         return (0);
2605 err:
2606         if (*fd >= 0)
2607                 (void) close(*fd);
2608         *open = 0;
2609         *fd = -1;
2610         return (-1);
2611 }
2612 
2613 /*
2614  * Walk /proc and charge each process to its zone and processor set.
2615  * Then read exacct data for exited processes, and charge them as well.
2616  */
2617 static void
2618 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2619 {
2620         DIR *dir;
2621         struct dirent *dent;
2622         psinfo_t psinfo;
2623         int fd, ret;
2624         zsd_proc_t *proc, *pproc, *tmp, *next;
2625         list_t pplist, plist;
2626         zsd_zone_t *zone, *prev_zone;
2627         zsd_pset_t *pset, *prev_pset;
2628         psetid_t psetid, prev_psetid;
2629         zoneid_t zoneid, prev_zoneid;
2630         zsd_pset_usage_t *usage, *prev_usage;
2631         char path[MAXPATHLEN];
2632 
2633         ea_object_t object;
2634         ea_object_t pobject;
2635         boolean_t hrtime_expired = B_FALSE;
2636         struct timeval interval_end;
2637 
2638         timestruc_t delta, d1, d2;
2639         uint_t sched = 0;
2640 
2641         /*
2642          * Get the current accounting file.  The current accounting file
2643          * may be different than the file in use, as the accounting file
2644          * may have been rolled, or manually changed by an admin.
2645          */
2646         ret = zsd_open_exacct(ctl, init);
2647         if (ret != 0) {
2648                 zsd_warn(gettext("Unable to track process accounting"));
2649                 return;
2650         }
2651 
2652         /*
2653          * Mark the current time as the interval end time.  Don't track
2654          * processes that exit after this time.
2655          */
2656         (void) gettimeofday(&interval_end, NULL);
2657 
2658         dir = opendir("/proc");
2659         if (dir == NULL) {
2660                 zsd_warn(gettext("Unable to open /proc"));
2661                 return;
2662         }
2663 
2664         dent = ctl->zsctl_procfs_dent;
2665 
2666         (void) memset(dent, 0, ctl->zsctl_procfs_dent_size);
2667 
2668         /* Walk all processes and compute each zone's usage on each pset. */
2669         while (readdir_r(dir, dent) != 0) {
2670 
2671                 if (strcmp(dent->d_name, ".") == 0 ||
2672                     strcmp(dent->d_name, "..") == 0)
2673                         continue;
2674 
2675                 (void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2676                     dent->d_name);
2677 
2678                 fd = open(path, O_RDONLY);
2679                 if (fd < 0)
2680                         continue;
2681 
2682                 if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2683                         (void) close(fd);
2684                         continue;
2685                 }
2686                 (void) close(fd);
2687 
2688                 zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2689                     &zoneid, &prev_zoneid, &delta, &sched);
2690 
2691                 d1.tv_sec = delta.tv_sec / 2;
2692                 d1.tv_nsec = delta.tv_nsec / 2;
2693                 d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2694                 d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2695 
2696                 /* Get the zone and pset this process is running in */
2697                 zone = zsd_lookup_zone_byid(ctl, zoneid);
2698                 if (zone == NULL)
2699                         continue;
2700                 pset = zsd_lookup_pset_byid(ctl, psetid);
2701                 if (pset == NULL)
2702                         continue;
2703                 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2704                 if (usage == NULL)
2705                         continue;
2706 
2707                 /*
2708                  * Get the usage of the previous zone and pset if they were
2709                  * different.
2710                  */
2711                 if (zoneid != prev_zoneid)
2712                         prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2713                 else
2714                         prev_zone = NULL;
2715 
2716                 if (psetid != prev_psetid)
2717                         prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2718                 else
2719                         prev_pset = NULL;
2720 
2721                 prev_usage = NULL;
2722                 if (prev_zone != NULL || prev_pset != NULL) {
2723                         if (prev_zone == NULL)
2724                                 prev_zone = zone;
2725                         if (prev_pset == NULL)
2726                                 prev_pset = pset;
2727 
2728                         prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2729                             prev_zone);
2730                 }
2731 
2732                 /* Update the usage with the processes info */
2733                 if (prev_usage == NULL) {
2734                         zsd_mark_pset_usage_found(usage, sched);
2735                 } else {
2736                         zsd_mark_pset_usage_found(usage, sched);
2737                         zsd_mark_pset_usage_found(prev_usage, sched);
2738                 }
2739 
2740                 /*
2741                  * First time around is just to get a starting point.  All
2742                  * usages will be zero.
2743                  */
2744                 if (init == B_TRUE)
2745                         continue;
2746 
2747                 if (prev_usage == NULL) {
2748                         zsd_add_usage(ctl, usage, &delta);
2749                 } else {
2750                         zsd_add_usage(ctl, usage, &d1);
2751                         zsd_add_usage(ctl, prev_usage, &d2);
2752                 }
2753         }
2754         (void) closedir(dir);
2755 
2756         /*
2757          * No need to collect exited proc data on initialization.  Just
2758          * caching the usage of the known processes to get a zero starting
2759          * point.
2760          */
2761         if (init == B_TRUE)
2762                 return;
2763 
2764         /*
2765          * Add accounting records to account for processes which have
2766          * exited.
2767          */
2768         list_create(&plist, sizeof (zsd_proc_t),
2769             offsetof(zsd_proc_t, zspr_next));
2770         list_create(&pplist, sizeof (zsd_proc_t),
2771             offsetof(zsd_proc_t, zspr_next));
2772 
2773         for (;;) {
2774                 pid_t pid;
2775                 pid_t ppid;
2776                 timestruc_t user, sys, proc_usage;
2777                 timestruc_t finish;
2778                 int numfound = 0;
2779 
2780                 bzero(&object, sizeof (object));
2781                 proc = NULL;
2782                 zone = NULL;
2783                 pset = NULL;
2784                 usage = NULL;
2785                 ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2786                 if (ret == EO_ERROR) {
2787                         if (ea_error() == EXR_EOF) {
2788 
2789                                 struct stat64 *stat;
2790                                 struct stat64 *stat_next;
2791 
2792                                 /*
2793                                  * See if the next accounting file is the
2794                                  * same as the current accounting file.
2795                                  */
2796                                 stat = &(ctl->zsctl_proc_stat);
2797                                 stat_next = &(ctl->zsctl_proc_stat_next);
2798                                 if (stat->st_ino == stat_next->st_ino &&
2799                                     stat->st_dev == stat_next->st_dev) {
2800                                         /*
2801                                          * End of current accounting file is
2802                                          * reached, so finished.  Clear EOF
2803                                          * bit for next time around.
2804                                          */
2805                                         ea_clear(&ctl->zsctl_proc_eaf);
2806                                         break;
2807                                 } else {
2808                                         /*
2809                                          * Accounting file has changed.  Move
2810                                          * to current accounting file.
2811                                          */
2812                                         (void) ea_close(&ctl->zsctl_proc_eaf);
2813 
2814                                         ctl->zsctl_proc_fd =
2815                                             ctl->zsctl_proc_fd_next;
2816                                         ctl->zsctl_proc_eaf =
2817                                             ctl->zsctl_proc_eaf_next;
2818                                         ctl->zsctl_proc_stat =
2819                                             ctl->zsctl_proc_stat_next;
2820 
2821                                         ctl->zsctl_proc_fd_next = -1;
2822                                         ctl->zsctl_proc_open_next = 0;
2823                                         continue;
2824                                 }
2825                         } else {
2826                                 /*
2827                                  * Other accounting error.  Give up on
2828                                  * accounting.
2829                                  */
2830                                 goto ea_err;
2831                         }
2832                 }
2833                 /* Skip if not a process group */
2834                 if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2835                     (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2836                         (void) ea_free_item(&object, EUP_ALLOC);
2837                         continue;
2838                 }
2839 
2840                 /* The process group entry should be complete */
2841                 while (numfound < 9) {
2842                         bzero(&pobject, sizeof (pobject));
2843                         ret = ea_get_object(&ctl->zsctl_proc_eaf,
2844                             &pobject);
2845                         if (ret < 0) {
2846                                 (void) ea_free_item(&object, EUP_ALLOC);
2847                                 zsd_warn(
2848                                     "unable to get process accounting data");
2849                                 goto ea_err;
2850                         }
2851                         /* Next entries should be process data */
2852                         if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2853                             EXT_GROUP) {
2854                                 (void) ea_free_item(&object, EUP_ALLOC);
2855                                 (void) ea_free_item(&pobject, EUP_ALLOC);
2856                                 zsd_warn(
2857                                     "process data of wrong type");
2858                                 goto ea_err;
2859                         }
2860                         switch (pobject.eo_catalog & EXD_DATA_MASK) {
2861                         case EXD_PROC_PID:
2862                                 pid = pobject.eo_item.ei_uint32;
2863                                 proc = &(ctl->zsctl_proc_array[pid]);
2864                                 /*
2865                                  * This process should not be currently in
2866                                  * the list of processes to process.
2867                                  */
2868                                 assert(!list_link_active(&proc->zspr_next));
2869                                 numfound++;
2870                                 break;
2871                         case EXD_PROC_ANCPID:
2872                                 ppid = pobject.eo_item.ei_uint32;
2873                                 pproc = &(ctl->zsctl_proc_array[ppid]);
2874                                 numfound++;
2875                                 break;
2876                         case EXD_PROC_ZONENAME:
2877                                 zone = zsd_lookup_zone(ctl,
2878                                     pobject.eo_item.ei_string, -1);
2879                                 numfound++;
2880                                 break;
2881                         case EXD_PROC_CPU_USER_SEC:
2882                                 user.tv_sec =
2883                                     pobject.eo_item.ei_uint64;
2884                                 numfound++;
2885                                 break;
2886                         case EXD_PROC_CPU_USER_NSEC:
2887                                 user.tv_nsec =
2888                                     pobject.eo_item.ei_uint64;
2889                                 numfound++;
2890                                 break;
2891                         case EXD_PROC_CPU_SYS_SEC:
2892                                 sys.tv_sec =
2893                                     pobject.eo_item.ei_uint64;
2894                                 numfound++;
2895                                 break;
2896                         case EXD_PROC_CPU_SYS_NSEC:
2897                                 sys.tv_nsec =
2898                                     pobject.eo_item.ei_uint64;
2899                                 numfound++;
2900                                 break;
2901                         case EXD_PROC_FINISH_SEC:
2902                                 finish.tv_sec =
2903                                     pobject.eo_item.ei_uint64;
2904                                 numfound++;
2905                                 break;
2906                         case EXD_PROC_FINISH_NSEC:
2907                                 finish.tv_nsec =
2908                                     pobject.eo_item.ei_uint64;
2909                                 numfound++;
2910                                 break;
2911                         }
2912                         (void) ea_free_item(&pobject, EUP_ALLOC);
2913                 }
2914                 (void) ea_free_item(&object, EUP_ALLOC);
2915                 if (numfound != 9) {
2916                         zsd_warn(gettext(
2917                             "Malformed process accounting entry found"));
2918                         goto proc_done;
2919                 }
2920 
2921                 if (finish.tv_sec > interval_end.tv_sec ||
2922                     (finish.tv_sec == interval_end.tv_sec &&
2923                     finish.tv_nsec > (interval_end.tv_usec * 1000)))
2924                         hrtime_expired = B_TRUE;
2925 
2926                 /*
2927                  * Try to identify the zone and pset to which this
2928                  * exited process belongs.
2929                  */
2930                 if (zone == NULL)
2931                         goto proc_done;
2932 
2933                 /* Save proc info */
2934                 proc->zspr_ppid = ppid;
2935                 proc->zspr_zoneid = zone->zsz_id;
2936 
2937                 prev_psetid = ZS_PSET_ERROR;
2938                 sched = 0;
2939 
2940                 /*
2941                  * The following tries to deduce the processes pset.
2942                  *
2943                  * First choose pset and sched using cached value from the
2944                  * most recent time the process has been seen.
2945                  *
2946                  * pset and sched can change across zone_enter, so make sure
2947                  * most recent sighting of this process was in the same
2948                  * zone before using most recent known value.
2949                  *
2950                  * If there is no known value, use value of processes
2951                  * parent.  If parent is unknown, walk parents until a known
2952                  * parent is found.
2953                  *
2954                  * If no parent in the zone is found, use the zone's default
2955                  * pset and scheduling class.
2956                  */
2957                 if (proc->zspr_psetid != ZS_PSET_ERROR) {
2958                         prev_psetid = proc->zspr_psetid;
2959                         pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2960                         sched = proc->zspr_sched;
2961                 } else if (pproc->zspr_zoneid == zone->zsz_id &&
2962                     pproc->zspr_psetid != ZS_PSET_ERROR) {
2963                         prev_psetid = pproc->zspr_psetid;
2964                         pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2965                         sched = pproc->zspr_sched;
2966                 }
2967 
2968                 if (pset == NULL) {
2969                         /*
2970                          * Process or processes parent has never been seen.
2971                          * Save to deduce a known parent later.
2972                          */
2973                         proc_usage = sys;
2974                         TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2975                         TIMESTRUC_DELTA(delta, proc_usage,
2976                             proc->zspr_usage);
2977                         proc->zspr_usage = delta;
2978                         list_insert_tail(&plist, proc);
2979                         continue;
2980                 }
2981 
2982                 /* Add the zone's usage to the pset */
2983                 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2984                 if (usage == NULL)
2985                         goto proc_done;
2986 
2987                 zsd_mark_pset_usage_found(usage, sched);
2988 
2989                 /* compute the usage to add for the exited proc */
2990                 proc_usage = sys;
2991                 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2992                 TIMESTRUC_DELTA(delta, proc_usage,
2993                     proc->zspr_usage);
2994 
2995                 zsd_add_usage(ctl, usage, &delta);
2996 proc_done:
2997                 zsd_flush_proc_info(proc);
2998 
2999                 if (hrtime_expired == B_TRUE)
3000                         break;
3001         }
3002         /*
3003          * close next accounting file.
3004          */
3005         if (ctl->zsctl_proc_open_next) {
3006                 (void) ea_close(
3007                     &ctl->zsctl_proc_eaf_next);
3008                 ctl->zsctl_proc_open_next = 0;
3009                 ctl->zsctl_proc_fd_next = -1;
3010         }
3011 
3012         /* For the remaining processes, use pset and sched of a known parent */
3013         proc = list_head(&plist);
3014         while (proc != NULL) {
3015                 next = proc;
3016                 for (;;) {
3017                         if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3018                                 /*
3019                                  * Kernel process, or parent is unknown, skip
3020                                  * process, remove from process list.
3021                                  */
3022                                 tmp = proc;
3023                                 proc = list_next(&plist, proc);
3024                                 list_link_init(&tmp->zspr_next);
3025                                 break;
3026                         }
3027                         pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3028                         if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3029                                 /*
3030                                  * Parent in different zone.  Save process and
3031                                  * use zone's default pset and sched below
3032                                  */
3033                                 tmp = proc;
3034                                 proc = list_next(&plist, proc);
3035                                 list_remove(&plist, tmp);
3036                                 list_insert_tail(&pplist, tmp);
3037                                 break;
3038                         }
3039                         /* Parent has unknown pset, Search parent's parent  */
3040                         if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3041                                 next = pproc;
3042                                 continue;
3043                         }
3044                         /* Found parent with known pset.  Use its info */
3045                         proc->zspr_psetid = pproc->zspr_psetid;
3046                         proc->zspr_sched = pproc->zspr_sched;
3047                         next->zspr_psetid = pproc->zspr_psetid;
3048                         next->zspr_sched = pproc->zspr_sched;
3049                         zone = zsd_lookup_zone_byid(ctl,
3050                             proc->zspr_zoneid);
3051                         if (zone == NULL) {
3052                                 tmp = proc;
3053                                 proc = list_next(&plist, proc);
3054                                 list_remove(&plist, tmp);
3055                                 list_link_init(&tmp->zspr_next);
3056                                 break;
3057                         }
3058                         pset = zsd_lookup_pset_byid(ctl,
3059                             proc->zspr_psetid);
3060                         if (pset == NULL) {
3061                                 tmp = proc;
3062                                 proc = list_next(&plist, proc);
3063                                 list_remove(&plist, tmp);
3064                                 list_link_init(&tmp->zspr_next);
3065                                 break;
3066                         }
3067                         /* Add the zone's usage to the pset */
3068                         usage = zsd_lookup_insert_usage(ctl, pset, zone);
3069                         if (usage == NULL) {
3070                                 tmp = proc;
3071                                 proc = list_next(&plist, proc);
3072                                 list_remove(&plist, tmp);
3073                                 list_link_init(&tmp->zspr_next);
3074                                 break;
3075                         }
3076                         zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3077                         zsd_add_usage(ctl, usage, &proc->zspr_usage);
3078                         zsd_flush_proc_info(proc);
3079                         tmp = proc;
3080                         proc = list_next(&plist, proc);
3081                         list_remove(&plist, tmp);
3082                         list_link_init(&tmp->zspr_next);
3083                         break;
3084                 }
3085         }
3086         /*
3087          * Process has never been seen.  Using zone info to
3088          * determine pset and scheduling class.
3089          */
3090         proc = list_head(&pplist);
3091         while (proc != NULL) {
3092 
3093                 zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3094                 if (zone == NULL)
3095                         goto next;
3096                 if (zone->zsz_psetid != ZS_PSET_ERROR &&
3097                     zone->zsz_psetid != ZS_PSET_MULTI) {
3098                         prev_psetid = zone->zsz_psetid;
3099                         pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3100                 } else {
3101                         pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3102                         if (pset != NULL)
3103                                 prev_psetid = pset->zsp_id;
3104                 }
3105                 if (pset == NULL)
3106                         goto next;
3107 
3108                 sched = zone->zsz_scheds;
3109                 /*
3110                  * Ignore FX high scheduling class if it is not the
3111                  * only scheduling class in the zone.
3112                  */
3113                 if (sched != ZS_SCHED_FX_60)
3114                         sched &= (~ZS_SCHED_FX_60);
3115                 /*
3116                  * If more than one scheduling class has been found
3117                  * in the zone, use zone's default scheduling class for
3118                  * this process.
3119                  */
3120                 if ((sched & (sched - 1)) != 0)
3121                         sched = zone->zsz_default_sched;
3122 
3123                 /* Add the zone's usage to the pset */
3124                 usage = zsd_lookup_insert_usage(ctl, pset, zone);
3125                 if (usage == NULL)
3126                         goto next;
3127 
3128                 zsd_mark_pset_usage_found(usage, sched);
3129                 zsd_add_usage(ctl, usage, &proc->zspr_usage);
3130 next:
3131                 tmp = proc;
3132                 proc = list_next(&pplist, proc);
3133                 zsd_flush_proc_info(tmp);
3134                 list_link_init(&tmp->zspr_next);
3135         }
3136         return;
3137 ea_err:
3138         /*
3139          * Close the next accounting file if we have not transitioned to it
3140          * yet.
3141          */
3142         if (ctl->zsctl_proc_open_next) {
3143                 (void) ea_close(&ctl->zsctl_proc_eaf_next);
3144                 ctl->zsctl_proc_open_next = 0;
3145                 ctl->zsctl_proc_fd_next = -1;
3146         }
3147 }
3148 
3149 /*
3150  * getvmusage(2) uses size_t's in the passwd data structure, which differ
3151  * in size for 32bit and 64 bit kernels.  Since this is a contracted interface,
3152  * and zonestatd does not necessarily match the kernel's bitness, marshal
3153  * results appropriately.
3154  */
3155 static int
3156 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3157     uint64_t *nres)
3158 {
3159         zsd_vmusage32_t *vmu32;
3160         zsd_vmusage64_t *vmu64;
3161         uint32_t nres32;
3162         int i;
3163         int ret;
3164 
3165         if (ctl->zsctl_kern_bits == 32)  {
3166                 nres32 = *nres;
3167                 ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3168                     flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3169                 *nres = nres32;
3170                 if (ret == 0 && buf != NULL) {
3171                         /*
3172                          * An array of vmusage32_t's has been returned.
3173                          * Convert it to an array of vmusage64_t's.
3174                          */
3175                         vmu32 = (zsd_vmusage32_t *)buf;
3176                         vmu64 = (zsd_vmusage64_t *)buf;
3177                         for (i = nres32 - 1; i >= 0; i--) {
3178 
3179                                 vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3180                                 vmu64[i].vmu_type = vmu32[i].vmu_type;
3181                                 vmu64[i].vmu_type = vmu32[i].vmu_type;
3182                                 vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3183                                 vmu64[i].vmu_rss_private =
3184                                     vmu32[i].vmu_rss_private;
3185                                 vmu64[i].vmu_rss_shared =
3186                                     vmu32[i].vmu_rss_shared;
3187                                 vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3188                                 vmu64[i].vmu_swap_private =
3189                                     vmu32[i].vmu_swap_private;
3190                                 vmu64[i].vmu_swap_shared =
3191                                     vmu32[i].vmu_swap_shared;
3192                         }
3193                 }
3194                 return (ret);
3195         } else {
3196                 /*
3197                  * kernel is 64 bit, so use 64 bit structures as zonestat
3198                  * expects.
3199                  */
3200                 return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3201                     flags, age, (uintptr_t)buf, (uintptr_t)nres));
3202 
3203         }
3204 }
3205 
3206 /*
3207  * Update the current physical, virtual, and locked memory usage of the
3208  * running zones.
3209  */
3210 static void
3211 zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3212 {
3213 
3214         uint64_t phys_total;
3215         uint64_t phys_used;
3216         uint64_t phys_zones;
3217         uint64_t phys_zones_overcount;
3218         uint64_t phys_zones_extra;
3219         uint64_t phys_zones_credit;
3220 
3221         uint64_t vm_free;
3222         uint64_t vm_used;
3223 
3224         uint64_t disk_swap_total;
3225         uint64_t disk_swap_used;        /* disk swap with contents */
3226 
3227         uint64_t physmem;
3228         uint64_t pp_kernel;
3229         uint64_t arc_size = 0;
3230         struct anoninfo ani;
3231 
3232         int num_swap_devices;
3233         struct swaptable *swt;
3234         struct swapent *swent;
3235         size_t swt_size;
3236         char *path;
3237 
3238         zsd_vmusage64_t *vmusage;
3239         uint64_t num_vmusage;
3240 
3241         int i, ret;
3242 
3243         zsd_system_t *sys;
3244         zsd_zone_t *zone;
3245         int vmu_nzones;
3246 
3247         kstat_t *kstat;
3248         char kstat_name[KSTAT_STRLEN];
3249         kstat_named_t *knp;
3250         kid_t kid;
3251 
3252         if (init)
3253                 return;
3254 
3255         sys = ctl->zsctl_system;
3256 
3257         /* interrogate swap devices to find the amount of disk swap */
3258 disk_swap_again:
3259         num_swap_devices = swapctl(SC_GETNSWP, NULL);
3260 
3261         if (num_swap_devices == 0) {
3262                 sys->zss_swap_total = disk_swap_total = 0;
3263                 sys->zss_swap_used = disk_swap_used = 0;
3264                 /* No disk swap */
3265                 goto disk_swap_done;
3266         }
3267         /* see if swap table needs to be larger */
3268         if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3269                 swt_size = sizeof (int) +
3270                     (num_swap_devices * sizeof (struct swapent)) +
3271                     (num_swap_devices * MAXPATHLEN);
3272                 if (ctl->zsctl_swap_cache != NULL)
3273                         free(ctl->zsctl_swap_cache);
3274 
3275                 swt = (struct swaptable *)malloc(swt_size);
3276                 if (swt == NULL) {
3277                         /*
3278                          * Could not allocate to get list of swap devices.
3279                          * Just use data from the most recent read, which will
3280                          * be zero if this is the first read.
3281                          */
3282                         zsd_warn(gettext("Unable to allocate to determine "
3283                             "virtual memory"));
3284                         disk_swap_total = sys->zss_swap_total;
3285                         disk_swap_used = sys->zss_swap_used;
3286                         goto disk_swap_done;
3287                 }
3288                 swent = swt->swt_ent;
3289                 path = (char *)swt + (sizeof (int) +
3290                     num_swap_devices * sizeof (swapent_t));
3291                 for (i = 0; i < num_swap_devices; i++, swent++) {
3292                         swent->ste_path = path;
3293                         path += MAXPATHLEN;
3294                 }
3295                 swt->swt_n = num_swap_devices;
3296                 ctl->zsctl_swap_cache = swt;
3297                 ctl->zsctl_swap_cache_size = swt_size;
3298                 ctl->zsctl_swap_cache_num = num_swap_devices;
3299         }
3300         num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3301         if (num_swap_devices < 0) {
3302                 /* More swap devices have arrived */
3303                 if (errno == ENOMEM)
3304                         goto disk_swap_again;
3305 
3306                 zsd_warn(gettext("Unable to determine disk swap devices"));
3307                 /* Unexpected error.  Use existing data */
3308                 disk_swap_total = sys->zss_swap_total;
3309                 disk_swap_used = sys->zss_swap_used;
3310                 goto disk_swap_done;
3311         }
3312 
3313         /* add up the disk swap */
3314         disk_swap_total = 0;
3315         disk_swap_used = 0;
3316         swent = ctl->zsctl_swap_cache->swt_ent;
3317         for (i = 0; i < num_swap_devices; i++, swent++) {
3318                 disk_swap_total += swent->ste_pages;
3319                 disk_swap_used += (swent->ste_pages - swent->ste_free);
3320         }
3321         disk_swap_total *= ctl->zsctl_pagesize;
3322         disk_swap_used *= ctl->zsctl_pagesize;
3323 
3324         sys->zss_swap_total = disk_swap_total;
3325         sys->zss_swap_used = disk_swap_used;
3326 
3327 disk_swap_done:
3328 
3329         /* get system pages kstat */
3330         kid = -1;
3331         kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3332         if (kstat == NULL)
3333                 zsd_warn(gettext("Unable to lookup system pages kstat"));
3334         else
3335                 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3336 
3337         if (kid == -1) {
3338                 zsd_warn(gettext("Unable to read system pages kstat"));
3339                 return;
3340         } else {
3341                 knp = kstat_data_lookup(kstat, "physmem");
3342                 if (knp == NULL) {
3343                         zsd_warn(gettext("Unable to read physmem"));
3344                 } else {
3345                         if (knp->data_type == KSTAT_DATA_UINT64)
3346                                 physmem = knp->value.ui64;
3347                         else if (knp->data_type == KSTAT_DATA_UINT32)
3348                                 physmem = knp->value.ui32;
3349                         else
3350                                 return;
3351                 }
3352                 knp = kstat_data_lookup(kstat, "pp_kernel");
3353                 if (knp == NULL) {
3354                         zsd_warn(gettext("Unable to read pp_kernel"));
3355                 } else {
3356                         if (knp->data_type == KSTAT_DATA_UINT64)
3357                                 pp_kernel = knp->value.ui64;
3358                         else if (knp->data_type == KSTAT_DATA_UINT32)
3359                                 pp_kernel = knp->value.ui32;
3360                         else
3361                                 return;
3362                 }
3363         }
3364         physmem *= ctl->zsctl_pagesize;
3365         pp_kernel *= ctl->zsctl_pagesize;
3366 
3367         /* get the zfs arc size if available */
3368         arc_size = 0;
3369         kid = -1;
3370         kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3371         if (kstat != NULL)
3372                 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3373         if (kid != -1) {
3374                 knp = kstat_data_lookup(kstat, "size");
3375                 if (knp != NULL)
3376                         if (knp->data_type == KSTAT_DATA_UINT64)
3377                                 arc_size = knp->value.ui64;
3378         }
3379 
3380         /* Try to get swap information */
3381         if (swapctl(SC_AINFO, &ani) < 0) {
3382                 zsd_warn(gettext("Unable to get swap info"));
3383                 return;
3384         }
3385 
3386 vmusage_again:
3387         /* getvmusage to get physical memory usage */
3388         vmusage = ctl->zsctl_vmusage_cache;
3389         num_vmusage = ctl->zsctl_vmusage_cache_num;
3390 
3391         ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3392             vmusage, &num_vmusage);
3393 
3394         if (ret != 0) {
3395                 /* Unexpected error.  Use existing data */
3396                 if (errno != EOVERFLOW) {
3397                         zsd_warn(gettext(
3398                             "Unable to read physical memory usage"));
3399                         phys_zones = sys->zss_ram_zones;
3400                         goto vmusage_done;
3401                 }
3402         }
3403         /* vmusage results cache too small */
3404         if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3405 
3406                 size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3407 
3408                 if (ctl->zsctl_vmusage_cache != NULL)
3409                         free(ctl->zsctl_vmusage_cache);
3410                 vmusage = (zsd_vmusage64_t *)malloc(size);
3411                 if (vmusage == NULL) {
3412                         zsd_warn(gettext("Unable to alloc to determine "
3413                             "physical memory usage"));
3414                         phys_zones = sys->zss_ram_zones;
3415                         goto vmusage_done;
3416                 }
3417                 ctl->zsctl_vmusage_cache = vmusage;
3418                 ctl->zsctl_vmusage_cache_num = num_vmusage;
3419                 goto vmusage_again;
3420         }
3421 
3422         phys_zones_overcount = 0;
3423         vmu_nzones = 0;
3424         for (i = 0; i < num_vmusage; i++) {
3425                 switch (vmusage[i].vmu_type) {
3426                 case VMUSAGE_SYSTEM:
3427                         /* total pages backing user process mappings */
3428                         phys_zones = sys->zss_ram_zones =
3429                             vmusage[i].vmu_rss_all;
3430                         break;
3431                 case VMUSAGE_ZONE:
3432                         vmu_nzones++;
3433                         phys_zones_overcount += vmusage[i].vmu_rss_all;
3434                         zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3435                         if (zone != NULL)
3436                                 zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3437                         break;
3438                 default:
3439                         break;
3440                 }
3441         }
3442         /*
3443          * Figure how much memory was double counted due to text sharing
3444          * between zones.  Credit this back so that the sum of the zones
3445          * equals the total zone ram usage;
3446          */
3447         phys_zones_extra = phys_zones_overcount - phys_zones;
3448         phys_zones_credit = phys_zones_extra / vmu_nzones;
3449 
3450 vmusage_done:
3451 
3452         /* walk the zones to get swap and locked kstats.  Fetch ram cap. */
3453         sys->zss_locked_zones = 0;
3454         sys->zss_vm_zones = 0;
3455         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3456             zone = list_next(&ctl->zsctl_zones, zone)) {
3457 
3458                 /* If zone halted during interval, show memory usage as none */
3459                 if (zone->zsz_active == B_FALSE ||
3460                     zone->zsz_deleted == B_TRUE) {
3461                         zone->zsz_usage_ram = 0;
3462                         zone->zsz_usage_vm = 0;
3463                         zone->zsz_usage_locked = 0;
3464                         continue;
3465                 }
3466 
3467                 if (phys_zones_credit > 0) {
3468                         if (zone->zsz_usage_ram > phys_zones_credit) {
3469                                 zone->zsz_usage_ram -= phys_zones_credit;
3470                         }
3471                 }
3472                 /*
3473                  * Get zone's swap usage.  Since zone could have halted,
3474                  * treats as zero if cannot read
3475                  */
3476                 zone->zsz_usage_vm = 0;
3477                 (void) snprintf(kstat_name, sizeof (kstat_name),
3478                     "swapresv_zone_%d", zone->zsz_id);
3479                 kid = -1;
3480                 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3481                     zone->zsz_id, kstat_name);
3482                 if (kstat != NULL)
3483                         kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3484                 if (kid != -1) {
3485                         knp = kstat_data_lookup(kstat, "usage");
3486                         if (knp != NULL &&
3487                             knp->data_type == KSTAT_DATA_UINT64) {
3488                                 zone->zsz_usage_vm = knp->value.ui64;
3489                                 sys->zss_vm_zones += knp->value.ui64;
3490                         }
3491                 }
3492                 /*
3493                  * Get zone's locked usage.  Since zone could have halted,
3494                  * treats as zero if cannot read
3495                  */
3496                 zone->zsz_usage_locked = 0;
3497                 (void) snprintf(kstat_name, sizeof (kstat_name),
3498                     "lockedmem_zone_%d", zone->zsz_id);
3499                 kid = -1;
3500                 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3501                     zone->zsz_id, kstat_name);
3502                 if (kstat != NULL)
3503                         kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3504                 if (kid != -1) {
3505                         knp = kstat_data_lookup(kstat, "usage");
3506                         if (knp != NULL &&
3507                             knp->data_type == KSTAT_DATA_UINT64) {
3508                                 zone->zsz_usage_locked = knp->value.ui64;
3509                                 /*
3510                                  * Since locked memory accounting for zones
3511                                  * can double count ddi locked memory, cap each
3512                                  * zone's locked usage at its ram usage.
3513                                  */
3514                                 if (zone->zsz_usage_locked >
3515                                     zone->zsz_usage_ram)
3516                                         zone->zsz_usage_locked =
3517                                             zone->zsz_usage_ram;
3518                                 sys->zss_locked_zones +=
3519                                     zone->zsz_usage_locked;
3520                         }
3521                 }
3522         }
3523 
3524         phys_total =
3525             sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3526 
3527         phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3528             * ctl->zsctl_pagesize;
3529 
3530         /* Compute remaining statistics */
3531         sys->zss_ram_total = phys_total;
3532         sys->zss_ram_zones = phys_zones;
3533         sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3534 
3535         /*
3536          * The total for kernel locked memory should include
3537          * segkp locked pages, but oh well.  The arc size is subtracted,
3538          * as that physical memory is reclaimable.
3539          */
3540         sys->zss_locked_kern = pp_kernel - arc_size;
3541         /* Add memory used by kernel startup and obp to kernel locked */
3542         if ((phys_total - physmem) > 0)
3543                 sys->zss_locked_kern += phys_total - physmem;
3544 
3545         /*
3546          * Add in the portion of (RAM+DISK) that is not available as swap,
3547          * and consider it swap used by the kernel.
3548          */
3549         sys->zss_vm_total = phys_total + disk_swap_total;
3550         vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3551         vm_used = sys->zss_vm_total - vm_free;
3552         sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3553 }
3554 
3555 /*
3556  * Charge each cpu's usage to its processor sets.  Also add the cpu's total
3557  * time to each zone using the processor set.  This tracks the maximum
3558  * amount of cpu time that a zone could have used.
3559  */
3560 static void
3561 zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
3562 {
3563         zsd_system_t *sys;
3564         zsd_zone_t *zone;
3565         zsd_pset_usage_t *usage;
3566         zsd_cpu_t *cpu;
3567         zsd_cpu_t *cpu_next;
3568         zsd_pset_t *pset;
3569         timestruc_t ts;
3570         uint64_t hrtime;
3571         timestruc_t delta;
3572 
3573         /* Update the per-cpu kstat data */
3574         cpu_next = list_head(&ctl->zsctl_cpus);
3575         while (cpu_next != NULL) {
3576                 cpu = cpu_next;
3577                 cpu_next = list_next(&ctl->zsctl_cpus, cpu);
3578                 zsd_update_cpu_stats(ctl, cpu);
3579         }
3580         /* Update the elapsed real time */
3581         hrtime = gethrtime();
3582         if (init) {
3583                 /* first time around, store hrtime for future comparision */
3584                 ctl->zsctl_hrtime = hrtime;
3585                 ctl->zsctl_hrtime_prev = hrtime;
3586 
3587         } else {
3588                 /* Compute increase in hrtime since the most recent read */
3589                 ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
3590                 ctl->zsctl_hrtime = hrtime;
3591                 if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
3592                         TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
3593         }
3594 
3595         /* On initialization, all psets have zero time  */
3596         if (init)
3597                 return;
3598 
3599         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
3600             pset = list_next(&ctl->zsctl_psets, pset)) {
3601 
3602                 if (pset->zsp_active == B_FALSE) {
3603                         zsd_warn(gettext("Internal error,inactive pset found"));
3604                         continue;
3605                 }
3606 
3607                 /* sum total used time for pset */
3608                 ts.tv_sec = 0;
3609                 ts.tv_nsec = 0;
3610                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
3611                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
3612                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
3613                 /* kernel time in pset is total time minus zone time */
3614                 TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
3615                     pset->zsp_usage_zones);
3616                 if (pset->zsp_usage_kern.tv_sec < 0 ||
3617                     pset->zsp_usage_kern.tv_nsec < 0) {
3618                         pset->zsp_usage_kern.tv_sec = 0;
3619                         pset->zsp_usage_kern.tv_nsec = 0;
3620                 }
3621                 /* Total pset elapsed time is used time plus idle time */
3622                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);
3623 
3624                 TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);
3625 
3626                 for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
3627                     usage = list_next(&pset->zsp_usage_list, usage)) {
3628 
3629                         zone = usage->zsu_zone;
3630                         if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
3631                             usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
3632                             usage->zsu_cpu_shares != 0) {
3633                                 /*
3634                                  * Figure out how many nanoseconds of share time
3635                                  * to give to the zone
3636                                  */
3637                                 hrtime = delta.tv_sec;
3638                                 hrtime *= NANOSEC;
3639                                 hrtime += delta.tv_nsec;
3640                                 hrtime *= usage->zsu_cpu_shares;
3641                                 hrtime /= pset->zsp_cpu_shares;
3642                                 TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
3643                                     hrtime);
3644                         }
3645                         /* Add pset time to each zone using pset */
3646                         TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);
3647 
3648                         zone->zsz_cpus_online += pset->zsp_online;
3649                 }
3650                 pset->zsp_total_time = ts;
3651         }
3652 
3653         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3654             zone = list_next(&ctl->zsctl_zones, zone)) {
3655 
3656                 /* update cpu cap tracking if the zone has a cpu cap */
3657                 if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
3658                         uint64_t elapsed;
3659 
3660                         elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
3661                         elapsed *= zone->zsz_cpu_cap;
3662                         elapsed = elapsed / 100;
3663                         TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
3664                 }
3665         }
3666         sys = ctl->zsctl_system;
3667         ts.tv_sec = 0;
3668         ts.tv_nsec = 0;
3669         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
3670         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
3671         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);
3672 
3673         /* kernel time in pset is total time minus zone time */
3674         TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
3675             sys->zss_cpu_usage_zones);
3676         if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
3677             sys->zss_cpu_usage_kern.tv_nsec < 0) {
3678                 sys->zss_cpu_usage_kern.tv_sec = 0;
3679                 sys->zss_cpu_usage_kern.tv_nsec = 0;
3680         }
3681         /* Total pset elapsed time is used time plus idle time */
3682         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
3683         sys->zss_cpu_total_time = ts;
3684 }
3685 
3686 /*
3687  * Saves current usage data to a cache that is read by libzonestat when
3688  * calling zs_usage_read().
3689  *
3690  * All pointers in the cached data structure are set to NULL.  When
3691  * libzonestat reads the cached data, it will set the pointers relative to
3692  * its address space.
3693  */
3694 static void
3695 zsd_usage_cache_update(zsd_ctl_t *ctl)
3696 {
3697         zs_usage_cache_t *cache;
3698         zs_usage_cache_t *old;
3699         zs_usage_t *usage;
3700 
3701         zs_system_t *sys;
3702         zsd_system_t *dsys;
3703         zs_zone_t *zone = NULL;
3704         zsd_zone_t *dzone;
3705         zs_pset_t *pset = NULL;
3706         zsd_pset_t *dpset;
3707         zs_pset_zone_t *pusage;
3708         zsd_pset_usage_t *dpusage;
3709 
3710         char *next;
3711         uint_t size, i, j;
3712 
3713         size =
3714             sizeof (zs_usage_cache_t) +
3715             sizeof (zs_usage_t) +
3716             sizeof (zs_system_t) +
3717             sizeof (zs_zone_t) * ctl->zsctl_nzones +
3718             sizeof (zs_pset_t) *  ctl->zsctl_npsets +
3719             sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3720 
3721         cache = (zs_usage_cache_t *)malloc(size);
3722         if (cache == NULL) {
3723                 zsd_warn(gettext("Unable to allocate usage cache\n"));
3724                 return;
3725         }
3726 
3727         next = (char *)cache;
3728         cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3729         next += sizeof (zs_usage_cache_t);
3730 
3731         /* LINTED */
3732         usage = cache->zsuc_usage = (zs_usage_t *)next;
3733         next += sizeof (zs_usage_t);
3734         usage->zsu_start = g_start;
3735         usage->zsu_hrstart = g_hrstart;
3736         usage->zsu_time = g_now;
3737         usage->zsu_hrtime = g_hrnow;
3738         usage->zsu_nzones = ctl->zsctl_nzones;
3739         usage->zsu_npsets = ctl->zsctl_npsets;
3740         usage->zsu_system = NULL;
3741 
3742         /* LINTED */
3743         sys = (zs_system_t *)next;
3744         next += sizeof (zs_system_t);
3745         dsys = ctl->zsctl_system;
3746         sys->zss_ram_total = dsys->zss_ram_total;
3747         sys->zss_ram_kern = dsys->zss_ram_kern;
3748         sys->zss_ram_zones = dsys->zss_ram_zones;
3749         sys->zss_locked_kern = dsys->zss_locked_kern;
3750         sys->zss_locked_zones = dsys->zss_locked_zones;
3751         sys->zss_vm_total = dsys->zss_vm_total;
3752         sys->zss_vm_kern = dsys->zss_vm_kern;
3753         sys->zss_vm_zones = dsys->zss_vm_zones;
3754         sys->zss_swap_total = dsys->zss_swap_total;
3755         sys->zss_swap_used = dsys->zss_swap_used;
3756         sys->zss_ncpus = dsys->zss_ncpus;
3757         sys->zss_ncpus_online = dsys->zss_ncpus_online;
3758 
3759         sys->zss_processes_max = dsys->zss_maxpid;
3760         sys->zss_lwps_max = dsys->zss_lwps_max;
3761         sys->zss_shm_max = dsys->zss_shm_max;
3762         sys->zss_shmids_max = dsys->zss_shmids_max;
3763         sys->zss_semids_max = dsys->zss_semids_max;
3764         sys->zss_msgids_max = dsys->zss_msgids_max;
3765         sys->zss_lofi_max = dsys->zss_lofi_max;
3766 
3767         sys->zss_processes = dsys->zss_processes;
3768         sys->zss_lwps = dsys->zss_lwps;
3769         sys->zss_shm = dsys->zss_shm;
3770         sys->zss_shmids = dsys->zss_shmids;
3771         sys->zss_semids = dsys->zss_semids;
3772         sys->zss_msgids = dsys->zss_msgids;
3773         sys->zss_lofi = dsys->zss_lofi;
3774 
3775         sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3776         sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3777         sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
3778 
3779         for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3780             i < ctl->zsctl_nzones;
3781             i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3782                 /* LINTED */
3783                 zone = (zs_zone_t *)next;
3784                 next += sizeof (zs_zone_t);
3785                 list_link_init(&zone->zsz_next);
3786                 zone->zsz_system = NULL;
3787 
3788                 (void) strlcpy(zone->zsz_name, dzone->zsz_name,
3789                     sizeof (zone->zsz_name));
3790                 (void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3791                     sizeof (zone->zsz_pool));
3792                 (void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3793                     sizeof (zone->zsz_pset));
3794                 zone->zsz_id = dzone->zsz_id;
3795                 zone->zsz_cputype = dzone->zsz_cputype;
3796                 zone->zsz_iptype = dzone->zsz_iptype;
3797                 zone->zsz_start = dzone->zsz_start;
3798                 zone->zsz_hrstart = dzone->zsz_hrstart;
3799                 zone->zsz_scheds = dzone->zsz_scheds;
3800                 zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3801                 zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3802                 zone->zsz_ram_cap = dzone->zsz_ram_cap;
3803                 zone->zsz_vm_cap = dzone->zsz_vm_cap;
3804                 zone->zsz_locked_cap = dzone->zsz_locked_cap;
3805                 zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3806                 zone->zsz_cpus_online = dzone->zsz_cpus_online;
3807                 zone->zsz_pset_time = dzone->zsz_pset_time;
3808                 zone->zsz_cap_time = dzone->zsz_cap_time;
3809                 zone->zsz_share_time = dzone->zsz_share_time;
3810                 zone->zsz_usage_ram = dzone->zsz_usage_ram;
3811                 zone->zsz_usage_locked = dzone->zsz_usage_locked;
3812                 zone->zsz_usage_vm = dzone->zsz_usage_vm;
3813 
3814                 zone->zsz_processes_cap = dzone->zsz_processes_cap;
3815                 zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3816                 zone->zsz_shm_cap = dzone->zsz_shm_cap;
3817                 zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3818                 zone->zsz_semids_cap = dzone->zsz_semids_cap;
3819                 zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3820                 zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3821 
3822                 zone->zsz_processes = dzone->zsz_processes;
3823                 zone->zsz_lwps = dzone->zsz_lwps;
3824                 zone->zsz_shm = dzone->zsz_shm;
3825                 zone->zsz_shmids = dzone->zsz_shmids;
3826                 zone->zsz_semids = dzone->zsz_semids;
3827                 zone->zsz_msgids = dzone->zsz_msgids;
3828                 zone->zsz_lofi = dzone->zsz_lofi;
3829         }
3830 
3831         for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3832             i < ctl->zsctl_npsets;
3833             i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3834                 /* LINTED */
3835                 pset = (zs_pset_t *)next;
3836                 next += sizeof (zs_pset_t);
3837                 list_link_init(&pset->zsp_next);
3838                 (void) strlcpy(pset->zsp_name, dpset->zsp_name,
3839                     sizeof (pset->zsp_name));
3840                 pset->zsp_id = dpset->zsp_id;
3841                 pset->zsp_cputype = dpset->zsp_cputype;
3842                 pset->zsp_start = dpset->zsp_start;
3843                 pset->zsp_hrstart = dpset->zsp_hrstart;
3844                 pset->zsp_online = dpset->zsp_online;
3845                 pset->zsp_size = dpset->zsp_size;
3846                 pset->zsp_min = dpset->zsp_min;
3847                 pset->zsp_max = dpset->zsp_max;
3848                 pset->zsp_importance = dpset->zsp_importance;
3849                 pset->zsp_scheds = dpset->zsp_scheds;
3850                 pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3851                 pset->zsp_total_time = dpset->zsp_total_time;
3852                 pset->zsp_usage_kern = dpset->zsp_usage_kern;
3853                 pset->zsp_usage_zones = dpset->zsp_usage_zones;
3854                 pset->zsp_nusage = dpset->zsp_nusage;
3855                 /* Add pset usages for pset */
3856                 for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3857                     j < dpset->zsp_nusage;
3858                     j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3859                         /* LINTED */
3860                         pusage = (zs_pset_zone_t *)next;
3861                         next += sizeof (zs_pset_zone_t);
3862                         /* pointers are computed by client */
3863                         pusage->zspz_pset = NULL;
3864                         pusage->zspz_zone = NULL;
3865                         list_link_init(&pusage->zspz_next);
3866                         pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3867                         pusage->zspz_start = dpusage->zsu_start;
3868                         pusage->zspz_hrstart = dpusage->zsu_hrstart;
3869                         pusage->zspz_hrstart = dpusage->zsu_hrstart;
3870                         pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3871                         pusage->zspz_scheds = dpusage->zsu_scheds;
3872                         pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
3873                 }
3874         }
3875 
3876         /* Update the current cache pointer */
3877         (void) mutex_lock(&g_usage_cache_lock);
3878                 old = g_usage_cache;
3879                 cache->zsuc_ref = 1;
3880                 cache->zsuc_gen = g_gen_next;
3881                 usage->zsu_gen = g_gen_next;
3882                 usage->zsu_size = size;
3883                 g_usage_cache = cache;
3884                 if (old != NULL) {
3885                         old->zsuc_ref--;
3886                         if (old->zsuc_ref == 0)
3887                                 free(old);
3888                 }
3889                 g_gen_next++;
3890         /* Wake up any clients that are waiting for this calculation */
3891         if (g_usage_cache_kickers > 0) {
3892                 (void) cond_broadcast(&g_usage_cache_wait);
3893         }
3894         (void) mutex_unlock(&g_usage_cache_lock);
3895 }
3896 
3897 static zs_usage_cache_t *
3898 zsd_usage_cache_hold_locked()
3899 {
3900         zs_usage_cache_t *ret;
3901 
3902         ret = g_usage_cache;
3903         ret->zsuc_ref++;
3904         return (ret);
3905 }
3906 
3907 void
3908 zsd_usage_cache_rele(zs_usage_cache_t *cache)
3909 {
3910         (void) mutex_lock(&g_usage_cache_lock);
3911         cache->zsuc_ref--;
3912         if (cache->zsuc_ref == 0)
3913                 free(cache);
3914         (void) mutex_unlock(&g_usage_cache_lock);
3915 }
3916 
3917 /* Close the handles held by zsd_open() */
3918 void
3919 zsd_close(zsd_ctl_t *ctl)
3920 {
3921         zsd_zone_t *zone;
3922         zsd_pset_t *pset;
3923         zsd_pset_usage_t *usage;
3924         zsd_cpu_t *cpu;
3925         int id;
3926 
3927         if (ctl->zsctl_kstat_ctl) {
3928                 (void) kstat_close(ctl->zsctl_kstat_ctl);
3929                 ctl->zsctl_kstat_ctl = NULL;
3930         }
3931         if (ctl->zsctl_proc_open) {
3932                 (void) ea_close(&ctl->zsctl_proc_eaf);
3933                 ctl->zsctl_proc_open = 0;
3934                 ctl->zsctl_proc_fd = -1;
3935         }
3936         if (ctl->zsctl_pool_conf) {
3937                 if (ctl->zsctl_pool_status == POOL_ENABLED)
3938                         (void) pool_conf_close(ctl->zsctl_pool_conf);
3939                 ctl->zsctl_pool_status = POOL_DISABLED;
3940         }
3941 
3942         while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3943                 list_remove(&ctl->zsctl_zones, zone);
3944                 free(zone);
3945                 ctl->zsctl_nzones--;
3946         }
3947 
3948         while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3949                 while ((usage = list_head(&pset->zsp_usage_list))
3950                     != NULL) {
3951                         list_remove(&pset->zsp_usage_list, usage);
3952                         ctl->zsctl_npset_usages--;
3953                         free(usage);
3954                 }
3955                 list_remove(&ctl->zsctl_psets, pset);
3956                 free(pset);
3957                 ctl->zsctl_npsets--;
3958         }
3959 
3960         /* Release all cpus being tracked */
3961         while (cpu = list_head(&ctl->zsctl_cpus)) {
3962                 list_remove(&ctl->zsctl_cpus, cpu);
3963                 id = cpu->zsc_id;
3964                 bzero(cpu, sizeof (zsd_cpu_t));
3965                 cpu->zsc_id = id;
3966                 cpu->zsc_allocated = B_FALSE;
3967                 cpu->zsc_psetid = ZS_PSET_ERROR;
3968                 cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3969         }
3970 
3971         assert(ctl->zsctl_npset_usages == 0);
3972         assert(ctl->zsctl_npsets == 0);
3973         assert(ctl->zsctl_nzones == 0);
3974         (void) zsd_disable_cpu_stats();
3975 }
3976 
3977 
3978 /*
3979  * Update the utilization data for all zones and processor sets.
3980  */
3981 static int
3982 zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
3983 {
3984         (void) kstat_chain_update(ctl->zsctl_kstat_ctl);
3985         (void) gettimeofday(&(ctl->zsctl_timeofday), NULL);
3986 
3987         zsd_refresh_system(ctl);
3988 
3989         /*
3990          * Memory calculation is expensive.  Only update it on sample
3991          * intervals.
3992          */
3993         if (do_memory == B_TRUE)
3994                 zsd_refresh_memory(ctl, init);
3995         zsd_refresh_zones(ctl);
3996         zsd_refresh_psets(ctl);
3997         zsd_refresh_procs(ctl, init);
3998         zsd_refresh_cpu_stats(ctl, init);
3999 
4000         /*
4001          * Delete objects that no longer exist.
4002          * Pset usages must be deleted first as they point to zone and
4003          * pset objects.
4004          */
4005         zsd_mark_pset_usages_end(ctl);
4006         zsd_mark_psets_end(ctl);
4007         zsd_mark_cpus_end(ctl);
4008         zsd_mark_zones_end(ctl);
4009 
4010         /*
4011          * Save results for clients.
4012          */
4013         zsd_usage_cache_update(ctl);
4014 
4015         /*
4016          * Roll process accounting file.
4017          */
4018         (void) zsd_roll_exacct();
4019         return (0);
4020 }
4021 
4022 /*
4023  * Get the system rctl, which is the upper most limit
4024  */
4025 static uint64_t
4026 zsd_get_system_rctl(char *name)
4027 {
4028         rctlblk_t *rblk, *rblk_last;
4029 
4030         rblk = (rctlblk_t *)alloca(rctlblk_size());
4031         rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4032 
4033         if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4034                 return (ZS_LIMIT_NONE);
4035 
4036         while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4037                 (void) bcopy(rblk, rblk_last, rctlblk_size());
4038 
4039         return (rctlblk_get_value(rblk_last));
4040 }
4041 
4042 /*
4043  * Open any necessary subsystems for collecting utilization data,
4044  * allocate and initialize data structures, and get initial utilization.
4045  *
4046  * Errors:
4047  *      ENOMEM  out of memory
4048  *      EINVAL  other error
4049  */
4050 static zsd_ctl_t *
4051 zsd_open(zsd_ctl_t *ctl)
4052 {
4053         zsd_system_t *system;
4054 
4055         char path[MAXPATHLEN];
4056         long pathmax;
4057         struct statvfs svfs;
4058         int ret;
4059         int i;
4060         size_t size;
4061         int err;
4062 
4063         if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4064             sizeof (zsd_ctl_t))) == NULL) {
4065                         zsd_warn(gettext("Out of Memory"));
4066                         errno = ENOMEM;
4067                         goto err;
4068         }
4069         ctl->zsctl_proc_fd = -1;
4070 
4071         /* open kstats */
4072         if (ctl->zsctl_kstat_ctl == NULL &&
4073             (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4074                 err = errno;
4075                 zsd_warn(gettext("Unable to open kstats"));
4076                 errno = err;
4077                 if (errno != ENOMEM)
4078                         errno = EAGAIN;
4079                 goto err;
4080         }
4081 
4082         /*
4083          * These are set when the accounting file is opened by
4084          * zsd_update_procs()
4085          */
4086         ctl->zsctl_proc_fd = -1;
4087         ctl->zsctl_proc_fd_next = -1;
4088         ctl->zsctl_proc_open = 0;
4089         ctl->zsctl_proc_open_next = 0;
4090 
4091 check_exacct:
4092         (void) zsd_enable_cpu_stats();
4093 
4094         /* Create structures to track usage */
4095         if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4096             calloc(1, sizeof (zsd_system_t))) == NULL) {
4097                 ret = -1;
4098                 zsd_warn(gettext("Out of Memory"));
4099                 errno = ENOMEM;
4100                 goto err;
4101         }
4102         system = ctl->zsctl_system;
4103         /* get the kernel bitness to know structure layout for getvmusage */
4104         ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4105         if (ret < 0)
4106                 ctl->zsctl_kern_bits = 32;
4107         else
4108                 ctl->zsctl_kern_bits = 64;
4109         ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4110 
4111         size = sysconf(_SC_CPUID_MAX);
4112         ctl->zsctl_maxcpuid = size;
4113         if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4114             (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4115                 zsd_warn(gettext("Out of Memory"));
4116                 errno = ENOMEM;
4117                 goto err;
4118         }
4119         for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4120                 ctl->zsctl_cpu_array[i].zsc_id = i;
4121                 ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4122                 ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4123                 ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4124         }
4125         if (statvfs("/proc", &svfs) != 0 ||
4126             strcmp("/proc", svfs.f_fstr) != 0) {
4127                 zsd_warn(gettext("/proc not a procfs filesystem"));
4128                 errno = EINVAL;
4129                 goto err;
4130         }
4131 
4132         size = sysconf(_SC_MAXPID) + 1;
4133         ctl->zsctl_maxproc = size;
4134         if (ctl->zsctl_proc_array == NULL &&
4135             (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4136             sizeof (zsd_proc_t))) == NULL) {
4137                 zsd_warn(gettext("Out of Memory"));
4138                 errno = ENOMEM;
4139                 goto err;
4140         }
4141         for (i = 0; i <= ctl->zsctl_maxproc; i++) {
4142                 list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4143                 ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4144                 ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4145                 ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4146                 ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4147                 ctl->zsctl_proc_array[i].zspr_ppid = -1;
4148         }
4149 
4150         list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4151             offsetof(zsd_zone_t, zsz_next));
4152 
4153         list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4154             offsetof(zsd_pset_t, zsp_next));
4155 
4156         list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4157             offsetof(zsd_cpu_t, zsc_next));
4158 
4159         pathmax = pathconf("/proc", _PC_NAME_MAX);
4160         if (pathmax < 0) {
4161                 zsd_warn(gettext("Unable to determine max path of /proc"));
4162                 errno = EINVAL;
4163                 goto err;
4164         }
4165         size = sizeof (struct dirent) + pathmax + 1;
4166 
4167         ctl->zsctl_procfs_dent_size = size;
4168         if (ctl->zsctl_procfs_dent == NULL &&
4169             (ctl->zsctl_procfs_dent = (struct dirent *)calloc(1, size))
4170             == NULL) {
4171                 zsd_warn(gettext("Out of Memory"));
4172                 errno = ENOMEM;
4173                 goto err;
4174         }
4175 
4176         if (ctl->zsctl_pool_conf == NULL &&
4177             (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4178                 zsd_warn(gettext("Out of Memory"));
4179                 errno = ENOMEM;
4180                 goto err;
4181         }
4182         ctl->zsctl_pool_status = POOL_DISABLED;
4183         ctl->zsctl_pool_changed = 0;
4184 
4185         if (ctl->zsctl_pool_vals[0] == NULL &&
4186             (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4187                 zsd_warn(gettext("Out of Memory"));
4188                 errno = ENOMEM;
4189                 goto err;
4190         }
4191         if (ctl->zsctl_pool_vals[1] == NULL &&
4192             (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4193                 zsd_warn(gettext("Out of Memory"));
4194                 errno = ENOMEM;
4195                 goto err;
4196         }
4197         ctl->zsctl_pool_vals[2] = NULL;
4198 
4199         /*
4200          * get system limits
4201          */
4202         system->zss_maxpid = size = sysconf(_SC_MAXPID);
4203         system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4204         system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4205         system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4206         system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4207         system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4208         system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4209         system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4210 
4211         g_gen_next = 1;
4212 
4213         if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4214                 zsd_warn(gettext("Reading zone statistics failed"));
4215 
4216         return (ctl);
4217 err:
4218         if (ctl)
4219                 zsd_close(ctl);
4220 
4221         return (NULL);
4222 }
4223 
4224 /* Copy utilization data to buffer, filtering data if non-global zone. */
4225 static void
4226 zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
4227     boolean_t is_gz)
4228 {
4229         zs_usage_t *cusage;
4230         zs_system_t *sys, *csys;
4231         zs_zone_t *zone, *czone;
4232         zs_pset_t *pset, *cpset;
4233         zs_pset_zone_t *pz, *cpz, *foundpz;
4234         size_t size = 0, csize = 0;
4235         char *start, *cstart;
4236         int i, j;
4237         timestruc_t delta;
4238 
4239         /* Privileged users in the global zone get everything */
4240         if (is_gz) {
4241                 cusage = cache->zsuc_usage;
4242                 (void) bcopy(cusage, usage, cusage->zsu_size);
4243                 return;
4244         }
4245 
4246         /* Zones just get their own usage */
4247         cusage = cache->zsuc_usage;
4248 
4249         start = (char *)usage;
4250         cstart = (char *)cusage;
4251         size += sizeof (zs_usage_t);
4252         csize += sizeof (zs_usage_t);
4253 
4254         usage->zsu_start = cusage->zsu_start;
4255         usage->zsu_hrstart = cusage->zsu_hrstart;
4256         usage->zsu_time = cusage->zsu_time;
4257         usage->zsu_hrtime = cusage->zsu_hrtime;
4258         usage->zsu_gen = cusage->zsu_gen;
4259         usage->zsu_nzones = 1;
4260         usage->zsu_npsets = 0;
4261 
4262         /* LINTED */
4263         sys = (zs_system_t *)(start + size);
4264         /* LINTED */
4265         csys = (zs_system_t *)(cstart + csize);
4266         size += sizeof (zs_system_t);
4267         csize += sizeof (zs_system_t);
4268 
4269         /* Save system limits but not usage */
4270         *sys = *csys;
4271         sys->zss_ncpus = 0;
4272         sys->zss_ncpus_online = 0;
4273 
4274         /* LINTED */
4275         zone = (zs_zone_t *)(start + size);
4276         /* LINTED */
4277         czone = (zs_zone_t *)(cstart + csize);
4278         /* Find the matching zone */
4279         for (i = 0; i < cusage->zsu_nzones; i++) {
4280                 if (czone->zsz_id == zid) {
4281                         *zone = *czone;
4282                         size += sizeof (zs_zone_t);
4283                 }
4284                 csize += sizeof (zs_zone_t);
4285                 /* LINTED */
4286                 czone = (zs_zone_t *)(cstart + csize);
4287         }
4288         sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
4289         sys->zss_ram_zones = zone->zsz_usage_ram;
4290 
4291         sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
4292         sys->zss_vm_zones = zone->zsz_usage_vm;
4293 
4294         sys->zss_locked_kern += (sys->zss_locked_zones -
4295             zone->zsz_usage_locked);
4296         sys->zss_locked_zones = zone->zsz_usage_locked;
4297 
4298         TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
4299         TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
4300         sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;
4301 
4302         /* LINTED */
4303         pset = (zs_pset_t *)(start + size);
4304         /* LINTED */
4305         cpset = (zs_pset_t *)(cstart + csize);
4306         for (i = 0; i < cusage->zsu_npsets; i++) {
4307                 csize += sizeof (zs_pset_t);
4308                 /* LINTED */
4309                 cpz = (zs_pset_zone_t *)(csize + cstart);
4310                 foundpz = NULL;
4311                 for (j = 0; j < cpset->zsp_nusage; j++) {
4312                         if (cpz->zspz_zoneid == zid)
4313                                 foundpz = cpz;
4314 
4315                         csize += sizeof (zs_pset_zone_t);
4316                         /* LINTED */
4317                         cpz = (zs_pset_zone_t *)(csize + cstart);
4318                 }
4319                 if (foundpz != NULL) {
4320                         size += sizeof (zs_pset_t);
4321                         /* LINTED */
4322                         pz = (zs_pset_zone_t *)(start + size);
4323                         size += sizeof (zs_pset_zone_t);
4324 
4325                         *pset = *cpset;
4326                         *pz = *foundpz;
4327 
4328                         TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
4329                             pz->zspz_cpu_usage);
4330                         TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
4331                         pset->zsp_usage_zones = pz->zspz_cpu_usage;
4332                         pset->zsp_nusage = 1;
4333                         usage->zsu_npsets++;
4334                         sys->zss_ncpus += pset->zsp_size;
4335                         sys->zss_ncpus_online += pset->zsp_online;
4336                 }
4337                 /* LINTED */
4338                 cpset = (zs_pset_t *)(cstart + csize);
4339         }
4340         usage->zsu_size = size;
4341 }
4342 
4343 /*
4344  * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
4345  * which reports new zones.
4346  */
4347 /* ARGSUSED */
4348 static void
4349 zsd_server(void *cookie, char *argp, size_t arg_size,
4350     door_desc_t *dp, uint_t n_desc)
4351 {
4352         int *args, cmd;
4353         door_desc_t door;
4354         ucred_t *ucred;
4355         const priv_set_t *eset;
4356 
4357         if (argp == DOOR_UNREF_DATA) {
4358                 (void) door_return(NULL, 0, NULL, 0);
4359                 thr_exit(NULL);
4360         }
4361 
4362         if (arg_size != sizeof (cmd) * 2) {
4363                 (void) door_return(NULL, 0, NULL, 0);
4364                 thr_exit(NULL);
4365         }
4366 
4367         /* LINTED */
4368         args = (int *)argp;
4369         cmd = args[0];
4370 
4371         /* If connection, return door to stat server */
4372         if (cmd == ZSD_CMD_CONNECT) {
4373 
4374                 /* Verify client compilation version */
4375                 if (args[1] != ZS_VERSION) {
4376                         args[1] = ZSD_STATUS_VERSION_MISMATCH;
4377                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4378                         thr_exit(NULL);
4379                 }
4380                 ucred = alloca(ucred_size());
4381                 /* Verify client permission */
4382                 if (door_ucred(&ucred) != 0) {
4383                         args[1] = ZSD_STATUS_INTERNAL_ERROR;
4384                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4385                         thr_exit(NULL);
4386                 }
4387 
4388                 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4389                 if (eset == NULL) {
4390                         args[1] = ZSD_STATUS_INTERNAL_ERROR;
4391                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4392                         thr_exit(NULL);
4393                 }
4394                 if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4395                         args[1] = ZSD_STATUS_PERMISSION;
4396                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4397                         thr_exit(NULL);
4398                 }
4399 
4400                 /* Return stat server door */
4401                 args[1] = ZSD_STATUS_OK;
4402                 door.d_attributes = DOOR_DESCRIPTOR;
4403                 door.d_data.d_desc.d_descriptor = g_stat_door;
4404                 (void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4405                 thr_exit(NULL);
4406         }
4407 
4408         /* Respond to zoneadmd informing zonestatd of a new zone */
4409         if (cmd == ZSD_CMD_NEW_ZONE) {
4410                 zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4411                 (void) door_return(NULL, 0, NULL, 0);
4412                 thr_exit(NULL);
4413         }
4414 
4415         args[1] = ZSD_STATUS_INTERNAL_ERROR;
4416         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4417         thr_exit(NULL);
4418 }
4419 
4420 /*
4421  * Respond to libzonestat.so clients with the current utlilzation data.
4422  */
4423 /* ARGSUSED */
4424 static void
4425 zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4426     door_desc_t *dp, uint_t n_desc)
4427 {
4428         uint64_t *args, cmd;
4429         zs_usage_cache_t *cache;
4430         int ret;
4431         char *rvalp;
4432         size_t rvals;
4433         zs_usage_t *usage;
4434         ucred_t *ucred;
4435         zoneid_t zoneid;
4436         const priv_set_t *eset;
4437         boolean_t is_gz = B_FALSE;
4438 
4439         /* Tell stat thread there are no more clients */
4440         if (argp == DOOR_UNREF_DATA) {
4441                 (void) mutex_lock(&g_usage_cache_lock);
4442                 g_hasclient = B_FALSE;
4443                 (void) cond_signal(&g_usage_cache_kick);
4444                 (void) mutex_unlock(&g_usage_cache_lock);
4445                 (void) door_return(NULL, 0, NULL, 0);
4446                 thr_exit(NULL);
4447         }
4448         if (arg_size != sizeof (cmd) * 2) {
4449                 (void) door_return(NULL, 0, NULL, 0);
4450                 thr_exit(NULL);
4451         }
4452         /* LINTED */
4453         args = (uint64_t *)argp;
4454         cmd = args[0];
4455         if (cmd != ZSD_CMD_READ) {
4456                 (void) door_return(NULL, 0, NULL, 0);
4457                 thr_exit(NULL);
4458         }
4459         ucred = alloca(ucred_size());
4460         if (door_ucred(&ucred) != 0) {
4461                 (void) door_return(NULL, 0, NULL, 0);
4462                 thr_exit(NULL);
4463         }
4464         zoneid = ucred_getzoneid(ucred);
4465 
4466         if (zoneid == GLOBAL_ZONEID)
4467                 is_gz = B_TRUE;
4468 
4469         eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4470         if (eset == NULL) {
4471                 (void) door_return(NULL, 0, NULL, 0);
4472                 thr_exit(NULL);
4473         }
4474         if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4475                 (void) door_return(NULL, 0, NULL, 0);
4476                 thr_exit(NULL);
4477         }
4478         (void) mutex_lock(&g_usage_cache_lock);
4479         g_hasclient = B_TRUE;
4480 
4481         /*
4482          * Force a new cpu calculation for client.  This will force a
4483          * new memory calculation if the memory data is older than the
4484          * sample period.
4485          */
4486         g_usage_cache_kickers++;
4487         (void) cond_signal(&g_usage_cache_kick);
4488         ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4489         g_usage_cache_kickers--;
4490         if (ret != 0 && errno == EINTR) {
4491                 (void) mutex_unlock(&g_usage_cache_lock);
4492                 zsd_warn(gettext(
4493                     "Interrupted before writing usage size to client\n"));
4494                 (void) door_return(NULL, 0, NULL, 0);
4495                 thr_exit(NULL);
4496         }
4497         cache = zsd_usage_cache_hold_locked();
4498         if (cache == NULL) {
4499                 zsd_warn(gettext("Usage cache empty.\n"));
4500                 (void) door_return(NULL, 0, NULL, 0);
4501                 thr_exit(NULL);
4502         }
4503         (void) mutex_unlock(&g_usage_cache_lock);
4504 
4505         /* Copy current usage data to stack to send to client */
4506         usage = (zs_usage_t *)alloca(cache->zsuc_size);
4507 
4508         /* Filter out results if caller is non-global zone */
4509         zsd_usage_filter(zoneid, cache, usage, is_gz);
4510 
4511         rvalp = (void *)usage;
4512         rvals = usage->zsu_size;
4513         zsd_usage_cache_rele(cache);
4514 
4515         (void) door_return(rvalp, rvals, 0, NULL);
4516         thr_exit(NULL);
4517 }
4518 
4519 static volatile boolean_t g_quit;
4520 
4521 /* ARGSUSED */
4522 static void
4523 zonestat_quithandler(int sig)
4524 {
4525         g_quit = B_TRUE;
4526 }
4527 
4528 /*
4529  * The stat thread generates new utilization data when clients request
4530  * it.  It also manages opening and closing the subsystems used to gather
4531  * data depending on if clients exist.
4532  */
4533 /* ARGSUSED */
4534 void *
4535 stat_thread(void *arg)
4536 {
4537         time_t start;
4538         time_t now;
4539         time_t next_memory;
4540         boolean_t do_memory;
4541         boolean_t do_read;
4542         boolean_t do_close;
4543 
4544         start = time(NULL);
4545         if (start < 0) {
4546                 if (g_quit == B_TRUE)
4547                         goto quit;
4548                 zsd_warn(gettext("Unable to fetch current time"));
4549                 g_quit = B_TRUE;
4550                 goto quit;
4551         }
4552 
4553         next_memory = start;
4554         while (g_quit == B_FALSE) {
4555                 for (;;) {
4556                         /*
4557                          * These are used to decide if the most recent memory
4558                          * calculation was within a sample interval,
4559                          * and weather or not the usage collection needs to
4560                          * be opened or closed.
4561                          */
4562                         do_memory = B_FALSE;
4563                         do_read = B_FALSE;
4564                         do_close = B_FALSE;
4565 
4566                         /*
4567                          * If all clients have gone, close usage collecting
4568                          */
4569                         (void) mutex_lock(&g_usage_cache_lock);
4570                         if (!g_hasclient && g_open == B_TRUE) {
4571                                 do_close = B_TRUE;
4572                                 (void) mutex_unlock(&g_usage_cache_lock);
4573                                 break;
4574                         }
4575                         if (g_quit == B_TRUE) {
4576                                 (void) mutex_unlock(
4577                                     &g_usage_cache_lock);
4578                                 break;
4579                         }
4580                         /*
4581                          * Wait for a usage data request
4582                          */
4583                         if (g_usage_cache_kickers == 0) {
4584                                 (void) cond_wait(&g_usage_cache_kick,
4585                                     &g_usage_cache_lock);
4586                         }
4587                         now = time(NULL);
4588                         if (now < 0) {
4589                                 if (g_quit == B_TRUE) {
4590                                         (void) mutex_unlock(
4591                                             &g_usage_cache_lock);
4592                                         goto quit;
4593                                 }
4594                                 g_quit = B_TRUE;
4595                                 (void) mutex_unlock(&g_usage_cache_lock);
4596                                 zsd_warn(gettext(
4597                                     "Unable to fetch current time"));
4598                                 goto quit;
4599                         }
4600                         if (g_hasclient) {
4601                                 do_read = B_TRUE;
4602                                 if (now >= next_memory) {
4603                                         do_memory = B_TRUE;
4604                                         next_memory = now + g_interval;
4605                                 }
4606                         } else {
4607                                 do_close = B_TRUE;
4608                         }
4609                         (void) mutex_unlock(&g_usage_cache_lock);
4610                         if (do_read || do_close)
4611                                 break;
4612                 }
4613                 g_now = now;
4614                 g_hrnow = gethrtime();
4615                 if (g_hasclient && g_open == B_FALSE) {
4616                         g_start = g_now;
4617                         g_hrstart = g_hrnow;
4618                         g_ctl = zsd_open(g_ctl);
4619                         if (g_ctl == NULL)
4620                                 zsd_warn(gettext(
4621                                     "Unable to open zone statistics"));
4622                         else
4623                                 g_open = B_TRUE;
4624                 }
4625                 if (do_read && g_ctl) {
4626                         if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4627                                 zsd_warn(gettext(
4628                                     "Unable to read zone statistics"));
4629                                 g_quit = B_TRUE;
4630                                 return (NULL);
4631                         }
4632                 }
4633                 (void) mutex_lock(&g_usage_cache_lock);
4634                 if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4635                         (void) mutex_unlock(&g_usage_cache_lock);
4636                         zsd_close(g_ctl);
4637                         g_open = B_FALSE;
4638                 } else {
4639                         (void) mutex_unlock(&g_usage_cache_lock);
4640                 }
4641         }
4642 quit:
4643         if (g_open)
4644                 zsd_close(g_ctl);
4645 
4646         (void) thr_kill(g_main, SIGINT);
4647         thr_exit(NULL);
4648         return (NULL);
4649 }
4650 
4651 void
4652 zsd_set_fx()
4653 {
4654         pcinfo_t pcinfo;
4655         pcparms_t pcparms;
4656 
4657         (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4658         if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4659                 zsd_warn(gettext("cannot get FX class parameters"));
4660                 return;
4661         }
4662         pcparms.pc_cid = pcinfo.pc_cid;
4663         ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4664         ((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4665         ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4666         ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4667         if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4668                 zsd_warn(gettext("cannot enter the FX class"));
4669 }
4670 
/* Write end of the pipe to the waiting parent; set in daemonize_start() */
static int pipe_fd;

/*
 * Report readiness to the parent: send the one-byte status code down the
 * pipe, then close the pipe.  Failures of either call are ignored.
 */
static void
daemonize_ready(char status)
{
        const char code = status;

        /* wake the parent with a clue */
        (void) write(pipe_fd, &code, sizeof (code));
        (void) close(pipe_fd);
}
4682 
4683 static int
4684 daemonize_start(void)
4685 {
4686         char data;
4687         int status;
4688 
4689         int filedes[2];
4690         pid_t pid;
4691 
4692         (void) close(0);
4693         (void) dup2(2, 1);
4694 
4695         if (pipe(filedes) < 0)
4696                 return (-1);
4697 
4698         (void) fflush(NULL);
4699 
4700         if ((pid = fork1()) < 0)
4701                 return (-1);
4702 
4703         if (pid != 0) {
4704                 /*
4705                  * parent
4706                  */
4707                 struct sigaction act;
4708 
4709                 act.sa_sigaction = SIG_DFL;
4710                 (void) sigemptyset(&act.sa_mask);
4711                 act.sa_flags = 0;
4712 
4713                 (void) sigaction(SIGPIPE, &act, NULL);  /* ignore SIGPIPE */
4714 
4715                 (void) close(filedes[1]);
4716                 if (read(filedes[0], &data, 1) == 1) {
4717                         /* forward ready code via exit status */
4718                         exit(data);
4719                 }
4720                 status = -1;
4721                 (void) wait4(pid, &status, 0, NULL);
4722                 /* daemon process exited before becoming ready */
4723                 if (WIFEXITED(status)) {
4724                         /* assume daemon process printed useful message */
4725                         exit(WEXITSTATUS(status));
4726                 } else {
4727                         zsd_warn(gettext("daemon process killed or died"));
4728                         exit(1);
4729                 }
4730         }
4731 
4732         /*
4733          * child
4734          */
4735         pipe_fd = filedes[1];
4736         (void) close(filedes[0]);
4737 
4738         /*
4739          * generic Unix setup
4740          */
4741         (void) setsid();
4742         (void) umask(0000);
4743 
4744         return (0);
4745 }
4746 
4747 static void
4748 fattach_all_zones(boolean_t detach_only)
4749 {
4750         zoneid_t *zids;
4751         uint_t nzids, nzids_last;
4752         int i;
4753 
4754 again:
4755         (void) zone_list(NULL, &nzids);
4756         nzids_last = nzids;
4757         zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4758         if (zids == NULL)
4759                 zsd_error(gettext("Out of memory"));
4760 
4761         (void) zone_list(zids, &nzids);
4762         if (nzids > nzids_last) {
4763                 free(zids);
4764                 goto again;
4765         }
4766         for (i = 0; i < nzids; i++)
4767                 zsd_fattach_zone(zids[i], g_server_door, detach_only);
4768 
4769         free(zids);
4770 }
4771 
4772 int
4773 main(int argc, char *argv[])
4774 {
4775 
4776         int arg;
4777         thread_t tid;
4778         scf_simple_prop_t *prop;
4779         uint64_t *intervalp;
4780         boolean_t opt_cleanup = B_FALSE;
4781 
4782         g_main = thr_self();
4783         g_quit = B_FALSE;
4784         (void) signal(SIGINT, zonestat_quithandler);
4785         (void) signal(SIGTERM, zonestat_quithandler);
4786         (void) signal(SIGHUP, zonestat_quithandler);
4787 /*      (void) sigignore(SIGCHLD); */
4788         (void) sigignore(SIGPIPE);
4789 
4790         if (getzoneid() != GLOBAL_ZONEID)
4791                 zsd_error(gettext("Must be run from global zone only"));
4792 
4793         while ((arg = getopt(argc, argv, "c"))
4794             != EOF) {
4795                 switch (arg) {
4796                 case 'c':
4797                         opt_cleanup = B_TRUE;
4798                         break;
4799                 default:
4800                         zsd_error(gettext("Invalid option"));
4801                 }
4802         }
4803 
4804         if (opt_cleanup) {
4805                 if (zsd_disable_cpu_stats() != 0)
4806                         exit(1);
4807                 else
4808                         exit(0);
4809         }
4810 
4811         /* Get the configured sample interval */
4812         prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
4813             "config", "sample_interval");
4814         if (prop == NULL)
4815                 zsd_error(gettext("Unable to fetch SMF property "
4816                     "\"config/sample_interval\""));
4817 
4818         if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
4819                 zsd_error(gettext("Malformed SMF property "
4820                     "\"config/sample_interval\".  Must be of type \"count\""));
4821 
4822         intervalp = scf_simple_prop_next_count(prop);
4823         g_interval = *intervalp;
4824         if (g_interval == 0)
4825                 zsd_error(gettext("Malformed SMF property "
4826                     "\"config/sample_interval\".  Must be greater than zero"));
4827 
4828         scf_simple_prop_free(prop);
4829 
4830         if (daemonize_start() < 0)
4831                 zsd_error(gettext("Unable to start daemon\n"));
4832 
4833         /* Run at high priority */
4834         zsd_set_fx();
4835 
4836         (void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
4837         (void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
4838         (void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);
4839 
4840         g_server_door = door_create(zsd_server, NULL,
4841             DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4842         if (g_server_door < 0)
4843                 zsd_error(gettext("Unable to create server door\n"));
4844 
4845 
4846         g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
4847             DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4848         if (g_stat_door < 0)
4849                 zsd_error(gettext("Unable to create statistics door\n"));
4850 
4851         fattach_all_zones(B_FALSE);
4852 
4853         if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
4854                 zsd_error(gettext("Unable to create statistics thread\n"));
4855 
4856         daemonize_ready(0);
4857 
4858         /* Wait for signal to quit */
4859         while (g_quit == B_FALSE)
4860                 (void) pause();
4861 
4862         /* detach doors */
4863         fattach_all_zones(B_TRUE);
4864 
4865         (void) door_revoke(g_server_door);
4866         (void) door_revoke(g_stat_door);
4867 
4868         /* kick stat thread and wait for it to close the statistics */
4869         (void) mutex_lock(&g_usage_cache_lock);
4870         g_quit = B_TRUE;
4871         (void) cond_signal(&g_usage_cache_kick);
4872         (void) mutex_unlock(&g_usage_cache_lock);
4873 end:
4874         (void) thr_join(tid, NULL, NULL);
4875         return (0);
4876 }