1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  25  */
  26 #include <alloca.h>
  27 #include <assert.h>
  28 #include <dirent.h>
  29 #include <dlfcn.h>
  30 #include <door.h>
  31 #include <errno.h>
  32 #include <exacct.h>
  33 #include <ctype.h>
  34 #include <fcntl.h>
  35 #include <kstat.h>
  36 #include <libcontract.h>
  37 #include <libintl.h>
  38 #include <libscf.h>
  39 #include <zonestat.h>
  40 #include <zonestat_impl.h>
  41 #include <limits.h>
  42 #include <pool.h>
  43 #include <procfs.h>
  44 #include <rctl.h>
  45 #include <thread.h>
  46 #include <signal.h>
  47 #include <stdarg.h>
  48 #include <stddef.h>
  49 #include <stdio.h>
  50 #include <stdlib.h>
  51 #include <strings.h>
  52 #include <synch.h>
  53 #include <sys/acctctl.h>
  54 #include <sys/contract/process.h>
  55 #include <sys/ctfs.h>
  56 #include <sys/fork.h>
  57 #include <sys/param.h>
  58 #include <sys/priocntl.h>
  59 #include <sys/fxpriocntl.h>
  60 #include <sys/processor.h>
  61 #include <sys/pset.h>
  62 #include <sys/socket.h>
  63 #include <sys/stat.h>
  64 #include <sys/statvfs.h>
  65 #include <sys/swap.h>
  66 #include <sys/systeminfo.h>
  67 #include <thread.h>
  68 #include <sys/list.h>
  69 #include <sys/time.h>
  70 #include <sys/types.h>
  71 #include <sys/vm_usage.h>
  72 #include <sys/wait.h>
  73 #include <sys/zone.h>
  74 #include <time.h>
  75 #include <ucred.h>
  76 #include <unistd.h>
  77 #include <vm/anon.h>
  78 #include <zone.h>
  79 #include <zonestat.h>
  80 
/* Size limit for a pset name. */
#define MAX_PSET_NAME   1024    /* Taken from PV_NAME_MAX_LEN */
/* NOTE(review): presumably the sentinel for "no pset size limit" - confirm */
#define ZSD_PSET_UNLIMITED      UINT16_MAX
/* Extended accounting file that zonestatd configures when none is set */
#define ZONESTAT_EXACCT_FILE    "/var/adm/exacct/zonestat-process"
  84 
  85 /*
  86  * zonestatd implements gathering cpu and memory utilization data for
  87  * running zones.  It has these components:
  88  *
  89  * zsd_server:
  90  *      Door server to respond to client connections.  Each client
  91  *      will connect using libzonestat.so, which will open and
  92  *      call /var/tmp/.zonestat_door.  Each connecting client is given
  93  *      a file descriptor to the stat server.
  94  *
  95  *      The zsd_server also responds to zoneadmd, which reports when a
  96  *      new zone is booted.  This is used to fattach the zsd_server door
  97  *      into the new zone.
  98  *
  99  * zsd_stat_server:
 100  *      Receives client requests for the current utilization data.  Each
 101  *      client request will cause zonestatd to update the current utilization
 102  *      data by kicking the stat_thread.
 103  *
 104  *      If the client is in a non-global zone, the utilization data will
 105  *      be filtered to only show the given zone.  The usage by all other zones
 106  *      will be added to the system utilization.
 107  *
 108  * stat_thread:
 109  *      The stat thread implements querying the system to determine the
 110  *      current utilization data for each running zone.  This includes
 111  *      inspecting the system's processor set configuration, as well as details
 112  *      of each zone, such as their configured limits, and which processor
 113  *      sets they are running in.
 114  *
 115  *      The stat_thread will only update memory utilization data as often as
 116  *      the configured config/sample_interval on the zones-monitoring service.
 117  */
 118 
/*
 * The private vmusage structure unfortunately uses size_t types, and assumes
 * the caller's bitness matches the kernel's bitness.  Since the getvmusage()
 * system call is contracted, and zonestatd is 32 bit, the following structures
 * are used to interact with a 32 bit or 64 bit kernel.
 */

/* vmusage record whose rss and swap members are 32 bits wide. */
typedef struct zsd_vmusage32 {
        id_t vmu_zoneid;
        uint_t vmu_type;
        id_t vmu_id;

        uint32_t vmu_rss_all;
        uint32_t vmu_rss_private;
        uint32_t vmu_rss_shared;
        uint32_t vmu_swap_all;
        uint32_t vmu_swap_private;
        uint32_t vmu_swap_shared;
} zsd_vmusage32_t;
 137 
/*
 * vmusage record whose rss and swap members are 64 bits wide.  The member
 * order and the explicit padding member must not be changed; see the
 * alignment note below.
 */
typedef struct zsd_vmusage64 {
        id_t vmu_zoneid;
        uint_t vmu_type;
        id_t vmu_id;
        /*
         * An amd64 kernel will align the following uint64_t members, but a
         * 32bit i386 process will not without help.
         */
        int vmu_align_next_members_on_8_bytes;
        uint64_t vmu_rss_all;
        uint64_t vmu_rss_private;
        uint64_t vmu_rss_shared;
        uint64_t vmu_swap_all;
        uint64_t vmu_swap_private;
        uint64_t vmu_swap_shared;
} zsd_vmusage64_t;
 154 
/* Forward declaration; zsd_pset_usage refers to the zone before it is defined */
struct zsd_zone;

/* Used to store a zone's usage of a pset */
typedef struct zsd_pset_usage {
        struct zsd_zone *zsu_zone;      /* zone this usage belongs to */
        struct zsd_pset *zsu_pset;      /* pset this usage belongs to */

        list_node_t     zsu_next;       /* list linkage */

        zoneid_t        zsu_zoneid;
        boolean_t       zsu_found;      /* zone bound at end of interval */
        boolean_t       zsu_active;     /* zone was bound during interval */
        boolean_t       zsu_new;        /* zone newly bound in this interval */
        boolean_t       zsu_deleted;    /* zone was unbound in this interval */
        boolean_t       zsu_empty;      /* no procs in pset in this interval */
        time_t          zsu_start;      /* time when zone was found in pset */
        hrtime_t        zsu_hrstart;    /* hrtime when zone was found in pset */
        uint64_t        zsu_cpu_shares;
        uint_t          zsu_scheds;     /* schedulers found in this pass */
        timestruc_t     zsu_cpu_usage;  /* cpu time used */
} zsd_pset_usage_t;
 176 
/* Used to store a pset's utilization */
typedef struct zsd_pset {
        psetid_t        zsp_id;
        list_node_t     zsp_next;       /* list linkage */
        char            zsp_name[ZS_PSETNAME_MAX];

        uint_t          zsp_cputype;    /* default, dedicated or shared */
        boolean_t       zsp_found;      /* pset found at end of interval */
        boolean_t       zsp_new;        /* pset new in this interval */
        boolean_t       zsp_deleted;    /* pset deleted in this interval */
        boolean_t       zsp_active;     /* pset existed during interval */
        boolean_t       zsp_empty;      /* no processes in pset */
        time_t          zsp_start;
        hrtime_t        zsp_hrstart;

        uint64_t        zsp_online;     /* online cpus in interval */
        uint64_t        zsp_size;       /* size in this interval */
        uint64_t        zsp_min;        /* configured min in this interval */
        uint64_t        zsp_max;        /* configured max in this interval */
        /* NOTE(review): comment previously read "configured max" - confirm */
        int64_t         zsp_importance; /* configured importance in interval */

        uint_t          zsp_scheds;     /* scheds of processes found in pset */
        uint64_t        zsp_cpu_shares; /* total shares in this interval */

        timestruc_t     zsp_total_time;
        timestruc_t     zsp_usage_kern;
        timestruc_t     zsp_usage_zones;

        /* Individual zone usages of pset */
        list_t          zsp_usage_list;
        int             zsp_nusage;

        /* Summed kstat values from individual cpus in pset */
        timestruc_t     zsp_idle;
        timestruc_t     zsp_intr;
        timestruc_t     zsp_kern;
        timestruc_t     zsp_user;

} zsd_pset_t;
 216 
/* Used to track an individual cpu's utilization as reported by kstats */
typedef struct zsd_cpu {
        processorid_t   zsc_id;
        list_node_t     zsc_next;       /* list linkage */
        psetid_t        zsc_psetid;
        /* NOTE(review): presumably the pset id seen in the prior pass */
        psetid_t        zsc_psetid_prev;
        zsd_pset_t      *zsc_pset;

        boolean_t       zsc_found;      /* cpu online in this interval */
        boolean_t       zsc_onlined;    /* cpu onlined during this interval */
        boolean_t       zsc_offlined;   /* cpu offlined during this interval */
        boolean_t       zsc_active;     /* cpu online during this interval */
        boolean_t       zsc_allocated;  /* True if cpu has ever been found */

        /* kstats this interval */
        uint64_t        zsc_nsec_idle;
        uint64_t        zsc_nsec_intr;
        uint64_t        zsc_nsec_kern;
        uint64_t        zsc_nsec_user;

        /* kstats in most recent interval */
        uint64_t        zsc_nsec_idle_prev;
        uint64_t        zsc_nsec_intr_prev;
        uint64_t        zsc_nsec_kern_prev;
        uint64_t        zsc_nsec_user_prev;

        /* Total kstat increases since zonestatd started reading kstats */
        timestruc_t     zsc_idle;
        timestruc_t     zsc_intr;
        timestruc_t     zsc_kern;
        timestruc_t     zsc_user;

} zsd_cpu_t;
 250 
/* Used to describe an individual zone and its utilization */
typedef struct zsd_zone {
        zoneid_t        zsz_id;
        list_node_t     zsz_next;       /* linkage on zsd_ctl's zsctl_zones */
        char            zsz_name[ZS_ZONENAME_MAX];
        uint_t          zsz_cputype;
        uint_t          zsz_iptype;
        time_t          zsz_start;      /* g_now when zone was allocated */
        hrtime_t        zsz_hrstart;    /* g_hrnow when zone was allocated */

        char            zsz_pool[ZS_POOLNAME_MAX];
        char            zsz_pset[ZS_PSETNAME_MAX];
        int             zsz_default_sched;
        /* These are deduced by inspecting processes */
        psetid_t        zsz_psetid;
        uint_t          zsz_scheds;

        boolean_t       zsz_new;        /* zone booted during this interval */
        boolean_t       zsz_deleted;    /* halted during this interval */
        boolean_t       zsz_active;     /* running in this interval */
        boolean_t       zsz_empty;      /* no processes in this interval */
        boolean_t       zsz_gone;       /* not installed in this interval */
        boolean_t       zsz_found;      /* Running at end of this interval */

        /* Configured limits; set to ZS_LIMIT_NONE when a zone is allocated */
        uint64_t        zsz_cpu_shares;
        uint64_t        zsz_cpu_cap;
        uint64_t        zsz_ram_cap;
        uint64_t        zsz_locked_cap;
        uint64_t        zsz_vm_cap;

        uint64_t        zsz_cpus_online;
        /* NOTE(review): comment previously duplicated zsz_cap_time - confirm */
        timestruc_t     zsz_cpu_usage;  /* cpu time used */
        timestruc_t     zsz_cap_time;   /* cpu time of cpu cap */
        timestruc_t     zsz_share_time; /* cpu time of share of cpu */
        timestruc_t     zsz_pset_time;  /* time of all psets zone is bound to */

        uint64_t        zsz_usage_ram;
        uint64_t        zsz_usage_locked;
        uint64_t        zsz_usage_vm;

        /* Configured limits; set to ZS_LIMIT_NONE when a zone is allocated */
        uint64_t        zsz_processes_cap;
        uint64_t        zsz_lwps_cap;
        uint64_t        zsz_shm_cap;
        uint64_t        zsz_shmids_cap;
        uint64_t        zsz_semids_cap;
        uint64_t        zsz_msgids_cap;
        uint64_t        zsz_lofi_cap;

        /* NOTE(review): presumably current usage matching the caps above */
        uint64_t        zsz_processes;
        uint64_t        zsz_lwps;
        uint64_t        zsz_shm;
        uint64_t        zsz_shmids;
        uint64_t        zsz_semids;
        uint64_t        zsz_msgids;
        uint64_t        zsz_lofi;

} zsd_zone_t;
 308 
/*
 * Used to track the cpu usage of an individual process.
 *
 * zonestatd sweeps /proc each interval and charges the cpu usage of processes
 * to their zone.  As processes exit, their extended accounting records are
 * read and the difference of their total and known usage is charged to their
 * zone.
 *
 * If a process is never seen in /proc, the total usage on its extended
 * accounting record will be charged to its zone.
 */
typedef struct zsd_proc {
        list_node_t     zspr_next;      /* list linkage */
        pid_t           zspr_ppid;
        psetid_t        zspr_psetid;
        zoneid_t        zspr_zoneid;
        int             zspr_sched;
        timestruc_t     zspr_usage;     /* cpu usage known for this process */
} zsd_proc_t;
 328 
/* Used to track the overall resource usage of the system */
typedef struct zsd_system {

        /* Physical memory: total, and the kernel and zone portions */
        uint64_t zss_ram_total;
        uint64_t zss_ram_kern;
        uint64_t zss_ram_zones;

        /* Locked memory: kernel and zone portions */
        uint64_t zss_locked_kern;
        uint64_t zss_locked_zones;

        /* Virtual memory: total, and the kernel and zone portions */
        uint64_t zss_vm_total;
        uint64_t zss_vm_kern;
        uint64_t zss_vm_zones;

        /* Swap: total and used */
        uint64_t zss_swap_total;
        uint64_t zss_swap_used;

        /* NOTE(review): presumably system-wide sums of per-cpu kstat times */
        timestruc_t zss_idle;
        timestruc_t zss_intr;
        timestruc_t zss_kern;
        timestruc_t zss_user;

        timestruc_t zss_cpu_total_time;
        timestruc_t zss_cpu_usage_kern;
        timestruc_t zss_cpu_usage_zones;

        /* System-wide maximums */
        uint64_t zss_maxpid;
        uint64_t zss_processes_max;
        uint64_t zss_lwps_max;
        uint64_t zss_shm_max;
        uint64_t zss_shmids_max;
        uint64_t zss_semids_max;
        uint64_t zss_msgids_max;
        uint64_t zss_lofi_max;

        /* NOTE(review): presumably current usage matching the maximums above */
        uint64_t zss_processes;
        uint64_t zss_lwps;
        uint64_t zss_shm;
        uint64_t zss_shmids;
        uint64_t zss_semids;
        uint64_t zss_msgids;
        uint64_t zss_lofi;

        uint64_t zss_ncpus;
        uint64_t zss_ncpus_online;

} zsd_system_t;
 376 
/*
 * A dumping ground for various information and structures used to compute
 * utilization.
 *
 * This structure is used to track the system while clients are connected.
 * When the first client connects, a zsd_ctl is allocated and configured by
 * zsd_open().  When all clients disconnect, the zsd_ctl is closed.
 */
typedef struct zsd_ctl {
        kstat_ctl_t     *zsctl_kstat_ctl;

        /* To track extended accounting */
        int             zsctl_proc_fd;          /* Log currently being used */
        ea_file_t       zsctl_proc_eaf;
        struct stat64   zsctl_proc_stat;
        int             zsctl_proc_open;
        int             zsctl_proc_fd_next;     /* Log file to use next */
        ea_file_t       zsctl_proc_eaf_next;
        struct stat64   zsctl_proc_stat_next;
        int             zsctl_proc_open_next;

        /* pool configuration handle */
        pool_conf_t     *zsctl_pool_conf;
        int             zsctl_pool_status;
        int             zsctl_pool_changed;

        /* The above usage tracking structures */
        zsd_system_t    *zsctl_system;
        list_t          zsctl_zones;    /* zsd_zone_t, sorted by zone name */
        list_t          zsctl_psets;
        list_t          zsctl_cpus;
        zsd_cpu_t       *zsctl_cpu_array;
        zsd_proc_t      *zsctl_proc_array;

        /* Various system info */
        uint64_t        zsctl_maxcpuid;
        uint64_t        zsctl_maxproc;
        uint64_t        zsctl_kern_bits;
        uint64_t        zsctl_pagesize;

        /* Used to track time available under a cpu cap. */
        uint64_t        zsctl_hrtime;
        uint64_t        zsctl_hrtime_prev;
        timestruc_t     zsctl_hrtime_total;

        struct timeval  zsctl_timeofday;

        /* Caches for arrays allocated for use by various system calls */
        psetid_t        *zsctl_pset_cache;
        uint_t          zsctl_pset_ncache;
        processorid_t   *zsctl_cpu_cache;
        uint_t          zsctl_cpu_ncache;
        zoneid_t        *zsctl_zone_cache;
        uint_t          zsctl_zone_ncache;
        struct swaptable *zsctl_swap_cache;
        uint64_t        zsctl_swap_cache_size;
        uint64_t        zsctl_swap_cache_num;
        zsd_vmusage64_t *zsctl_vmusage_cache;
        uint64_t        zsctl_vmusage_cache_num;

        /* Info about procfs for scanning /proc */
        struct dirent   *zsctl_procfs_dent;
        long            zsctl_procfs_dent_size;
        /* NOTE(review): pool values, not procfs state - confirm intended use */
        pool_value_t    *zsctl_pool_vals[3];

        /* Counts on tracked entities */
        uint_t          zsctl_nzones;   /* bumped by zsd_allocate_zone() */
        uint_t          zsctl_npsets;
        uint_t          zsctl_npset_usages;
} zsd_ctl_t;
 447 
/* Daemon-wide tracking state; see the zsd_ctl comment for its lifetime. */
zsd_ctl_t               *g_ctl;
boolean_t               g_open;         /* True if g_ctl is open */
int                     g_hasclient;    /* True if any clients are connected */

/*
 * The usage cache is updated by the stat_thread, and copied to clients by
 * the zsd_stat_server.  Mutex and cond are to synchronize between the
 * stat_thread and the stat_server.
 */
zs_usage_cache_t        *g_usage_cache;
mutex_t                 g_usage_cache_lock;
cond_t                  g_usage_cache_kick;
uint_t                  g_usage_cache_kickers;
cond_t                  g_usage_cache_wait;
char                    *g_usage_cache_buf;
uint_t                  g_usage_cache_bufsz;
uint64_t                g_gen_next;

/* fds of door servers */
int                     g_server_door;
int                     g_stat_door;

/*
 * Starting and current time.  Used to throttle memory calculation, and to
 * mark new zones and psets with their boot and creation time.
 */
time_t                  g_now;
time_t                  g_start;
hrtime_t                g_hrnow;
hrtime_t                g_hrstart;
uint64_t                g_interval;

/*
 * main() thread.
 */
thread_t                g_main;
 484 
 485 /* PRINTFLIKE1 */
 486 static void
 487 zsd_warn(const char *fmt, ...)
 488 {
 489         va_list alist;
 490 
 491         va_start(alist, fmt);
 492 
 493         (void) fprintf(stderr, gettext("zonestat: Warning: "));
 494         (void) vfprintf(stderr, fmt, alist);
 495         (void) fprintf(stderr, "\n");
 496         va_end(alist);
 497 }
 498 
 499 /* PRINTFLIKE1 */
 500 static void
 501 zsd_error(const char *fmt, ...)
 502 {
 503         va_list alist;
 504 
 505         va_start(alist, fmt);
 506 
 507         (void) fprintf(stderr, gettext("zonestat: Error: "));
 508         (void) vfprintf(stderr, fmt, alist);
 509         (void) fprintf(stderr, "\n");
 510         va_end(alist);
 511         exit(1);
 512 }
 513 
 514 /* Turns on extended accounting if not configured externally */
 515 int
 516 zsd_enable_cpu_stats()
 517 {
 518         char *path = ZONESTAT_EXACCT_FILE;
 519         char oldfile[MAXPATHLEN];
 520         int ret, state = AC_ON;
 521         ac_res_t res[6];
 522 
 523         /*
 524          * Start a new accounting file  if accounting not configured
 525          * externally.
 526          */
 527 
 528         res[0].ar_id = AC_PROC_PID;
 529         res[0].ar_state = AC_ON;
 530         res[1].ar_id = AC_PROC_ANCPID;
 531         res[1].ar_state = AC_ON;
 532         res[2].ar_id = AC_PROC_CPU;
 533         res[2].ar_state = AC_ON;
 534         res[3].ar_id = AC_PROC_TIME;
 535         res[3].ar_state = AC_ON;
 536         res[4].ar_id = AC_PROC_ZONENAME;
 537         res[4].ar_state = AC_ON;
 538         res[5].ar_id = AC_NONE;
 539         res[5].ar_state = AC_ON;
 540         if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
 541                 zsd_warn(gettext("Unable to set accounting resources"));
 542                 return (-1);
 543         }
 544         /* Only set accounting file if none is configured */
 545         ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 546         if (ret < 0) {
 547 
 548                 (void) unlink(path);
 549                 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
 550                     == -1) {
 551                         zsd_warn(gettext("Unable to set accounting file"));
 552                         return (-1);
 553                 }
 554         }
 555         if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
 556                 zsd_warn(gettext("Unable to enable accounting"));
 557                 return (-1);
 558         }
 559         return (0);
 560 }
 561 
 562 /* Turns off extended accounting if not configured externally */
 563 int
 564 zsd_disable_cpu_stats()
 565 {
 566         char *path = ZONESTAT_EXACCT_FILE;
 567         int ret, state = AC_OFF;
 568         ac_res_t res[6];
 569         char oldfile[MAXPATHLEN];
 570 
 571         /* If accounting file is externally configured, leave it alone */
 572         ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 573         if (ret == 0 && strcmp(oldfile, path) != 0)
 574                 return (0);
 575 
 576         res[0].ar_id = AC_PROC_PID;
 577         res[0].ar_state = AC_OFF;
 578         res[1].ar_id = AC_PROC_ANCPID;
 579         res[1].ar_state = AC_OFF;
 580         res[2].ar_id = AC_PROC_CPU;
 581         res[2].ar_state = AC_OFF;
 582         res[3].ar_id = AC_PROC_TIME;
 583         res[3].ar_state = AC_OFF;
 584         res[4].ar_id = AC_PROC_ZONENAME;
 585         res[4].ar_state = AC_OFF;
 586         res[5].ar_id = AC_NONE;
 587         res[5].ar_state = AC_OFF;
 588         if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
 589                 zsd_warn(gettext("Unable to clear accounting resources"));
 590                 return (-1);
 591         }
 592         if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
 593                 zsd_warn(gettext("Unable to clear accounting file"));
 594                 return (-1);
 595         }
 596         if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
 597                 zsd_warn(gettext("Unable to diable accounting"));
 598                 return (-1);
 599         }
 600 
 601         (void) unlink(path);
 602         return (0);
 603 }
 604 
 605 /*
 606  * If not configured externally, deletes the current extended accounting file
 607  * and starts a new one.
 608  *
 609  * Since the stat_thread holds an open handle to the accounting file, it will
 610  * read all remaining entries from the old file before switching to
 611  * read the new one.
 612  */
 613 int
 614 zsd_roll_exacct(void)
 615 {
 616         int ret;
 617         char *path = ZONESTAT_EXACCT_FILE;
 618         char oldfile[MAXPATHLEN];
 619 
 620         /* If accounting file is externally configured, leave it alone */
 621         ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
 622         if (ret == 0 && strcmp(oldfile, path) != 0)
 623                 return (0);
 624 
 625         if (unlink(path) != 0)
 626                 /* Roll it next time */
 627                 return (0);
 628 
 629         if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
 630                 zsd_warn(gettext("Unable to set accounting file"));
 631                 return (-1);
 632         }
 633         return (0);
 634 }
 635 
 636 /* Contract stuff for zone_enter() */
 637 int
 638 init_template(void)
 639 {
 640         int fd;
 641         int err = 0;
 642 
 643         fd = open64(CTFS_ROOT "/process/template", O_RDWR);
 644         if (fd == -1)
 645                 return (-1);
 646 
 647         /*
 648          * For now, zoneadmd doesn't do anything with the contract.
 649          * Deliver no events, don't inherit, and allow it to be orphaned.
 650          */
 651         err |= ct_tmpl_set_critical(fd, 0);
 652         err |= ct_tmpl_set_informative(fd, 0);
 653         err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
 654         err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
 655         if (err || ct_tmpl_activate(fd)) {
 656                 (void) close(fd);
 657                 return (-1);
 658         }
 659 
 660         return (fd);
 661 }
 662 
 663 /*
 664  * Contract stuff for zone_enter()
 665  */
 666 int
 667 contract_latest(ctid_t *id)
 668 {
 669         int cfd, r;
 670         ct_stathdl_t st;
 671         ctid_t result;
 672 
 673         if ((cfd = open64(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
 674                 return (errno);
 675 
 676         if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
 677                 (void) close(cfd);
 678                 return (r);
 679         }
 680 
 681         result = ct_status_get_id(st);
 682         ct_status_free(st);
 683         (void) close(cfd);
 684 
 685         *id = result;
 686         return (0);
 687 }
 688 
 689 static int
 690 close_on_exec(int fd)
 691 {
 692         int flags = fcntl(fd, F_GETFD, 0);
 693         if ((flags != -1) && (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) != -1))
 694                 return (0);
 695         return (-1);
 696 }
 697 
 698 int
 699 contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
 700 {
 701         char path[PATH_MAX];
 702         int n, fd;
 703 
 704         if (type == NULL)
 705                 type = "all";
 706 
 707         n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
 708         if (n >= sizeof (path)) {
 709                 errno = ENAMETOOLONG;
 710                 return (-1);
 711         }
 712 
 713         fd = open64(path, oflag);
 714         if (fd != -1) {
 715                 if (close_on_exec(fd) == -1) {
 716                         int err = errno;
 717                         (void) close(fd);
 718                         errno = err;
 719                         return (-1);
 720                 }
 721         }
 722         return (fd);
 723 }
 724 
 725 int
 726 contract_abandon_id(ctid_t ctid)
 727 {
 728         int fd, err;
 729 
 730         fd = contract_open(ctid, "all", "ctl", O_WRONLY);
 731         if (fd == -1)
 732                 return (errno);
 733 
 734         err = ct_ctl_abandon(fd);
 735         (void) close(fd);
 736 
 737         return (err);
 738 }
 739 /*
 740  * Attach the zsd_server to a zone.  Called for each zone when zonestatd
 741  * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
 742  *
 743  * Zone_enter is used to avoid reaching into zone to fattach door.
 744  */
 745 static void
 746 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
 747 {
 748         char *path = ZS_DOOR_PATH;
 749         int fd, pid, stat, tmpl_fd;
 750         ctid_t ct;
 751 
 752         if ((tmpl_fd = init_template()) == -1) {
 753                 zsd_warn("Unable to init template");
 754                 return;
 755         }
 756 
 757         pid = forkx(0);
 758         if (pid < 0) {
 759                 (void) ct_tmpl_clear(tmpl_fd);
 760                 zsd_warn(gettext(
 761                     "Unable to fork to add zonestat to zoneid %d\n"), zid);
 762                 return;
 763         }
 764 
 765         if (pid == 0) {
 766                 (void) ct_tmpl_clear(tmpl_fd);
 767                 (void) close(tmpl_fd);
 768                 if (zid != 0 && zone_enter(zid) != 0) {
 769                         if (errno == EINVAL) {
 770                                 _exit(0);
 771                         }
 772                         _exit(1);
 773                 }
 774                 (void) fdetach(path);
 775                 (void) unlink(path);
 776                 if (detach_only)
 777                         _exit(0);
 778                 fd = open(path, O_CREAT|O_RDWR, 0644);
 779                 if (fd < 0)
 780                         _exit(2);
 781                 if (fattach(door, path) != 0)
 782                         _exit(3);
 783                 _exit(0);
 784         }
 785         if (contract_latest(&ct) == -1)
 786                 ct = -1;
 787         (void) ct_tmpl_clear(tmpl_fd);
 788         (void) close(tmpl_fd);
 789         (void) contract_abandon_id(ct);
 790         while (waitpid(pid, &stat, 0) != pid)
 791                 ;
 792         if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
 793                 return;
 794 
 795         zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
 796 
 797         if (WEXITSTATUS(stat) == 1)
 798                 zsd_warn(gettext("Cannot entering zone"));
 799         else if (WEXITSTATUS(stat) == 2)
 800                 zsd_warn(gettext("Unable to create door file: %s"), path);
 801         else if (WEXITSTATUS(stat) == 3)
 802                 zsd_warn(gettext("Unable to fattach file: %s"), path);
 803 
 804         zsd_warn(gettext("Internal error entering zone: %d"), zid);
 805 }
 806 
 807 /*
 808  * Zone lookup and allocation functions to manage list of currently running
 809  * zones.
 810  */
 811 static zsd_zone_t *
 812 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 813 {
 814         zsd_zone_t *zone;
 815 
 816         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 817             zone = list_next(&ctl->zsctl_zones, zone)) {
 818                 if (strcmp(zone->zsz_name, zonename) == 0) {
 819                         if (zoneid != -1)
 820                                 zone->zsz_id = zoneid;
 821                         return (zone);
 822                 }
 823         }
 824         return (NULL);
 825 }
 826 
 827 static zsd_zone_t *
 828 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
 829 {
 830         zsd_zone_t *zone;
 831 
 832         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 833             zone = list_next(&ctl->zsctl_zones, zone)) {
 834                 if (zone->zsz_id == zoneid)
 835                         return (zone);
 836         }
 837         return (NULL);
 838 }
 839 
 840 static zsd_zone_t *
 841 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 842 {
 843         zsd_zone_t *zone;
 844 
 845         if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
 846                 return (NULL);
 847 
 848         (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
 849         zone->zsz_id = zoneid;
 850         zone->zsz_found = B_FALSE;
 851 
 852         /*
 853          * Allocate as deleted so if not found in first pass, zone is deleted
 854          * from list.  This can happen if zone is returned by zone_list, but
 855          * exits before first attempt to fetch zone details.
 856          */
 857         zone->zsz_start = g_now;
 858         zone->zsz_hrstart = g_hrnow;
 859         zone->zsz_deleted = B_TRUE;
 860 
 861         zone->zsz_cpu_shares = ZS_LIMIT_NONE;
 862         zone->zsz_cpu_cap = ZS_LIMIT_NONE;
 863         zone->zsz_ram_cap = ZS_LIMIT_NONE;
 864         zone->zsz_locked_cap = ZS_LIMIT_NONE;
 865         zone->zsz_vm_cap = ZS_LIMIT_NONE;
 866 
 867         zone->zsz_processes_cap = ZS_LIMIT_NONE;
 868         zone->zsz_lwps_cap = ZS_LIMIT_NONE;
 869         zone->zsz_shm_cap = ZS_LIMIT_NONE;
 870         zone->zsz_shmids_cap = ZS_LIMIT_NONE;
 871         zone->zsz_semids_cap = ZS_LIMIT_NONE;
 872         zone->zsz_msgids_cap = ZS_LIMIT_NONE;
 873         zone->zsz_lofi_cap = ZS_LIMIT_NONE;
 874 
 875         ctl->zsctl_nzones++;
 876 
 877         return (zone);
 878 }
 879 
 880 static zsd_zone_t *
 881 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
 882 {
 883         zsd_zone_t *zone, *tmp;
 884 
 885         if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
 886                 return (zone);
 887 
 888         if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
 889                 return (NULL);
 890 
 891         /* Insert sorted by zonename */
 892         tmp = list_head(&ctl->zsctl_zones);
 893         while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
 894                 tmp = list_next(&ctl->zsctl_zones, tmp);
 895 
 896         list_insert_before(&ctl->zsctl_zones, tmp, zone);
 897         return (zone);
 898 }
 899 
 900 /*
 901  * Mark all zones as not existing.  As zones are found, they will
 902  * be marked as existing.  If a zone is not found, then it must have
 903  * halted.
 904  */
 905 static void
 906 zsd_mark_zones_start(zsd_ctl_t *ctl)
 907 {
 908 
 909         zsd_zone_t *zone;
 910 
 911         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
 912             zone = list_next(&ctl->zsctl_zones, zone)) {
 913                 zone->zsz_found = B_FALSE;
 914         }
 915 }
 916 
 917 /*
 918  * Mark each zone as not using pset.  If processes are found using the
 919  * pset, the zone will remain bound to the pset.  If none of a zones
 920  * processes are bound to the pset, the zone's usage of the pset will
 921  * be deleted.
 922  *
 923  */
 924 static void
 925 zsd_mark_pset_usage_start(zsd_pset_t *pset)
 926 {
 927         zsd_pset_usage_t *usage;
 928 
 929         for (usage = list_head(&pset->zsp_usage_list);
 930             usage != NULL;
 931             usage = list_next(&pset->zsp_usage_list, usage)) {
 932                 usage->zsu_found = B_FALSE;
 933                 usage->zsu_empty = B_TRUE;
 934         }
 935 }
 936 
 937 /*
 938  * Mark each pset as not existing.  If a pset is found, it will be marked
 939  * as existing.  If a pset is not found, it wil be deleted.
 940  */
 941 static void
 942 zsd_mark_psets_start(zsd_ctl_t *ctl)
 943 {
 944         zsd_pset_t *pset;
 945 
 946         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
 947             pset = list_next(&ctl->zsctl_psets, pset)) {
 948                 pset->zsp_found = B_FALSE;
 949                 zsd_mark_pset_usage_start(pset);
 950         }
 951 }
 952 
 953 /*
 954  * A pset was found.  Update its information
 955  */
 956 static void
 957 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
 958     uint64_t size, uint64_t min, uint64_t max, int64_t importance)
 959 {
 960         pset->zsp_empty = B_TRUE;
 961         pset->zsp_deleted = B_FALSE;
 962 
 963         assert(pset->zsp_found == B_FALSE);
 964 
 965         /* update pset flags */
 966         if (pset->zsp_active == B_FALSE)
 967                 /* pset not seen on previous interval.  It is new. */
 968                 pset->zsp_new = B_TRUE;
 969         else
 970                 pset->zsp_new = B_FALSE;
 971 
 972         pset->zsp_found = B_TRUE;
 973         pset->zsp_cputype = type;
 974         pset->zsp_online = online;
 975         pset->zsp_size = size;
 976         pset->zsp_min = min;
 977         pset->zsp_max = max;
 978         pset->zsp_importance = importance;
 979         pset->zsp_cpu_shares = 0;
 980         pset->zsp_scheds = 0;
 981         pset->zsp_active = B_TRUE;
 982 }
 983 
 984 /*
 985  * A zone's process was found using a pset. Charge the process to the pset and
 986  * the per-zone data for the pset.
 987  */
 988 static void
 989 zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
 990 {
 991         zsd_zone_t *zone = usage->zsu_zone;
 992         zsd_pset_t *pset = usage->zsu_pset;
 993 
 994         /* Nothing to do if already found */
 995         if (usage->zsu_found == B_TRUE)
 996                 goto add_stats;
 997 
 998         usage->zsu_found = B_TRUE;
 999         usage->zsu_empty = B_FALSE;
1000 
1001         usage->zsu_deleted = B_FALSE;
1002         /* update usage flags */
1003         if (usage->zsu_active == B_FALSE)
1004                 usage->zsu_new = B_TRUE;
1005         else
1006                 usage->zsu_new = B_FALSE;
1007 
1008         usage->zsu_scheds = 0;
1009         usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1010         usage->zsu_active = B_TRUE;
1011         pset->zsp_empty = B_FALSE;
1012         zone->zsz_empty = B_FALSE;
1013 
1014 add_stats:
1015         /* Detect zone's pset id, and if it is bound to multiple psets */
1016         if (zone->zsz_psetid == ZS_PSET_ERROR)
1017                 zone->zsz_psetid = pset->zsp_id;
1018         else if (zone->zsz_psetid != pset->zsp_id)
1019                 zone->zsz_psetid = ZS_PSET_MULTI;
1020 
1021         usage->zsu_scheds |= sched;
1022         pset->zsp_scheds |= sched;
1023         zone->zsz_scheds |= sched;
1024 
1025         /* Record if FSS is co-habitating with conflicting scheduler */
1026         if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1027             usage->zsu_scheds & (
1028             ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1029                 usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1030 
1031                 pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1032         }
1033 
1034 }
1035 
1036 /* Add cpu time for a process to a pset, zone, and system totals */
1037 static void
1038 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1039 {
1040         zsd_system_t *system = ctl->zsctl_system;
1041         zsd_zone_t *zone = usage->zsu_zone;
1042         zsd_pset_t *pset = usage->zsu_pset;
1043 
1044         TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1045         TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1046         TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1047         TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1048 }
1049 
1050 /* Determine which processor sets have been deleted */
1051 static void
1052 zsd_mark_psets_end(zsd_ctl_t *ctl)
1053 {
1054         zsd_pset_t *pset, *tmp;
1055 
1056         /*
1057          * Mark pset as not exists, and deleted if it existed
1058          * previous interval.
1059          */
1060         pset = list_head(&ctl->zsctl_psets);
1061         while (pset != NULL) {
1062                 if (pset->zsp_found == B_FALSE) {
1063                         pset->zsp_empty = B_TRUE;
1064                         if (pset->zsp_deleted == B_TRUE) {
1065                                 tmp = pset;
1066                                 pset = list_next(&ctl->zsctl_psets, pset);
1067                                 list_remove(&ctl->zsctl_psets, tmp);
1068                                 free(tmp);
1069                                 ctl->zsctl_npsets--;
1070                                 continue;
1071                         } else {
1072                                 /* Pset vanished during this interval */
1073                                 pset->zsp_new = B_FALSE;
1074                                 pset->zsp_deleted = B_TRUE;
1075                                 pset->zsp_active = B_TRUE;
1076                         }
1077                 }
1078                 pset = list_next(&ctl->zsctl_psets, pset);
1079         }
1080 }
1081 
1082 /* Determine which zones are no longer bound to processor sets */
1083 static void
1084 zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1085 {
1086         zsd_pset_t *pset;
1087         zsd_zone_t *zone;
1088         zsd_pset_usage_t *usage, *tmp;
1089 
1090         /*
1091          * Mark pset as not exists, and deleted if it existed previous
1092          * interval.
1093          */
1094         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1095             pset = list_next(&ctl->zsctl_psets, pset)) {
1096                 usage = list_head(&pset->zsp_usage_list);
1097                 while (usage != NULL) {
1098                         /*
1099                          * Mark pset as not exists, and deleted if it existed
1100                          * previous interval.
1101                          */
1102                         if (usage->zsu_found == B_FALSE ||
1103                             usage->zsu_zone->zsz_deleted == B_TRUE ||
1104                             usage->zsu_pset->zsp_deleted == B_TRUE) {
1105                                 tmp = usage;
1106                                 usage = list_next(&pset->zsp_usage_list,
1107                                     usage);
1108                                 list_remove(&pset->zsp_usage_list, tmp);
1109                                 free(tmp);
1110                                 pset->zsp_nusage--;
1111                                 ctl->zsctl_npset_usages--;
1112                                 continue;
1113                         } else {
1114                                 usage->zsu_new = B_FALSE;
1115                                 usage->zsu_deleted = B_TRUE;
1116                                 usage->zsu_active = B_TRUE;
1117                         }
1118                         /* Add cpu shares for usages that are in FSS */
1119                         zone = usage->zsu_zone;
1120                         if (usage->zsu_scheds & ZS_SCHED_FSS &&
1121                             zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1122                             zone->zsz_cpu_shares != 0) {
1123                                 zone = usage->zsu_zone;
1124                                 usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1125                                 pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1126                         }
1127                         usage = list_next(&pset->zsp_usage_list,
1128                             usage);
1129                 }
1130         }
1131 }
1132 
1133 /* A zone has been found.  Update its information */
1134 static void
1135 zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1136     uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1137     uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1138     uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1139     uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1140     uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1141     uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1142     uint_t iptype)
1143 {
1144         zsd_system_t *sys = ctl->zsctl_system;
1145 
1146         assert(zone->zsz_found == B_FALSE);
1147 
1148         /*
1149          * Mark zone as exists, and new if it did not exist in previous
1150          * interval.
1151          */
1152         zone->zsz_found = B_TRUE;
1153         zone->zsz_empty = B_TRUE;
1154         zone->zsz_deleted = B_FALSE;
1155 
1156         /*
1157          * Zone is new.  Assume zone's properties are the same over entire
1158          * interval.
1159          */
1160         if (zone->zsz_active == B_FALSE)
1161                 zone->zsz_new = B_TRUE;
1162         else
1163                 zone->zsz_new = B_FALSE;
1164 
1165         (void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1166         (void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1167         zone->zsz_default_sched = sched;
1168 
1169         /* Schedulers updated later as processes are found */
1170         zone->zsz_scheds = 0;
1171 
1172         /* Cpus updated later as psets bound are identified */
1173         zone->zsz_cpus_online = 0;
1174 
1175         zone->zsz_cputype = cputype;
1176         zone->zsz_iptype = iptype;
1177         zone->zsz_psetid = ZS_PSET_ERROR;
1178         zone->zsz_cpu_cap = cpu_cap;
1179         zone->zsz_cpu_shares = cpu_shares;
1180         zone->zsz_ram_cap = ram_cap;
1181         zone->zsz_locked_cap = locked_cap;
1182         zone->zsz_vm_cap = vm_cap;
1183         zone->zsz_processes_cap = processes_cap;
1184         zone->zsz_processes = processes;
1185         zone->zsz_lwps_cap = lwps_cap;
1186         zone->zsz_lwps = lwps;
1187         zone->zsz_shm_cap = shm_cap;
1188         zone->zsz_shm = shm;
1189         zone->zsz_shmids_cap = shmids_cap;
1190         zone->zsz_shmids = shmids;
1191         zone->zsz_semids_cap = semids_cap;
1192         zone->zsz_semids = semids;
1193         zone->zsz_msgids_cap = msgids_cap;
1194         zone->zsz_msgids = msgids;
1195         zone->zsz_lofi_cap = lofi_cap;
1196         zone->zsz_lofi = lofi;
1197 
1198         sys->zss_processes += processes;
1199         sys->zss_lwps += lwps;
1200         sys->zss_shm += shm;
1201         sys->zss_shmids += shmids;
1202         sys->zss_semids += semids;
1203         sys->zss_msgids += msgids;
1204         sys->zss_lofi += lofi;
1205         zone->zsz_active = B_TRUE;
1206 }
1207 
1208 
1209 /* Determine which zones have halted */
1210 static void
1211 zsd_mark_zones_end(zsd_ctl_t *ctl)
1212 {
1213         zsd_zone_t *zone, *tmp;
1214 
1215         /*
1216          * Mark zone as not existing, or delete if it did not exist in
1217          * previous interval.
1218          */
1219         zone = list_head(&ctl->zsctl_zones);
1220         while (zone != NULL) {
1221                 if (zone->zsz_found == B_FALSE) {
1222                         zone->zsz_empty = B_TRUE;
1223                         if (zone->zsz_deleted == B_TRUE) {
1224                                 /*
1225                                  * Zone deleted in prior interval,
1226                                  * so it no longer exists.
1227                                  */
1228                                 tmp = zone;
1229                                 zone = list_next(&ctl->zsctl_zones, zone);
1230                                 list_remove(&ctl->zsctl_zones, tmp);
1231                                 free(tmp);
1232                                 ctl->zsctl_nzones--;
1233                                 continue;
1234                         } else {
1235                                 zone->zsz_new = B_FALSE;
1236                                 zone->zsz_deleted = B_TRUE;
1237                                 zone->zsz_active = B_TRUE;
1238                         }
1239                 }
1240                 zone = list_next(&ctl->zsctl_zones, zone);
1241         }
1242 }
1243 
1244 /*
1245  * Mark cpus as not existing.  If a cpu is found, it will be updated.  If
1246  * a cpu is not found, then it must have gone offline, so it will be
1247  * deleted.
1248  *
1249  * The kstat tracking data is rolled so that the usage since the previous
1250  * interval can be determined.
1251  */
1252 static void
1253 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1254 {
1255         zsd_cpu_t *cpu;
1256 
1257         /*
1258          * Mark all cpus as not existing.  As cpus are found, they will
1259          * be marked as existing.
1260          */
1261         for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1262             cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1263                 cpu->zsc_found = B_FALSE;
1264                 if (cpu->zsc_active == B_TRUE && roll) {
1265                         cpu->zsc_psetid_prev = cpu->zsc_psetid;
1266                         cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1267                         cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1268                         cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1269                         cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1270                 }
1271         }
1272 }
1273 
1274 /*
1275  * An array the size of the maximum number of cpus is kept.  Within this array
1276  * a list of the online cpus is maintained.
1277  */
1278 zsd_cpu_t *
1279 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1280 {
1281         zsd_cpu_t *cpu;
1282 
1283         assert(cpuid < ctl->zsctl_maxcpuid);
1284         cpu = &(ctl->zsctl_cpu_array[cpuid]);
1285         assert(cpuid == cpu->zsc_id);
1286 
1287         if (cpu->zsc_allocated == B_FALSE) {
1288                 cpu->zsc_allocated = B_TRUE;
1289                 list_insert_tail(&ctl->zsctl_cpus, cpu);
1290         }
1291         return (cpu);
1292 }
1293 
1294 /* A cpu has been found.  Update its information */
1295 static void
1296 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1297 {
1298         /*
1299          * legacy processor sets, the cpu may move while zonestatd is
1300          * inspecting, causing it to be found twice.  In this case, just
1301          * leave cpu in the first processor set in which it was found.
1302          */
1303         if (cpu->zsc_found == B_TRUE)
1304                 return;
1305 
1306         /* Mark cpu as online */
1307         cpu->zsc_found = B_TRUE;
1308         cpu->zsc_offlined = B_FALSE;
1309         cpu->zsc_pset = pset;
1310         /*
1311          * cpu is newly online.
1312          */
1313         if (cpu->zsc_active == B_FALSE) {
1314                 /*
1315                  * Cpu is newly online.
1316                  */
1317                 cpu->zsc_onlined = B_TRUE;
1318                 cpu->zsc_psetid = psetid;
1319                 cpu->zsc_psetid_prev = psetid;
1320         } else {
1321                 /*
1322                  * cpu online during previous interval.  Save properties at
1323                  * start of interval
1324                  */
1325                 cpu->zsc_onlined = B_FALSE;
1326                 cpu->zsc_psetid = psetid;
1327 
1328         }
1329         cpu->zsc_active = B_TRUE;
1330 }
1331 
1332 /* Remove all offlined cpus from the list of tracked cpus */
1333 static void
1334 zsd_mark_cpus_end(zsd_ctl_t *ctl)
1335 {
1336         zsd_cpu_t *cpu, *tmp;
1337         int id;
1338 
1339         /* Mark cpu as online or offline */
1340         cpu = list_head(&ctl->zsctl_cpus);
1341         while (cpu != NULL) {
1342                 if (cpu->zsc_found == B_FALSE) {
1343                         if (cpu->zsc_offlined == B_TRUE) {
1344                                 /*
1345                                  * cpu offlined in prior interval. It is gone.
1346                                  */
1347                                 tmp = cpu;
1348                                 cpu = list_next(&ctl->zsctl_cpus, cpu);
1349                                 list_remove(&ctl->zsctl_cpus, tmp);
1350                                 /* Clear structure for future use */
1351                                 id = tmp->zsc_id;
1352                                 bzero(tmp, sizeof (zsd_cpu_t));
1353                                 tmp->zsc_id = id;
1354                                 tmp->zsc_allocated = B_FALSE;
1355                                 tmp->zsc_psetid = ZS_PSET_ERROR;
1356                                 tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1357 
1358                         } else {
1359                                 /*
1360                                  * cpu online at start of interval.  Treat
1361                                  * as still online, since it was online for
1362                                  * some portion of the interval.
1363                                  */
1364                                 cpu->zsc_offlined = B_TRUE;
1365                                 cpu->zsc_onlined = B_FALSE;
1366                                 cpu->zsc_active = B_TRUE;
1367                                 cpu->zsc_psetid = cpu->zsc_psetid_prev;
1368                                 cpu->zsc_pset = NULL;
1369                         }
1370                 }
1371                 cpu = list_next(&ctl->zsctl_cpus, cpu);
1372         }
1373 }
1374 
1375 /* Some utility functions for managing the list of processor sets */
1376 static zsd_pset_t *
1377 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1378 {
1379         zsd_pset_t *pset;
1380 
1381         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1382             pset = list_next(&ctl->zsctl_psets, pset)) {
1383                 if (pset->zsp_id == psetid)
1384                         return (pset);
1385         }
1386         return (NULL);
1387 }
1388 
1389 static zsd_pset_t *
1390 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1391 {
1392         zsd_pset_t *pset;
1393 
1394         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1395             pset = list_next(&ctl->zsctl_psets, pset)) {
1396                 if (strcmp(pset->zsp_name, psetname) == 0) {
1397                         if (psetid != -1)
1398                                 pset->zsp_id = psetid;
1399                         return (pset);
1400                 }
1401         }
1402         return (NULL);
1403 }
1404 
1405 static zsd_pset_t *
1406 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1407 {
1408         zsd_pset_t *pset;
1409 
1410         if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1411                 return (NULL);
1412 
1413         (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1414         pset->zsp_id = psetid;
1415         pset->zsp_found = B_FALSE;
1416         /*
1417          * Allocate as deleted so if not found in first pass, pset is deleted
1418          * from list.  This can happen if pset is returned by pset_list, but
1419          * is destroyed before first attempt to fetch pset details.
1420          */
1421         list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1422             offsetof(zsd_pset_usage_t, zsu_next));
1423 
1424         pset->zsp_hrstart = g_hrnow;
1425         pset->zsp_deleted = B_TRUE;
1426         pset->zsp_empty = B_TRUE;
1427         ctl->zsctl_npsets++;
1428 
1429         return (pset);
1430 }
1431 
1432 static zsd_pset_t *
1433 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1434 {
1435         zsd_pset_t *pset, *tmp;
1436 
1437         if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1438                 return (pset);
1439 
1440         if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1441                 return (NULL);
1442 
1443         /* Insert sorted by psetname */
1444         tmp = list_head(&ctl->zsctl_psets);
1445         while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1446                 tmp = list_next(&ctl->zsctl_psets, tmp);
1447 
1448         list_insert_before(&ctl->zsctl_psets, tmp, pset);
1449         return (pset);
1450 }
1451 
1452 /* Some utility functions for managing the list of zones using each pset */
1453 static zsd_pset_usage_t *
1454 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1455 {
1456         zsd_pset_usage_t *usage;
1457 
1458         for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1459             usage = list_next(&pset->zsp_usage_list, usage))
1460                 if (usage->zsu_zone == zone)
1461                         return (usage);
1462 
1463         return (NULL);
1464 }
1465 
1466 static zsd_pset_usage_t *
1467 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1468 {
1469         zsd_pset_usage_t *usage;
1470 
1471         if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1472             == NULL)
1473                 return (NULL);
1474 
1475         list_link_init(&usage->zsu_next);
1476         usage->zsu_zone = zone;
1477         usage->zsu_zoneid = zone->zsz_id;
1478         usage->zsu_pset = pset;
1479         usage->zsu_found = B_FALSE;
1480         usage->zsu_active = B_FALSE;
1481         usage->zsu_new = B_FALSE;
1482         /*
1483          * Allocate as not deleted.  If a process is found in a pset for
1484          * a zone, the usage will not be deleted until at least the next
1485          * interval.
1486          */
1487         usage->zsu_start = g_now;
1488         usage->zsu_hrstart = g_hrnow;
1489         usage->zsu_deleted = B_FALSE;
1490         usage->zsu_empty = B_TRUE;
1491         usage->zsu_scheds = 0;
1492         usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1493 
1494         ctl->zsctl_npset_usages++;
1495         pset->zsp_nusage++;
1496 
1497         return (usage);
1498 }
1499 
1500 static zsd_pset_usage_t *
1501 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1502 {
1503         zsd_pset_usage_t *usage, *tmp;
1504 
1505         if ((usage = zsd_lookup_usage(pset, zone))
1506             != NULL)
1507                 return (usage);
1508 
1509         if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1510                 return (NULL);
1511 
1512         tmp = list_head(&pset->zsp_usage_list);
1513         while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1514             > 0)
1515                 tmp = list_next(&pset->zsp_usage_list, tmp);
1516 
1517         list_insert_before(&pset->zsp_usage_list, tmp, usage);
1518         return (usage);
1519 }
1520 
1521 static void
1522 zsd_refresh_system(zsd_ctl_t *ctl)
1523 {
1524         zsd_system_t *system = ctl->zsctl_system;
1525 
1526         /* Re-count these values each interval */
1527         system->zss_processes = 0;
1528         system->zss_lwps = 0;
1529         system->zss_shm = 0;
1530         system->zss_shmids = 0;
1531         system->zss_semids = 0;
1532         system->zss_msgids = 0;
1533         system->zss_lofi = 0;
1534 }
1535 
1536 
1537 /* Reads each cpu's kstats, and adds the usage to the cpu's pset */
1538 static void
1539 zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
1540 {
1541         zsd_system_t *sys;
1542         processorid_t cpuid;
1543         zsd_pset_t *pset_prev;
1544         zsd_pset_t *pset;
1545         kstat_t *kstat;
1546         kstat_named_t *knp;
1547         kid_t kid;
1548         uint64_t idle, intr, kern, user;
1549 
1550         sys = ctl->zsctl_system;
1551         pset = cpu->zsc_pset;
1552         knp = NULL;
1553         kid = -1;
1554         cpuid = cpu->zsc_id;
1555 
1556         /* Get the cpu time totals for this cpu */
1557         kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
1558         if (kstat == NULL)
1559                 return;
1560 
1561         kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
1562         if (kid == -1)
1563                 return;
1564 
1565         knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
1566         if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1567                 return;
1568 
1569         idle = knp->value.ui64;
1570 
1571         knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
1572         if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1573                 return;
1574 
1575         kern = knp->value.ui64;
1576 
1577         knp = kstat_data_lookup(kstat, "cpu_nsec_user");
1578         if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
1579                 return;
1580 
1581         user = knp->value.ui64;
1582 
1583         /*
1584          * Tracking intr time per cpu just exists for future enhancements.
1585          * The value is presently always zero.
1586          */
1587         intr = 0;
1588         cpu->zsc_nsec_idle = idle;
1589         cpu->zsc_nsec_intr = intr;
1590         cpu->zsc_nsec_kern = kern;
1591         cpu->zsc_nsec_user = user;
1592 
1593         if (cpu->zsc_onlined == B_TRUE) {
1594                 /*
1595                  * cpu is newly online.  There is no reference value,
1596                  * so just record its current stats for comparison
1597                  * on next stat read.
1598                  */
1599                 cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1600                 cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1601                 cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1602                 cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1603                 return;
1604         }
1605 
1606         /*
1607          * Calculate relative time since previous refresh.
1608          * Paranoia.  Don't let time  go backwards.
1609          */
1610         idle = intr = kern = user = 0;
1611         if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
1612                 idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;
1613 
1614         if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
1615                 intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;
1616 
1617         if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
1618                 kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;
1619 
1620         if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
1621                 user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;
1622 
1623         /* Update totals for cpu usage */
1624         TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
1625         TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
1626         TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
1627         TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);
1628 
1629         /*
1630          * Add cpu's stats to its pset if it is known to be in
1631          * the pset since previous read.
1632          */
1633         if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
1634             cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
1635             (pset_prev = zsd_lookup_pset_byid(ctl,
1636             cpu->zsc_psetid_prev)) == NULL) {
1637                 TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
1638                 TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
1639                 TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
1640                 TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
1641         } else {
1642                 /*
1643                  * Last pset was different than current pset.
1644                  * Best guess is to split usage between the two.
1645                  */
1646                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
1647                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
1648                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
1649                 TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);
1650 
1651                 TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
1652                     (idle / 2) + (idle % 2));
1653                 TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
1654                     (intr / 2) + (intr % 2));
1655                 TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
1656                     (kern / 2) + (kern % 2));
1657                 TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
1658                     (user / 2) + (user % 2));
1659         }
1660         TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
1661         TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
1662         TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
1663         TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
1664 }
1665 
1666 /* Determine the details of a processor set by pset_id */
1667 static int
1668 zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
1669     size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
1670     uint64_t *min, uint64_t *max, int64_t *importance)
1671 {
1672         uint_t old, num;
1673 
1674         pool_conf_t *conf = ctl->zsctl_pool_conf;
1675         pool_value_t **vals = ctl->zsctl_pool_vals;
1676         pool_resource_t **res_list = NULL;
1677         pool_resource_t *pset;
1678         pool_component_t **cpus = NULL;
1679         processorid_t *cache;
1680         const char *string;
1681         uint64_t uint64;
1682         int64_t int64;
1683         int i, ret, type;
1684 
1685         if (ctl->zsctl_pool_status == POOL_DISABLED) {
1686 
1687                 /*
1688                  * Inspect legacy psets
1689                  */
1690                 for (;;) {
1691                         old = num = ctl->zsctl_cpu_ncache;
1692                         ret = pset_info(psetid, &type, &num,
1693                             ctl->zsctl_cpu_cache);
1694                         if (ret < 0) {
1695                                 /* pset is gone.  Tell caller to retry */
1696                                 errno = EINTR;
1697                                 return (-1);
1698                         }
1699                         if (num <= old) {
1700                         /* Success */
1701                                 break;
1702                         }
1703                         if ((cache = (processorid_t *)realloc(
1704                             ctl->zsctl_cpu_cache, num *
1705                             sizeof (processorid_t))) != NULL) {
1706                                 ctl->zsctl_cpu_ncache = num;
1707                                 ctl->zsctl_cpu_cache = cache;
1708                         } else {
1709                                 /*
1710                                  * Could not allocate to get new cpu list.
1711                                  */
1712                                 zsd_warn(gettext(
1713                                     "Could not allocate for cpu list"));
1714                                 errno = ENOMEM;
1715                                 return (-1);
1716                         }
1717                 }
1718                 /*
1719                  * Old school pset.  Just make min and max equal
1720                  * to its size
1721                  */
1722                 if (psetid == ZS_PSET_DEFAULT) {
1723                         *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1724                         (void) strlcpy(psetname, "pset_default", namelen);
1725                 } else {
1726                         *cputype = ZS_CPUTYPE_PSRSET_PSET;
1727                         (void) snprintf(psetname, namelen,
1728                             "SUNWlegacy_pset_%d", psetid);
1729                 }
1730 
1731                 /*
1732                  * Just treat legacy pset as a simple pool pset
1733                  */
1734                 *online = num;
1735                 *size = num;
1736                 *min = num;
1737                 *max = num;
1738                 *importance = 1;
1739 
1740                 return (0);
1741         }
1742 
1743         /* Look up the pool pset using the pset id */
1744         res_list = NULL;
1745         pool_value_set_int64(vals[1], psetid);
1746         if (pool_value_set_name(vals[1], "pset.sys_id")
1747             != PO_SUCCESS)
1748                 goto err;
1749 
1750         if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1751                 goto err;
1752         if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1753                 goto err;
1754         if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
1755                 goto err;
1756         if (num != 1)
1757                 goto err;
1758         pset = res_list[0];
1759         free(res_list);
1760         res_list = NULL;
1761         if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
1762             "pset.name", vals[0]) != POC_STRING ||
1763             pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1764                 goto err;
1765 
1766         (void) strlcpy(psetname, string, namelen);
1767         if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
1768                 *cputype = ZS_CPUTYPE_DEDICATED;
1769         else if (psetid == ZS_PSET_DEFAULT)
1770                 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
1771         else
1772                 *cputype = ZS_CPUTYPE_POOL_PSET;
1773 
1774         /* Get size, min, max, and importance */
1775         if (pool_get_property(conf, pool_resource_to_elem(conf,
1776             pset), "pset.size", vals[0]) == POC_UINT &&
1777             pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1778                 *size = uint64;
1779         else
1780                 *size = 0;
1781 
1782                 /* Get size, min, max, and importance */
1783         if (pool_get_property(conf, pool_resource_to_elem(conf,
1784             pset), "pset.min", vals[0]) == POC_UINT &&
1785             pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1786                 *min = uint64;
1787         else
1788                 *min = 0;
1789         if (*min >= ZSD_PSET_UNLIMITED)
1790                 *min = ZS_LIMIT_NONE;
1791 
1792         if (pool_get_property(conf, pool_resource_to_elem(conf,
1793             pset), "pset.max", vals[0]) == POC_UINT &&
1794             pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
1795                 *max = uint64;
1796         else
1797                 *max = ZS_LIMIT_NONE;
1798 
1799         if (*max >= ZSD_PSET_UNLIMITED)
1800                 *max = ZS_LIMIT_NONE;
1801 
1802         if (pool_get_property(conf, pool_resource_to_elem(conf,
1803             pset), "pset.importance", vals[0]) == POC_INT &&
1804             pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
1805                 *importance = int64;
1806         else
1807                 *importance = (uint64_t)1;
1808 
1809         *online = 0;
1810         if (*size == 0)
1811                 return (0);
1812 
1813         /* get cpus */
1814         cpus = pool_query_resource_components(conf, pset, &num, NULL);
1815         if (cpus == NULL)
1816                 goto err;
1817 
1818         /* Make sure there is space for cpu id list */
1819         if (num > ctl->zsctl_cpu_ncache) {
1820                 if ((cache = (processorid_t *)realloc(
1821                     ctl->zsctl_cpu_cache, num *
1822                     sizeof (processorid_t))) != NULL) {
1823                         ctl->zsctl_cpu_ncache = num;
1824                         ctl->zsctl_cpu_cache = cache;
1825                 } else {
1826                         /*
1827                          * Could not allocate to get new cpu list.
1828                          */
1829                         zsd_warn(gettext(
1830                             "Could not allocate for cpu list"));
1831                         goto err;
1832                 }
1833         }
1834 
1835         /* count the online cpus */
1836         for (i = 0; i < num; i++) {
1837                 if (pool_get_property(conf, pool_component_to_elem(
1838                     conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
1839                     pool_value_get_string(vals[0], &string) != PO_SUCCESS)
1840                         goto err;
1841 
1842                 if (strcmp(string, "on-line") != 0 &&
1843                     strcmp(string, "no-intr") != 0)
1844                         continue;
1845 
1846                 if (pool_get_property(conf, pool_component_to_elem(
1847                     conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
1848                     pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
1849                         goto err;
1850 
1851                 (*online)++;
1852                 ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
1853         }
1854         free(cpus);
1855         return (0);
1856 err:
1857         if (res_list != NULL)
1858                 free(res_list);
1859         if (cpus != NULL)
1860                 free(cpus);
1861 
1862         /*
1863          * The pools operations should succeed since the conf is a consistent
1864          * snapshot.  Tell caller there is no need to retry.
1865          */
1866         errno = EINVAL;
1867         return (-1);
1868 }
1869 
1870 /*
1871  * Update the current list of processor sets.
1872  * This also updates the list of online cpus, and each cpu's pset membership.
1873  */
1874 static void
1875 zsd_refresh_psets(zsd_ctl_t *ctl)
1876 {
1877         int i, j, ret, state;
1878         uint_t old, num;
1879         uint_t cputype;
1880         int64_t sys_id, importance;
1881         uint64_t online, size, min, max;
1882         zsd_system_t *system;
1883         zsd_pset_t *pset;
1884         zsd_cpu_t *cpu;
1885         psetid_t *cache;
1886         char psetname[ZS_PSETNAME_MAX];
1887         processorid_t cpuid;
1888         pool_value_t *pv_save = NULL;
1889         pool_resource_t **res_list = NULL;
1890         pool_resource_t *res;
1891         pool_value_t **vals;
1892         pool_conf_t *conf;
1893         boolean_t roll_cpus = B_TRUE;
1894 
1895         /* Zero cpu counters to recount them */
1896         system = ctl->zsctl_system;
1897         system->zss_ncpus = 0;
1898         system->zss_ncpus_online = 0;
1899 retry:
1900         ret = pool_get_status(&state);
1901         if (ret == 0 && state == POOL_ENABLED) {
1902 
1903                 conf = ctl->zsctl_pool_conf;
1904                 vals = ctl->zsctl_pool_vals;
1905                 pv_save = vals[1];
1906                 vals[1] = NULL;
1907 
1908                 if (ctl->zsctl_pool_status == POOL_DISABLED) {
1909                         if (pool_conf_open(ctl->zsctl_pool_conf,
1910                             pool_dynamic_location(), PO_RDONLY) == 0) {
1911                                 ctl->zsctl_pool_status = POOL_ENABLED;
1912                                 ctl->zsctl_pool_changed = POU_PSET;
1913                         }
1914                 } else {
1915                         ctl->zsctl_pool_changed = 0;
1916                         ret = pool_conf_update(ctl->zsctl_pool_conf,
1917                             &(ctl->zsctl_pool_changed));
1918                         if (ret < 0) {
1919                                 /* Pools must have become disabled */
1920                                 (void) pool_conf_close(ctl->zsctl_pool_conf);
1921                                 ctl->zsctl_pool_status = POOL_DISABLED;
1922                                 if (pool_error() == POE_SYSTEM && errno ==
1923                                     ENOTACTIVE)
1924                                         goto retry;
1925 
1926                                 zsd_warn(gettext(
1927                                     "Unable to update pool configuration"));
1928                                 /* Not able to get pool info.  Don't update. */
1929                                 goto err;
1930                         }
1931                 }
1932                 /* Get the list of psets using libpool */
1933                 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
1934                         goto err;
1935 
1936                 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
1937                         goto err;
1938                 if ((res_list = pool_query_resources(conf, &num, vals))
1939                     == NULL)
1940                         goto err;
1941 
1942                 if (num > ctl->zsctl_pset_ncache)  {
1943                         if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1944                             (num) * sizeof (psetid_t))) == NULL) {
1945                                 goto err;
1946                         }
1947                         ctl->zsctl_pset_ncache = num;
1948                         ctl->zsctl_pset_cache = cache;
1949                 }
1950                 /* Save the pset id of each pset */
1951                 for (i = 0; i < num; i++) {
1952                         res = res_list[i];
1953                         if (pool_get_property(conf, pool_resource_to_elem(conf,
1954                             res), "pset.sys_id", vals[0]) != POC_INT ||
1955                             pool_value_get_int64(vals[0], &sys_id)
1956                             != PO_SUCCESS)
1957                                 goto err;
1958                         ctl->zsctl_pset_cache[i] = (int)sys_id;
1959                 }
1960                 vals[1] = pv_save;
1961                 pv_save = NULL;
1962         } else {
1963                 if (ctl->zsctl_pool_status == POOL_ENABLED) {
1964                         (void) pool_conf_close(ctl->zsctl_pool_conf);
1965                         ctl->zsctl_pool_status = POOL_DISABLED;
1966                 }
1967                 /* Get the pset list using legacy psets */
1968                 for (;;) {
1969                         old = num = ctl->zsctl_pset_ncache;
1970                         (void) pset_list(ctl->zsctl_pset_cache, &num);
1971                         if ((num + 1) <= old) {
1972                                 break;
1973                         }
1974                         if ((cache = (psetid_t *)realloc(ctl->zsctl_pset_cache,
1975                             (num + 1) * sizeof (psetid_t))) != NULL) {
1976                                 ctl->zsctl_pset_ncache = num + 1;
1977                                 ctl->zsctl_pset_cache = cache;
1978                         } else {
1979                                 /*
1980                                  * Could not allocate to get new pset list.
1981                                  * Give up
1982                                  */
1983                                 return;
1984                         }
1985                 }
1986                 /* Add the default pset to list */
1987                 ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
1988                 ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
1989                 num++;
1990         }
1991 psets_changed:
1992         zsd_mark_cpus_start(ctl, roll_cpus);
1993         zsd_mark_psets_start(ctl);
1994         roll_cpus = B_FALSE;
1995 
1996         /* Refresh cpu membership of all psets */
1997         for (i = 0; i < num; i++) {
1998 
1999                 /* Get pool pset information */
2000                 sys_id = ctl->zsctl_pset_cache[i];
2001                 if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
2002                     &cputype, &online, &size, &min, &max, &importance)
2003                     != 0) {
2004                         if (errno == EINTR)
2005                                 goto psets_changed;
2006                         zsd_warn(gettext("Failed to get info for pset %d"),
2007                             sys_id);
2008                         continue;
2009                 }
2010 
2011                 system->zss_ncpus += size;
2012                 system->zss_ncpus_online += online;
2013 
2014                 pset = zsd_lookup_insert_pset(ctl, psetname,
2015                     ctl->zsctl_pset_cache[i]);
2016 
2017                 /* update pset info */
2018                 zsd_mark_pset_found(pset, cputype, online, size, min,
2019                     max, importance);
2020 
2021                 /* update each cpu in pset */
2022                 for (j = 0; j < pset->zsp_online; j++) {
2023                         cpuid = ctl->zsctl_cpu_cache[j];
2024                         cpu = zsd_lookup_insert_cpu(ctl, cpuid);
2025                         zsd_mark_cpu_found(cpu, pset, sys_id);
2026                 }
2027         }
2028 err:
2029         if (res_list != NULL)
2030                 free(res_list);
2031         if (pv_save != NULL)
2032                 vals[1] = pv_save;
2033 }
2034 
2035 
2036 
2037 /*
2038  * Fetch the current pool and pset name for the given zone.
2039  */
2040 static void
2041 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2042     char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2043 {
2044         poolid_t poolid;
2045         pool_t **pools = NULL;
2046         pool_resource_t **res_list = NULL;
2047         char poolname[ZS_POOLNAME_MAX];
2048         char psetname[ZS_PSETNAME_MAX];
2049         pool_conf_t *conf = ctl->zsctl_pool_conf;
2050         pool_value_t *pv_save = NULL;
2051         pool_value_t **vals = ctl->zsctl_pool_vals;
2052         const char *string;
2053         int ret;
2054         int64_t int64;
2055         uint_t num;
2056 
2057         ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2058             &poolid, sizeof (poolid));
2059         if (ret < 0)
2060                 goto lookup_done;
2061 
2062         pv_save = vals[1];
2063         vals[1] = NULL;
2064         pools = NULL;
2065         res_list = NULL;
2066 
2067         /* Default values if lookup fails */
2068         (void) strlcpy(poolname, "pool_default", sizeof (poolname));
2069         (void) strlcpy(psetname, "pset_default", sizeof (poolname));
2070         *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2071 
2072         /* no dedicated cpu if pools are disabled */
2073         if (ctl->zsctl_pool_status == POOL_DISABLED)
2074                 goto lookup_done;
2075 
2076         /* Get the pool name using the id */
2077         pool_value_set_int64(vals[0], poolid);
2078         if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2079                 goto lookup_done;
2080 
2081         if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2082                 goto lookup_done;
2083 
2084         if (num != 1)
2085                 goto lookup_done;
2086 
2087         if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2088             "pool.name", vals[0]) != POC_STRING ||
2089             pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2090                 goto lookup_done;
2091         (void) strlcpy(poolname, (char *)string, sizeof (poolname));
2092 
2093         /* Get the name of the pset for the pool */
2094         if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2095                 goto lookup_done;
2096 
2097         if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2098                 goto lookup_done;
2099 
2100         if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2101             == NULL)
2102                 goto lookup_done;
2103 
2104         if (num != 1)
2105                 goto lookup_done;
2106 
2107         if (pool_get_property(conf, pool_resource_to_elem(conf,
2108             res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2109             pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2110                 goto lookup_done;
2111 
2112         if (int64 == ZS_PSET_DEFAULT)
2113                 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2114 
2115         if (pool_get_property(conf, pool_resource_to_elem(conf,
2116             res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2117             pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2118                 goto lookup_done;
2119 
2120         (void) strlcpy(psetname, (char *)string, sizeof (psetname));
2121 
2122         if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2123                 *cputype = ZS_CPUTYPE_DEDICATED;
2124         if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2125                 *cputype = ZS_CPUTYPE_PSRSET_PSET;
2126         else
2127                 *cputype = ZS_CPUTYPE_POOL_PSET;
2128 
2129 lookup_done:
2130 
2131         if (pv_save != NULL)
2132                 vals[1] = pv_save;
2133 
2134         if (res_list)
2135                 free(res_list);
2136         if (pools)
2137                 free(pools);
2138 
2139         (void) strlcpy(pool, poolname, poollen);
2140         (void) strlcpy(pset, psetname, psetlen);
2141 }
2142 
2143 /* Convert scheduler names to ZS_* scheduler flags */
2144 static uint_t
2145 zsd_schedname2int(char *clname, int pri)
2146 {
2147         uint_t sched = 0;
2148 
2149         if (strcmp(clname, "TS") == 0) {
2150                 sched = ZS_SCHED_TS;
2151         } else if (strcmp(clname, "IA") == 0) {
2152                 sched = ZS_SCHED_IA;
2153         } else if (strcmp(clname, "FX") == 0) {
2154                 if (pri > 59) {
2155                         sched = ZS_SCHED_FX_60;
2156                 } else {
2157                         sched = ZS_SCHED_FX;
2158                 }
2159         } else if (strcmp(clname, "RT") == 0) {
2160                 sched = ZS_SCHED_RT;
2161 
2162         } else if (strcmp(clname, "FSS") == 0) {
2163                 sched = ZS_SCHED_FSS;
2164         }
2165         return (sched);
2166 }
2167 
2168 static uint64_t
2169 zsd_get_zone_rctl_limit(char *name)
2170 {
2171         rctlblk_t *rblk;
2172 
2173         rblk = (rctlblk_t *)alloca(rctlblk_size());
2174         if (getrctl(name, NULL, rblk, RCTL_FIRST)
2175             != 0) {
2176                 return (ZS_LIMIT_NONE);
2177         }
2178         return (rctlblk_get_value(rblk));
2179 }
2180 
2181 static uint64_t
2182 zsd_get_zone_rctl_usage(char *name)
2183 {
2184         rctlblk_t *rblk;
2185 
2186         rblk = (rctlblk_t *)alloca(rctlblk_size());
2187         if (getrctl(name, NULL, rblk, RCTL_USAGE)
2188             != 0) {
2189                 return (0);
2190         }
2191         return (rctlblk_get_value(rblk));
2192 }
2193 
2194 #define ZSD_NUM_RCTL_VALS 20
2195 
2196 /*
2197  * Fetch the limit information for a zone.  This uses zone_enter() as the
2198  * getrctl(2) system call only returns rctl information for the zone of
2199  * the caller.
2200  */
2201 static int
2202 zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
2203     uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
2204     uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
2205     uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
2206     uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
2207     uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
2208     uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
2209 {
2210         int p[2], pid, tmpl_fd, ret;
2211         ctid_t ct;
2212         char class[PC_CLNMSZ];
2213         uint64_t vals[ZSD_NUM_RCTL_VALS];
2214         zsd_system_t *sys = ctl->zsctl_system;
2215         int i = 0;
2216         int res = 0;
2217 
2218         /* Treat all caps as no cap on error */
2219         *cpu_shares = ZS_LIMIT_NONE;
2220         *cpu_cap = ZS_LIMIT_NONE;
2221         *ram_cap = ZS_LIMIT_NONE;
2222         *locked_cap = ZS_LIMIT_NONE;
2223         *vm_cap = ZS_LIMIT_NONE;
2224 
2225         *processes_cap = ZS_LIMIT_NONE;
2226         *lwps_cap = ZS_LIMIT_NONE;
2227         *shm_cap = ZS_LIMIT_NONE;
2228         *shmids_cap = ZS_LIMIT_NONE;
2229         *semids_cap = ZS_LIMIT_NONE;
2230         *msgids_cap = ZS_LIMIT_NONE;
2231         *lofi_cap = ZS_LIMIT_NONE;
2232 
2233         *processes = 0;
2234         *lwps = 0;
2235         *shm = 0;
2236         *shmids = 0;
2237         *semids = 0;
2238         *msgids = 0;
2239         *lofi = 0;
2240 
2241         /* Get the zone's default scheduling class */
2242         ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
2243             class, sizeof (class));
2244         if (ret < 0)
2245                 return (-1);
2246 
2247         *sched = zsd_schedname2int(class, 0);
2248 
2249         /* rctl caps must be fetched from within the zone */
2250         if (pipe(p) != 0)
2251                 return (-1);
2252 
2253         if ((tmpl_fd = init_template()) == -1) {
2254                 (void) close(p[0]);
2255                 (void) close(p[1]);
2256                 return (-1);
2257         }
2258         pid = forkx(0);
2259         if (pid < 0) {
2260                 (void) ct_tmpl_clear(tmpl_fd);
2261                 (void) close(p[0]);
2262                 (void) close(p[1]);
2263                 return (-1);
2264         }
2265         if (pid == 0) {
2266 
2267                 (void) ct_tmpl_clear(tmpl_fd);
2268                 (void) close(tmpl_fd);
2269                 (void) close(p[0]);
2270                 if (zone->zsz_id != getzoneid()) {
2271                         if (zone_enter(zone->zsz_id) < 0) {
2272                                 (void) close(p[1]);
2273                                 _exit(0);
2274                         }
2275                 }
2276 
2277                 /* Get caps for zone, and write them to zonestatd parent. */
2278                 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
2279                 vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
2280                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
2281                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
2282                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
2283                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
2284                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
2285                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
2286                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
2287                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
2288                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
2289                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
2290                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
2291                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
2292                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
2293                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
2294                 vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
2295                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");
2296                 vals[i++] = zsd_get_zone_rctl_usage("zone.max-physical-memory");
2297 
2298                 if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2299                     ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2300                         (void) close(p[1]);
2301                         _exit(1);
2302                 }
2303 
2304                 (void) close(p[1]);
2305                 _exit(0);
2306         }
2307         if (contract_latest(&ct) == -1)
2308                 ct = -1;
2309 
2310         (void) ct_tmpl_clear(tmpl_fd);
2311         (void) close(tmpl_fd);
2312         (void) close(p[1]);
2313         while (waitpid(pid, NULL, 0) != pid)
2314                 ;
2315 
2316         /* Read cap from child in zone */
2317         if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
2318             ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
2319                 res = -1;
2320                 goto cleanup;
2321         }
2322         i = 0;
2323         *cpu_shares = vals[i++];
2324         *cpu_cap = vals[i++];
2325         *locked_cap = vals[i++];
2326         *vm_cap = vals[i++];
2327         *processes_cap = vals[i++];
2328         *processes = vals[i++];
2329         *lwps_cap = vals[i++];
2330         *lwps = vals[i++];
2331         *shm_cap = vals[i++];
2332         *shm = vals[i++];
2333         *shmids_cap = vals[i++];
2334         *shmids = vals[i++];
2335         *semids_cap = vals[i++];
2336         *semids = vals[i++];
2337         *msgids_cap = vals[i++];
2338         *msgids = vals[i++];
2339         *lofi_cap = vals[i++];
2340         *lofi = vals[i++];
2341         *ram_cap = vals[i++];
2342 
2343         /* Interpret maximum values as no cap */
2344         if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
2345                 *cpu_cap = ZS_LIMIT_NONE;
2346         if (*processes_cap == sys->zss_processes_max)
2347                 *processes_cap = ZS_LIMIT_NONE;
2348         if (*lwps_cap == sys->zss_lwps_max)
2349                 *lwps_cap = ZS_LIMIT_NONE;
2350         if (*shm_cap == sys->zss_shm_max)
2351                 *shm_cap = ZS_LIMIT_NONE;
2352         if (*shmids_cap == sys->zss_shmids_max)
2353                 *shmids_cap = ZS_LIMIT_NONE;
2354         if (*semids_cap == sys->zss_semids_max)
2355                 *semids_cap = ZS_LIMIT_NONE;
2356         if (*msgids_cap == sys->zss_msgids_max)
2357                 *msgids_cap = ZS_LIMIT_NONE;
2358         if (*lofi_cap == sys->zss_lofi_max)
2359                 *lofi_cap = ZS_LIMIT_NONE;
2360 
2361 
2362 cleanup:
2363         (void) close(p[0]);
2364         (void) ct_tmpl_clear(tmpl_fd);
2365         (void) close(tmpl_fd);
2366         (void) contract_abandon_id(ct);
2367 
2368         return (res);
2369 }
2370 
2371 /* Update the current list of running zones */
2372 static void
2373 zsd_refresh_zones(zsd_ctl_t *ctl)
2374 {
2375         zsd_zone_t *zone;
2376         uint_t old, num;
2377         ushort_t flags;
2378         int i, ret;
2379         zoneid_t *cache;
2380         uint64_t cpu_shares;
2381         uint64_t cpu_cap;
2382         uint64_t ram_cap;
2383         uint64_t locked_cap;
2384         uint64_t vm_cap;
2385         uint64_t processes_cap;
2386         uint64_t processes;
2387         uint64_t lwps_cap;
2388         uint64_t lwps;
2389         uint64_t shm_cap;
2390         uint64_t shm;
2391         uint64_t shmids_cap;
2392         uint64_t shmids;
2393         uint64_t semids_cap;
2394         uint64_t semids;
2395         uint64_t msgids_cap;
2396         uint64_t msgids;
2397         uint64_t lofi_cap;
2398         uint64_t lofi;
2399 
2400         char zonename[ZS_ZONENAME_MAX];
2401         char poolname[ZS_POOLNAME_MAX];
2402         char psetname[ZS_PSETNAME_MAX];
2403         uint_t sched;
2404         uint_t cputype;
2405         uint_t iptype;
2406 
2407         /* Get the current list of running zones */
2408         for (;;) {
2409                 old = num = ctl->zsctl_zone_ncache;
2410                 (void) zone_list(ctl->zsctl_zone_cache, &num);
2411                 if (num <= old)
2412                         break;
2413                 if ((cache = (zoneid_t *)realloc(ctl->zsctl_zone_cache,
2414                     (num) * sizeof (zoneid_t))) != NULL) {
2415                         ctl->zsctl_zone_ncache = num;
2416                         ctl->zsctl_zone_cache = cache;
2417                 } else {
2418                         /* Could not allocate to get new zone list.  Give up */
2419                         return;
2420                 }
2421         }
2422 
2423         zsd_mark_zones_start(ctl);
2424 
2425         for (i = 0; i < num; i++) {
2426 
2427                 ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2428                     zonename, sizeof (zonename));
2429                 if (ret < 0)
2430                         continue;
2431 
2432                 zone = zsd_lookup_insert_zone(ctl, zonename,
2433                     ctl->zsctl_zone_cache[i]);
2434 
2435                 ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2436                     &flags, sizeof (flags));
2437                 if (ret < 0)
2438                         continue;
2439 
2440                 if (flags & ZF_NET_EXCL)
2441                         iptype = ZS_IPTYPE_EXCLUSIVE;
2442                 else
2443                         iptype = ZS_IPTYPE_SHARED;
2444 
2445                 zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2446                     psetname, sizeof (psetname), &cputype);
2447 
2448                 if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2449                     &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2450                     &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2451                     &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2452                     &lofi, &sched) != 0)
2453                         continue;
2454 
2455                 zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2456                     locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2457                     lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2458                     semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2459                     psetname, sched, cputype, iptype);
2460         }
2461 }
2462 
2463 /* Fetch the details of a process from its psinfo_t */
2464 static void
2465 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2466     psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2467     timestruc_t *delta, uint_t *sched)
2468 {
2469         timestruc_t d;
2470         zsd_proc_t *proc;
2471 
2472         /* Get cached data for proc */
2473         proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2474         *psetid = psinfo->pr_lwp.pr_bindpset;
2475 
2476         if (proc->zspr_psetid == ZS_PSET_ERROR)
2477                 *prev_psetid = *psetid;
2478         else
2479                 *prev_psetid = proc->zspr_psetid;
2480 
2481         *zoneid = psinfo->pr_zoneid;
2482         if (proc->zspr_zoneid == -1)
2483                 *prev_zoneid = *zoneid;
2484         else
2485                 *prev_zoneid = proc->zspr_zoneid;
2486 
2487         TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2488         *delta = d;
2489 
2490         *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2491             psinfo->pr_lwp.pr_pri);
2492 
2493         /* Update cached data for proc */
2494         proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2495         proc->zspr_zoneid = psinfo->pr_zoneid;
2496         proc->zspr_sched = *sched;
2497         proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2498         proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2499         proc->zspr_ppid = psinfo->pr_ppid;
2500 }
2501 
2502 /*
2503  * Reset the known cpu usage of a process. This is done after a process
2504  * exits so that if the pid is recycled, data from its previous life is
2505  * not reused
2506  */
2507 static void
2508 zsd_flush_proc_info(zsd_proc_t *proc)
2509 {
2510         proc->zspr_usage.tv_sec = 0;
2511         proc->zspr_usage.tv_nsec = 0;
2512 }
2513 
2514 /*
2515  * Open the current extended accounting file.  On initialization, open the
2516  * file as the current file to be used.  Otherwise, open the file as the
2517  * next file to use of the current file reaches EOF.
2518  */
2519 static int
2520 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2521 {
2522         int ret, oret, state, trys = 0, flags;
2523         int *fd, *open;
2524         ea_file_t *eaf;
2525         struct stat64 *stat;
2526         char path[MAXPATHLEN];
2527 
2528         /*
2529          * The accounting file is first opened at the tail.  Following
2530          * opens to new accounting files are opened at the head.
2531          */
2532         if (init == B_TRUE) {
2533                 flags = EO_NO_VALID_HDR | EO_TAIL;
2534                 fd = &ctl->zsctl_proc_fd;
2535                 eaf = &ctl->zsctl_proc_eaf;
2536                 stat = &ctl->zsctl_proc_stat;
2537                 open = &ctl->zsctl_proc_open;
2538         } else {
2539                 flags = EO_NO_VALID_HDR | EO_HEAD;
2540                 fd = &ctl->zsctl_proc_fd_next;
2541                 eaf = &ctl->zsctl_proc_eaf_next;
2542                 stat = &ctl->zsctl_proc_stat_next;
2543                 open = &ctl->zsctl_proc_open_next;
2544         }
2545 
2546         *fd = -1;
2547         *open = 0;
2548 retry:
2549         /* open accounting files for cpu consumption */
2550         ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2551         if (ret != 0) {
2552                 zsd_warn(gettext("Unable to get process accounting state"));
2553                 goto err;
2554         }
2555         if (state != AC_ON) {
2556                 if (trys > 0) {
2557                         zsd_warn(gettext(
2558                             "Unable to enable process accounting"));
2559                         goto err;
2560                 }
2561                 (void) zsd_enable_cpu_stats();
2562                 trys++;
2563                 goto retry;
2564         }
2565 
2566         ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2567         if (ret != 0) {
2568                 zsd_warn(gettext("Unable to get process accounting file"));
2569                 goto err;
2570         }
2571 
2572         if ((*fd = open64(path, O_RDONLY, 0)) >= 0 &&
2573             (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2574                 ret = fstat64(*fd, stat);
2575 
2576         if (*fd < 0 || oret < 0 || ret < 0) {
2577                 struct timespec ts;
2578 
2579                 /*
2580                  * It is possible the accounting file is momentarily unavailable
2581                  * because it is being rolled.  Try for up to half a second.
2582                  *
2583                  * If failure to open accounting file persists, give up.
2584                  */
2585                 if (oret == 0)
2586                         (void) ea_close(eaf);
2587                 else if (*fd >= 0)
2588                         (void) close(*fd);
2589                 if (trys > 500) {
2590                         zsd_warn(gettext(
2591                             "Unable to open process accounting file"));
2592                         goto err;
2593                 }
2594                 /* wait one millisecond */
2595                 ts.tv_sec = 0;
2596                 ts.tv_nsec = NANOSEC / 1000;
2597                 (void) nanosleep(&ts, NULL);
2598                 goto retry;
2599         }
2600         *open = 1;
2601         return (0);
2602 err:
2603         if (*fd >= 0)
2604                 (void) close(*fd);
2605         *open = 0;
2606         *fd = -1;
2607         return (-1);
2608 }
2609 
2610 /*
2611  * Walk /proc and charge each process to its zone and processor set.
2612  * Then read exacct data for exited processes, and charge them as well.
2613  */
2614 static void
2615 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2616 {
2617         DIR *dir;
2618         struct dirent *dent;
2619         psinfo_t psinfo;
2620         int fd, ret;
2621         zsd_proc_t *proc, *pproc, *tmp, *next;
2622         list_t pplist, plist;
2623         zsd_zone_t *zone, *prev_zone;
2624         zsd_pset_t *pset, *prev_pset;
2625         psetid_t psetid, prev_psetid;
2626         zoneid_t zoneid, prev_zoneid;
2627         zsd_pset_usage_t *usage, *prev_usage;
2628         char path[MAXPATHLEN];
2629 
2630         ea_object_t object;
2631         ea_object_t pobject;
2632         boolean_t hrtime_expired = B_FALSE;
2633         struct timeval interval_end;
2634 
2635         timestruc_t delta, d1, d2;
2636         uint_t sched = 0;
2637 
2638         /*
2639          * Get the current accounting file.  The current accounting file
2640          * may be different than the file in use, as the accounting file
2641          * may have been rolled, or manually changed by an admin.
2642          */
2643         ret = zsd_open_exacct(ctl, init);
2644         if (ret != 0) {
2645                 zsd_warn(gettext("Unable to track process accounting"));
2646                 return;
2647         }
2648 
2649         /*
2650          * Mark the current time as the interval end time.  Don't track
2651          * processes that exit after this time.
2652          */
2653         (void) gettimeofday(&interval_end, NULL);
2654 
2655         dir = opendir("/proc");
2656         if (dir == NULL) {
2657                 zsd_warn(gettext("Unable to open /proc"));
2658                 return;
2659         }
2660 
2661         dent = ctl->zsctl_procfs_dent;
2662 
2663         (void) memset(dent, 0, ctl->zsctl_procfs_dent_size);
2664 
2665         /* Walk all processes and compute each zone's usage on each pset. */
2666         while (readdir_r(dir, dent) != 0) {
2667 
2668                 if (strcmp(dent->d_name, ".") == 0 ||
2669                     strcmp(dent->d_name, "..") == 0)
2670                         continue;
2671 
2672                 (void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2673                     dent->d_name);
2674 
2675                 fd = open(path, O_RDONLY);
2676                 if (fd < 0)
2677                         continue;
2678 
2679                 if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2680                         (void) close(fd);
2681                         continue;
2682                 }
2683                 (void) close(fd);
2684 
2685                 zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2686                     &zoneid, &prev_zoneid, &delta, &sched);
2687 
2688                 d1.tv_sec = delta.tv_sec / 2;
2689                 d1.tv_nsec = delta.tv_nsec / 2;
2690                 d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2691                 d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2692 
2693                 /* Get the zone and pset this process is running in */
2694                 zone = zsd_lookup_zone_byid(ctl, zoneid);
2695                 if (zone == NULL)
2696                         continue;
2697                 pset = zsd_lookup_pset_byid(ctl, psetid);
2698                 if (pset == NULL)
2699                         continue;
2700                 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2701                 if (usage == NULL)
2702                         continue;
2703 
2704                 /*
2705                  * Get the usage of the previous zone and pset if they were
2706                  * different.
2707                  */
2708                 if (zoneid != prev_zoneid)
2709                         prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2710                 else
2711                         prev_zone = NULL;
2712 
2713                 if (psetid != prev_psetid)
2714                         prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2715                 else
2716                         prev_pset = NULL;
2717 
2718                 prev_usage = NULL;
2719                 if (prev_zone != NULL || prev_pset != NULL) {
2720                         if (prev_zone == NULL)
2721                                 prev_zone = zone;
2722                         if (prev_pset == NULL)
2723                                 prev_pset = pset;
2724 
2725                         prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2726                             prev_zone);
2727                 }
2728 
2729                 /* Update the usage with the processes info */
2730                 if (prev_usage == NULL) {
2731                         zsd_mark_pset_usage_found(usage, sched);
2732                 } else {
2733                         zsd_mark_pset_usage_found(usage, sched);
2734                         zsd_mark_pset_usage_found(prev_usage, sched);
2735                 }
2736 
2737                 /*
2738                  * First time around is just to get a starting point.  All
2739                  * usages will be zero.
2740                  */
2741                 if (init == B_TRUE)
2742                         continue;
2743 
2744                 if (prev_usage == NULL) {
2745                         zsd_add_usage(ctl, usage, &delta);
2746                 } else {
2747                         zsd_add_usage(ctl, usage, &d1);
2748                         zsd_add_usage(ctl, prev_usage, &d2);
2749                 }
2750         }
2751         (void) closedir(dir);
2752 
2753         /*
2754          * No need to collect exited proc data on initialization.  Just
2755          * caching the usage of the known processes to get a zero starting
2756          * point.
2757          */
2758         if (init == B_TRUE)
2759                 return;
2760 
2761         /*
2762          * Add accounting records to account for processes which have
2763          * exited.
2764          */
2765         list_create(&plist, sizeof (zsd_proc_t),
2766             offsetof(zsd_proc_t, zspr_next));
2767         list_create(&pplist, sizeof (zsd_proc_t),
2768             offsetof(zsd_proc_t, zspr_next));
2769 
2770         for (;;) {
2771                 pid_t pid;
2772                 pid_t ppid;
2773                 timestruc_t user, sys, proc_usage;
2774                 timestruc_t finish;
2775                 int numfound = 0;
2776 
2777                 bzero(&object, sizeof (object));
2778                 proc = NULL;
2779                 zone = NULL;
2780                 pset = NULL;
2781                 usage = NULL;
2782                 ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2783                 if (ret == EO_ERROR) {
2784                         if (ea_error() == EXR_EOF) {
2785 
2786                                 struct stat64 *stat;
2787                                 struct stat64 *stat_next;
2788 
2789                                 /*
2790                                  * See if the next accounting file is the
2791                                  * same as the current accounting file.
2792                                  */
2793                                 stat = &(ctl->zsctl_proc_stat);
2794                                 stat_next = &(ctl->zsctl_proc_stat_next);
2795                                 if (stat->st_ino == stat_next->st_ino &&
2796                                     stat->st_dev == stat_next->st_dev) {
2797                                         /*
2798                                          * End of current accounting file is
2799                                          * reached, so finished.  Clear EOF
2800                                          * bit for next time around.
2801                                          */
2802                                         ea_clear(&ctl->zsctl_proc_eaf);
2803                                         break;
2804                                 } else {
2805                                         /*
2806                                          * Accounting file has changed.  Move
2807                                          * to current accounting file.
2808                                          */
2809                                         (void) ea_close(&ctl->zsctl_proc_eaf);
2810 
2811                                         ctl->zsctl_proc_fd =
2812                                             ctl->zsctl_proc_fd_next;
2813                                         ctl->zsctl_proc_eaf =
2814                                             ctl->zsctl_proc_eaf_next;
2815                                         ctl->zsctl_proc_stat =
2816                                             ctl->zsctl_proc_stat_next;
2817 
2818                                         ctl->zsctl_proc_fd_next = -1;
2819                                         ctl->zsctl_proc_open_next = 0;
2820                                         continue;
2821                                 }
2822                         } else {
2823                                 /*
2824                                  * Other accounting error.  Give up on
2825                                  * accounting.
2826                                  */
2827                                 goto ea_err;
2828                         }
2829                 }
2830                 /* Skip if not a process group */
2831                 if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2832                     (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2833                         (void) ea_free_item(&object, EUP_ALLOC);
2834                         continue;
2835                 }
2836 
2837                 /* The process group entry should be complete */
2838                 while (numfound < 9) {
2839                         bzero(&pobject, sizeof (pobject));
2840                         ret = ea_get_object(&ctl->zsctl_proc_eaf,
2841                             &pobject);
2842                         if (ret < 0) {
2843                                 (void) ea_free_item(&object, EUP_ALLOC);
2844                                 zsd_warn(
2845                                     "unable to get process accounting data");
2846                                 goto ea_err;
2847                         }
2848                         /* Next entries should be process data */
2849                         if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2850                             EXT_GROUP) {
2851                                 (void) ea_free_item(&object, EUP_ALLOC);
2852                                 (void) ea_free_item(&pobject, EUP_ALLOC);
2853                                 zsd_warn(
2854                                     "process data of wrong type");
2855                                 goto ea_err;
2856                         }
2857                         switch (pobject.eo_catalog & EXD_DATA_MASK) {
2858                         case EXD_PROC_PID:
2859                                 pid = pobject.eo_item.ei_uint32;
2860                                 proc = &(ctl->zsctl_proc_array[pid]);
2861                                 /*
2862                                  * This process should not be currently in
2863                                  * the list of processes to process.
2864                                  */
2865                                 assert(!list_link_active(&proc->zspr_next));
2866                                 numfound++;
2867                                 break;
2868                         case EXD_PROC_ANCPID:
2869                                 ppid = pobject.eo_item.ei_uint32;
2870                                 pproc = &(ctl->zsctl_proc_array[ppid]);
2871                                 numfound++;
2872                                 break;
2873                         case EXD_PROC_ZONENAME:
2874                                 zone = zsd_lookup_zone(ctl,
2875                                     pobject.eo_item.ei_string, -1);
2876                                 numfound++;
2877                                 break;
2878                         case EXD_PROC_CPU_USER_SEC:
2879                                 user.tv_sec =
2880                                     pobject.eo_item.ei_uint64;
2881                                 numfound++;
2882                                 break;
2883                         case EXD_PROC_CPU_USER_NSEC:
2884                                 user.tv_nsec =
2885                                     pobject.eo_item.ei_uint64;
2886                                 numfound++;
2887                                 break;
2888                         case EXD_PROC_CPU_SYS_SEC:
2889                                 sys.tv_sec =
2890                                     pobject.eo_item.ei_uint64;
2891                                 numfound++;
2892                                 break;
2893                         case EXD_PROC_CPU_SYS_NSEC:
2894                                 sys.tv_nsec =
2895                                     pobject.eo_item.ei_uint64;
2896                                 numfound++;
2897                                 break;
2898                         case EXD_PROC_FINISH_SEC:
2899                                 finish.tv_sec =
2900                                     pobject.eo_item.ei_uint64;
2901                                 numfound++;
2902                                 break;
2903                         case EXD_PROC_FINISH_NSEC:
2904                                 finish.tv_nsec =
2905                                     pobject.eo_item.ei_uint64;
2906                                 numfound++;
2907                                 break;
2908                         }
2909                         (void) ea_free_item(&pobject, EUP_ALLOC);
2910                 }
2911                 (void) ea_free_item(&object, EUP_ALLOC);
2912                 if (numfound != 9) {
2913                         zsd_warn(gettext(
2914                             "Malformed process accounting entry found"));
2915                         goto proc_done;
2916                 }
2917 
2918                 if (finish.tv_sec > interval_end.tv_sec ||
2919                     (finish.tv_sec == interval_end.tv_sec &&
2920                     finish.tv_nsec > (interval_end.tv_usec * 1000)))
2921                         hrtime_expired = B_TRUE;
2922 
2923                 /*
2924                  * Try to identify the zone and pset to which this
2925                  * exited process belongs.
2926                  */
2927                 if (zone == NULL)
2928                         goto proc_done;
2929 
2930                 /* Save proc info */
2931                 proc->zspr_ppid = ppid;
2932                 proc->zspr_zoneid = zone->zsz_id;
2933 
2934                 prev_psetid = ZS_PSET_ERROR;
2935                 sched = 0;
2936 
2937                 /*
2938                  * The following tries to deduce the processes pset.
2939                  *
2940                  * First choose pset and sched using cached value from the
2941                  * most recent time the process has been seen.
2942                  *
2943                  * pset and sched can change across zone_enter, so make sure
2944                  * most recent sighting of this process was in the same
2945                  * zone before using most recent known value.
2946                  *
2947                  * If there is no known value, use value of processes
2948                  * parent.  If parent is unknown, walk parents until a known
2949                  * parent is found.
2950                  *
2951                  * If no parent in the zone is found, use the zone's default
2952                  * pset and scheduling class.
2953                  */
2954                 if (proc->zspr_psetid != ZS_PSET_ERROR) {
2955                         prev_psetid = proc->zspr_psetid;
2956                         pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2957                         sched = proc->zspr_sched;
2958                 } else if (pproc->zspr_zoneid == zone->zsz_id &&
2959                     pproc->zspr_psetid != ZS_PSET_ERROR) {
2960                         prev_psetid = pproc->zspr_psetid;
2961                         pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2962                         sched = pproc->zspr_sched;
2963                 }
2964 
2965                 if (pset == NULL) {
2966                         /*
2967                          * Process or processes parent has never been seen.
2968                          * Save to deduce a known parent later.
2969                          */
2970                         proc_usage = sys;
2971                         TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2972                         TIMESTRUC_DELTA(delta, proc_usage,
2973                             proc->zspr_usage);
2974                         proc->zspr_usage = delta;
2975                         list_insert_tail(&plist, proc);
2976                         continue;
2977                 }
2978 
2979                 /* Add the zone's usage to the pset */
2980                 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2981                 if (usage == NULL)
2982                         goto proc_done;
2983 
2984                 zsd_mark_pset_usage_found(usage, sched);
2985 
2986                 /* compute the usage to add for the exited proc */
2987                 proc_usage = sys;
2988                 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2989                 TIMESTRUC_DELTA(delta, proc_usage,
2990                     proc->zspr_usage);
2991 
2992                 zsd_add_usage(ctl, usage, &delta);
2993 proc_done:
2994                 zsd_flush_proc_info(proc);
2995 
2996                 if (hrtime_expired == B_TRUE)
2997                         break;
2998         }
2999         /*
3000          * close next accounting file.
3001          */
3002         if (ctl->zsctl_proc_open_next) {
3003                 (void) ea_close(
3004                     &ctl->zsctl_proc_eaf_next);
3005                 ctl->zsctl_proc_open_next = 0;
3006                 ctl->zsctl_proc_fd_next = -1;
3007         }
3008 
3009         /* For the remaining processes, use pset and sched of a known parent */
3010         proc = list_head(&plist);
3011         while (proc != NULL) {
3012                 next = proc;
3013                 for (;;) {
3014                         if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3015                                 /*
3016                                  * Kernel process, or parent is unknown, skip
3017                                  * process, remove from process list.
3018                                  */
3019                                 tmp = proc;
3020                                 proc = list_next(&plist, proc);
3021                                 list_link_init(&tmp->zspr_next);
3022                                 break;
3023                         }
3024                         pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3025                         if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3026                                 /*
3027                                  * Parent in different zone.  Save process and
3028                                  * use zone's default pset and sched below
3029                                  */
3030                                 tmp = proc;
3031                                 proc = list_next(&plist, proc);
3032                                 list_remove(&plist, tmp);
3033                                 list_insert_tail(&pplist, tmp);
3034                                 break;
3035                         }
3036                         /* Parent has unknown pset, Search parent's parent  */
3037                         if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3038                                 next = pproc;
3039                                 continue;
3040                         }
3041                         /* Found parent with known pset.  Use its info */
3042                         proc->zspr_psetid = pproc->zspr_psetid;
3043                         proc->zspr_sched = pproc->zspr_sched;
3044                         next->zspr_psetid = pproc->zspr_psetid;
3045                         next->zspr_sched = pproc->zspr_sched;
3046                         zone = zsd_lookup_zone_byid(ctl,
3047                             proc->zspr_zoneid);
3048                         if (zone == NULL) {
3049                                 tmp = proc;
3050                                 proc = list_next(&plist, proc);
3051                                 list_remove(&plist, tmp);
3052                                 list_link_init(&tmp->zspr_next);
3053                                 break;
3054                         }
3055                         pset = zsd_lookup_pset_byid(ctl,
3056                             proc->zspr_psetid);
3057                         if (pset == NULL) {
3058                                 tmp = proc;
3059                                 proc = list_next(&plist, proc);
3060                                 list_remove(&plist, tmp);
3061                                 list_link_init(&tmp->zspr_next);
3062                                 break;
3063                         }
3064                         /* Add the zone's usage to the pset */
3065                         usage = zsd_lookup_insert_usage(ctl, pset, zone);
3066                         if (usage == NULL) {
3067                                 tmp = proc;
3068                                 proc = list_next(&plist, proc);
3069                                 list_remove(&plist, tmp);
3070                                 list_link_init(&tmp->zspr_next);
3071                                 break;
3072                         }
3073                         zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3074                         zsd_add_usage(ctl, usage, &proc->zspr_usage);
3075                         zsd_flush_proc_info(proc);
3076                         tmp = proc;
3077                         proc = list_next(&plist, proc);
3078                         list_remove(&plist, tmp);
3079                         list_link_init(&tmp->zspr_next);
3080                         break;
3081                 }
3082         }
3083         /*
3084          * Process has never been seen.  Using zone info to
3085          * determine pset and scheduling class.
3086          */
3087         proc = list_head(&pplist);
3088         while (proc != NULL) {
3089 
3090                 zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3091                 if (zone == NULL)
3092                         goto next;
3093                 if (zone->zsz_psetid != ZS_PSET_ERROR &&
3094                     zone->zsz_psetid != ZS_PSET_MULTI) {
3095                         prev_psetid = zone->zsz_psetid;
3096                         pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3097                 } else {
3098                         pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3099                         if (pset != NULL)
3100                                 prev_psetid = pset->zsp_id;
3101                 }
3102                 if (pset == NULL)
3103                         goto next;
3104 
3105                 sched = zone->zsz_scheds;
3106                 /*
3107                  * Ignore FX high scheduling class if it is not the
3108                  * only scheduling class in the zone.
3109                  */
3110                 if (sched != ZS_SCHED_FX_60)
3111                         sched &= (~ZS_SCHED_FX_60);
3112                 /*
3113                  * If more than one scheduling class has been found
3114                  * in the zone, use zone's default scheduling class for
3115                  * this process.
3116                  */
3117                 if ((sched & (sched - 1)) != 0)
3118                         sched = zone->zsz_default_sched;
3119 
3120                 /* Add the zone's usage to the pset */
3121                 usage = zsd_lookup_insert_usage(ctl, pset, zone);
3122                 if (usage == NULL)
3123                         goto next;
3124 
3125                 zsd_mark_pset_usage_found(usage, sched);
3126                 zsd_add_usage(ctl, usage, &proc->zspr_usage);
3127 next:
3128                 tmp = proc;
3129                 proc = list_next(&pplist, proc);
3130                 zsd_flush_proc_info(tmp);
3131                 list_link_init(&tmp->zspr_next);
3132         }
3133         return;
3134 ea_err:
3135         /*
3136          * Close the next accounting file if we have not transitioned to it
3137          * yet.
3138          */
3139         if (ctl->zsctl_proc_open_next) {
3140                 (void) ea_close(&ctl->zsctl_proc_eaf_next);
3141                 ctl->zsctl_proc_open_next = 0;
3142                 ctl->zsctl_proc_fd_next = -1;
3143         }
3144 }
3145 
3146 /*
3147  * getvmusage(2) uses size_t's in the passwd data structure, which differ
3148  * in size for 32bit and 64 bit kernels.  Since this is a contracted interface,
3149  * and zonestatd does not necessarily match the kernel's bitness, marshal
3150  * results appropriately.
3151  */
3152 static int
3153 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3154     uint64_t *nres)
3155 {
3156         zsd_vmusage32_t *vmu32;
3157         zsd_vmusage64_t *vmu64;
3158         uint32_t nres32;
3159         int i;
3160         int ret;
3161 
3162         if (ctl->zsctl_kern_bits == 32)  {
3163                 nres32 = *nres;
3164                 ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3165                     flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3166                 *nres = nres32;
3167                 if (ret == 0 && buf != NULL) {
3168                         /*
3169                          * An array of vmusage32_t's has been returned.
3170                          * Convert it to an array of vmusage64_t's.
3171                          */
3172                         vmu32 = (zsd_vmusage32_t *)buf;
3173                         vmu64 = (zsd_vmusage64_t *)buf;
3174                         for (i = nres32 - 1; i >= 0; i--) {
3175 
3176                                 vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3177                                 vmu64[i].vmu_type = vmu32[i].vmu_type;
3178                                 vmu64[i].vmu_type = vmu32[i].vmu_type;
3179                                 vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3180                                 vmu64[i].vmu_rss_private =
3181                                     vmu32[i].vmu_rss_private;
3182                                 vmu64[i].vmu_rss_shared =
3183                                     vmu32[i].vmu_rss_shared;
3184                                 vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3185                                 vmu64[i].vmu_swap_private =
3186                                     vmu32[i].vmu_swap_private;
3187                                 vmu64[i].vmu_swap_shared =
3188                                     vmu32[i].vmu_swap_shared;
3189                         }
3190                 }
3191                 return (ret);
3192         } else {
3193                 /*
3194                  * kernel is 64 bit, so use 64 bit structures as zonestat
3195                  * expects.
3196                  */
3197                 return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3198                     flags, age, (uintptr_t)buf, (uintptr_t)nres));
3199 
3200         }
3201 }
3202 
3203 /*
3204  * Update the current physical, virtual, and locked memory usage of the
3205  * running zones.
3206  */
3207 static void
3208 zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3209 {
3210 
3211         uint64_t phys_total;
3212         uint64_t phys_used;
3213         uint64_t phys_zones;
3214         uint64_t phys_zones_overcount;
3215         uint64_t phys_zones_extra;
3216         uint64_t phys_zones_credit;
3217 
3218         uint64_t vm_free;
3219         uint64_t vm_used;
3220 
3221         uint64_t disk_swap_total;
3222         uint64_t disk_swap_used;        /* disk swap with contents */
3223 
3224         uint64_t physmem;
3225         uint64_t pp_kernel;
3226         uint64_t arc_size = 0;
3227         struct anoninfo ani;
3228 
3229         int num_swap_devices;
3230         struct swaptable *swt;
3231         struct swapent *swent;
3232         size_t swt_size;
3233         char *path;
3234 
3235         zsd_vmusage64_t *vmusage;
3236         uint64_t num_vmusage;
3237 
3238         int i, ret;
3239 
3240         zsd_system_t *sys;
3241         zsd_zone_t *zone;
3242         int vmu_nzones;
3243 
3244         kstat_t *kstat;
3245         char kstat_name[KSTAT_STRLEN];
3246         kstat_named_t *knp;
3247         kid_t kid;
3248 
3249         if (init)
3250                 return;
3251 
3252         sys = ctl->zsctl_system;
3253 
3254         /* interrogate swap devices to find the amount of disk swap */
3255 disk_swap_again:
3256         num_swap_devices = swapctl(SC_GETNSWP, NULL);
3257 
3258         if (num_swap_devices == 0) {
3259                 sys->zss_swap_total = disk_swap_total = 0;
3260                 sys->zss_swap_used = disk_swap_used = 0;
3261                 /* No disk swap */
3262                 goto disk_swap_done;
3263         }
3264         /* see if swap table needs to be larger */
3265         if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3266                 swt_size = sizeof (int) +
3267                     (num_swap_devices * sizeof (struct swapent)) +
3268                     (num_swap_devices * MAXPATHLEN);
3269                 if (ctl->zsctl_swap_cache != NULL)
3270                         free(ctl->zsctl_swap_cache);
3271 
3272                 swt = (struct swaptable *)malloc(swt_size);
3273                 if (swt == NULL) {
3274                         /*
3275                          * Could not allocate to get list of swap devices.
3276                          * Just use data from the most recent read, which will
3277                          * be zero if this is the first read.
3278                          */
3279                         zsd_warn(gettext("Unable to allocate to determine "
3280                             "virtual memory"));
3281                         disk_swap_total = sys->zss_swap_total;
3282                         disk_swap_used = sys->zss_swap_used;
3283                         goto disk_swap_done;
3284                 }
3285                 swent = swt->swt_ent;
3286                 path = (char *)swt + (sizeof (int) +
3287                     num_swap_devices * sizeof (swapent_t));
3288                 for (i = 0; i < num_swap_devices; i++, swent++) {
3289                         swent->ste_path = path;
3290                         path += MAXPATHLEN;
3291                 }
3292                 swt->swt_n = num_swap_devices;
3293                 ctl->zsctl_swap_cache = swt;
3294                 ctl->zsctl_swap_cache_size = swt_size;
3295                 ctl->zsctl_swap_cache_num = num_swap_devices;
3296         }
3297         num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3298         if (num_swap_devices < 0) {
3299                 /* More swap devices have arrived */
3300                 if (errno == ENOMEM)
3301                         goto disk_swap_again;
3302 
3303                 zsd_warn(gettext("Unable to determine disk swap devices"));
3304                 /* Unexpected error.  Use existing data */
3305                 disk_swap_total = sys->zss_swap_total;
3306                 disk_swap_used = sys->zss_swap_used;
3307                 goto disk_swap_done;
3308         }
3309 
3310         /* add up the disk swap */
3311         disk_swap_total = 0;
3312         disk_swap_used = 0;
3313         swent = ctl->zsctl_swap_cache->swt_ent;
3314         for (i = 0; i < num_swap_devices; i++, swent++) {
3315                 disk_swap_total += swent->ste_pages;
3316                 disk_swap_used += (swent->ste_pages - swent->ste_free);
3317         }
3318         disk_swap_total *= ctl->zsctl_pagesize;
3319         disk_swap_used *= ctl->zsctl_pagesize;
3320 
3321         sys->zss_swap_total = disk_swap_total;
3322         sys->zss_swap_used = disk_swap_used;
3323 
3324 disk_swap_done:
3325 
3326         /* get system pages kstat */
3327         kid = -1;
3328         kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3329         if (kstat == NULL)
3330                 zsd_warn(gettext("Unable to lookup system pages kstat"));
3331         else
3332                 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3333 
3334         if (kid == -1) {
3335                 zsd_warn(gettext("Unable to read system pages kstat"));
3336                 return;
3337         } else {
3338                 knp = kstat_data_lookup(kstat, "physmem");
3339                 if (knp == NULL) {
3340                         zsd_warn(gettext("Unable to read physmem"));
3341                 } else {
3342                         if (knp->data_type == KSTAT_DATA_UINT64)
3343                                 physmem = knp->value.ui64;
3344                         else if (knp->data_type == KSTAT_DATA_UINT32)
3345                                 physmem = knp->value.ui32;
3346                         else
3347                                 return;
3348                 }
3349                 knp = kstat_data_lookup(kstat, "pp_kernel");
3350                 if (knp == NULL) {
3351                         zsd_warn(gettext("Unable to read pp_kernel"));
3352                 } else {
3353                         if (knp->data_type == KSTAT_DATA_UINT64)
3354                                 pp_kernel = knp->value.ui64;
3355                         else if (knp->data_type == KSTAT_DATA_UINT32)
3356                                 pp_kernel = knp->value.ui32;
3357                         else
3358                                 return;
3359                 }
3360         }
3361         physmem *= ctl->zsctl_pagesize;
3362         pp_kernel *= ctl->zsctl_pagesize;
3363 
3364         /* get the zfs arc size if available */
3365         arc_size = 0;
3366         kid = -1;
3367         kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3368         if (kstat != NULL)
3369                 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3370         if (kid != -1) {
3371                 knp = kstat_data_lookup(kstat, "size");
3372                 if (knp != NULL)
3373                         if (knp->data_type == KSTAT_DATA_UINT64)
3374                                 arc_size = knp->value.ui64;
3375         }
3376 
3377         /* Try to get swap information */
3378         if (swapctl(SC_AINFO, &ani) < 0) {
3379                 zsd_warn(gettext("Unable to get swap info"));
3380                 return;
3381         }
3382 
3383 vmusage_again:
3384         /* getvmusage to get physical memory usage */
3385         vmusage = ctl->zsctl_vmusage_cache;
3386         num_vmusage = ctl->zsctl_vmusage_cache_num;
3387 
3388         ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3389             vmusage, &num_vmusage);
3390 
3391         if (ret != 0) {
3392                 /* Unexpected error.  Use existing data */
3393                 if (errno != EOVERFLOW) {
3394                         zsd_warn(gettext(
3395                             "Unable to read physical memory usage"));
3396                         phys_zones = sys->zss_ram_zones;
3397                         goto vmusage_done;
3398                 }
3399         }
3400         /* vmusage results cache too small */
3401         if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3402 
3403                 size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3404 
3405                 if (ctl->zsctl_vmusage_cache != NULL)
3406                         free(ctl->zsctl_vmusage_cache);
3407                 vmusage = (zsd_vmusage64_t *)malloc(size);
3408                 if (vmusage == NULL) {
3409                         zsd_warn(gettext("Unable to alloc to determine "
3410                             "physical memory usage"));
3411                         phys_zones = sys->zss_ram_zones;
3412                         goto vmusage_done;
3413                 }
3414                 ctl->zsctl_vmusage_cache = vmusage;
3415                 ctl->zsctl_vmusage_cache_num = num_vmusage;
3416                 goto vmusage_again;
3417         }
3418 
3419         phys_zones_overcount = 0;
3420         vmu_nzones = 0;
3421         for (i = 0; i < num_vmusage; i++) {
3422                 switch (vmusage[i].vmu_type) {
3423                 case VMUSAGE_SYSTEM:
3424                         /* total pages backing user process mappings */
3425                         phys_zones = sys->zss_ram_zones =
3426                             vmusage[i].vmu_rss_all;
3427                         break;
3428                 case VMUSAGE_ZONE:
3429                         vmu_nzones++;
3430                         phys_zones_overcount += vmusage[i].vmu_rss_all;
3431                         zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3432                         if (zone != NULL)
3433                                 zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3434                         break;
3435                 default:
3436                         break;
3437                 }
3438         }
3439         /*
3440          * Figure how much memory was double counted due to text sharing
3441          * between zones.  Credit this back so that the sum of the zones
3442          * equals the total zone ram usage;
3443          */
3444         phys_zones_extra = phys_zones_overcount - phys_zones;
3445         phys_zones_credit = phys_zones_extra / vmu_nzones;
3446 
3447 vmusage_done:
3448 
3449         /* walk the zones to get swap and locked kstats.  Fetch ram cap. */
3450         sys->zss_locked_zones = 0;
3451         sys->zss_vm_zones = 0;
3452         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3453             zone = list_next(&ctl->zsctl_zones, zone)) {
3454 
3455                 /* If zone halted during interval, show memory usage as none */
3456                 if (zone->zsz_active == B_FALSE ||
3457                     zone->zsz_deleted == B_TRUE) {
3458                         zone->zsz_usage_ram = 0;
3459                         zone->zsz_usage_vm = 0;
3460                         zone->zsz_usage_locked = 0;
3461                         continue;
3462                 }
3463 
3464                 if (phys_zones_credit > 0) {
3465                         if (zone->zsz_usage_ram > phys_zones_credit) {
3466                                 zone->zsz_usage_ram -= phys_zones_credit;
3467                         }
3468                 }
3469                 /*
3470                  * Get zone's swap usage.  Since zone could have halted,
3471                  * treats as zero if cannot read
3472                  */
3473                 zone->zsz_usage_vm = 0;
3474                 (void) snprintf(kstat_name, sizeof (kstat_name),
3475                     "swapresv_zone_%d", zone->zsz_id);
3476                 kid = -1;
3477                 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3478                     zone->zsz_id, kstat_name);
3479                 if (kstat != NULL)
3480                         kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3481                 if (kid != -1) {
3482                         knp = kstat_data_lookup(kstat, "usage");
3483                         if (knp != NULL &&
3484                             knp->data_type == KSTAT_DATA_UINT64) {
3485                                 zone->zsz_usage_vm = knp->value.ui64;
3486                                 sys->zss_vm_zones += knp->value.ui64;
3487                         }
3488                 }
3489                 /*
3490                  * Get zone's locked usage.  Since zone could have halted,
3491                  * treats as zero if cannot read
3492                  */
3493                 zone->zsz_usage_locked = 0;
3494                 (void) snprintf(kstat_name, sizeof (kstat_name),
3495                     "lockedmem_zone_%d", zone->zsz_id);
3496                 kid = -1;
3497                 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3498                     zone->zsz_id, kstat_name);
3499                 if (kstat != NULL)
3500                         kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3501                 if (kid != -1) {
3502                         knp = kstat_data_lookup(kstat, "usage");
3503                         if (knp != NULL &&
3504                             knp->data_type == KSTAT_DATA_UINT64) {
3505                                 zone->zsz_usage_locked = knp->value.ui64;
3506                                 /*
3507                                  * Since locked memory accounting for zones
3508                                  * can double count ddi locked memory, cap each
3509                                  * zone's locked usage at its ram usage.
3510                                  */
3511                                 if (zone->zsz_usage_locked >
3512                                     zone->zsz_usage_ram)
3513                                         zone->zsz_usage_locked =
3514                                             zone->zsz_usage_ram;
3515                                 sys->zss_locked_zones +=
3516                                     zone->zsz_usage_locked;
3517                         }
3518                 }
3519         }
3520 
3521         phys_total =
3522             sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3523 
3524         phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3525             * ctl->zsctl_pagesize;
3526 
3527         /* Compute remaining statistics */
3528         sys->zss_ram_total = phys_total;
3529         sys->zss_ram_zones = phys_zones;
3530         sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3531 
3532         /*
3533          * The total for kernel locked memory should include
3534          * segkp locked pages, but oh well.  The arc size is subtracted,
3535          * as that physical memory is reclaimable.
3536          */
3537         sys->zss_locked_kern = pp_kernel - arc_size;
3538         /* Add memory used by kernel startup and obp to kernel locked */
3539         if ((phys_total - physmem) > 0)
3540                 sys->zss_locked_kern += phys_total - physmem;
3541 
3542         /*
3543          * Add in the portion of (RAM+DISK) that is not available as swap,
3544          * and consider it swap used by the kernel.
3545          */
3546         sys->zss_vm_total = phys_total + disk_swap_total;
3547         vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3548         vm_used = sys->zss_vm_total - vm_free;
3549         sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3550 }
3551 
3552 /*
3553  * Charge each cpu's usage to its processor sets.  Also add the cpu's total
3554  * time to each zone using the processor set.  This tracks the maximum
3555  * amount of cpu time that a zone could have used.
3556  */
3557 static void
3558 zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
3559 {
3560         zsd_system_t *sys;
3561         zsd_zone_t *zone;
3562         zsd_pset_usage_t *usage;
3563         zsd_cpu_t *cpu;
3564         zsd_cpu_t *cpu_next;
3565         zsd_pset_t *pset;
3566         timestruc_t ts;
3567         uint64_t hrtime;
3568         timestruc_t delta;
3569 
3570         /* Update the per-cpu kstat data */
3571         cpu_next = list_head(&ctl->zsctl_cpus);
3572         while (cpu_next != NULL) {
3573                 cpu = cpu_next;
3574                 cpu_next = list_next(&ctl->zsctl_cpus, cpu);
3575                 zsd_update_cpu_stats(ctl, cpu);
3576         }
3577         /* Update the elapsed real time */
3578         hrtime = gethrtime();
3579         if (init) {
3580                 /* first time around, store hrtime for future comparision */
3581                 ctl->zsctl_hrtime = hrtime;
3582                 ctl->zsctl_hrtime_prev = hrtime;
3583 
3584         } else {
3585                 /* Compute increase in hrtime since the most recent read */
3586                 ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
3587                 ctl->zsctl_hrtime = hrtime;
3588                 if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
3589                         TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
3590         }
3591 
3592         /* On initialization, all psets have zero time  */
3593         if (init)
3594                 return;
3595 
3596         for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
3597             pset = list_next(&ctl->zsctl_psets, pset)) {
3598 
3599                 if (pset->zsp_active == B_FALSE) {
3600                         zsd_warn(gettext("Internal error,inactive pset found"));
3601                         continue;
3602                 }
3603 
3604                 /* sum total used time for pset */
3605                 ts.tv_sec = 0;
3606                 ts.tv_nsec = 0;
3607                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
3608                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
3609                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
3610                 /* kernel time in pset is total time minus zone time */
3611                 TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
3612                     pset->zsp_usage_zones);
3613                 if (pset->zsp_usage_kern.tv_sec < 0 ||
3614                     pset->zsp_usage_kern.tv_nsec < 0) {
3615                         pset->zsp_usage_kern.tv_sec = 0;
3616                         pset->zsp_usage_kern.tv_nsec = 0;
3617                 }
3618                 /* Total pset elapsed time is used time plus idle time */
3619                 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);
3620 
3621                 TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);
3622 
3623                 for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
3624                     usage = list_next(&pset->zsp_usage_list, usage)) {
3625 
3626                         zone = usage->zsu_zone;
3627                         if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
3628                             usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
3629                             usage->zsu_cpu_shares != 0) {
3630                                 /*
3631                                  * Figure out how many nanoseconds of share time
3632                                  * to give to the zone
3633                                  */
3634                                 hrtime = delta.tv_sec;
3635                                 hrtime *= NANOSEC;
3636                                 hrtime += delta.tv_nsec;
3637                                 hrtime *= usage->zsu_cpu_shares;
3638                                 hrtime /= pset->zsp_cpu_shares;
3639                                 TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
3640                                     hrtime);
3641                         }
3642                         /* Add pset time to each zone using pset */
3643                         TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);
3644 
3645                         zone->zsz_cpus_online += pset->zsp_online;
3646                 }
3647                 pset->zsp_total_time = ts;
3648         }
3649 
3650         for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3651             zone = list_next(&ctl->zsctl_zones, zone)) {
3652 
3653                 /* update cpu cap tracking if the zone has a cpu cap */
3654                 if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
3655                         uint64_t elapsed;
3656 
3657                         elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
3658                         elapsed *= zone->zsz_cpu_cap;
3659                         elapsed = elapsed / 100;
3660                         TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
3661                 }
3662         }
3663         sys = ctl->zsctl_system;
3664         ts.tv_sec = 0;
3665         ts.tv_nsec = 0;
3666         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
3667         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
3668         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);
3669 
3670         /* kernel time in pset is total time minus zone time */
3671         TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
3672             sys->zss_cpu_usage_zones);
3673         if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
3674             sys->zss_cpu_usage_kern.tv_nsec < 0) {
3675                 sys->zss_cpu_usage_kern.tv_sec = 0;
3676                 sys->zss_cpu_usage_kern.tv_nsec = 0;
3677         }
3678         /* Total pset elapsed time is used time plus idle time */
3679         TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
3680         sys->zss_cpu_total_time = ts;
3681 }
3682 
3683 /*
3684  * Saves current usage data to a cache that is read by libzonestat when
3685  * calling zs_usage_read().
3686  *
3687  * All pointers in the cached data structure are set to NULL.  When
3688  * libzonestat reads the cached data, it will set the pointers relative to
3689  * its address space.
3690  */
3691 static void
3692 zsd_usage_cache_update(zsd_ctl_t *ctl)
3693 {
3694         zs_usage_cache_t *cache;
3695         zs_usage_cache_t *old;
3696         zs_usage_t *usage;
3697 
3698         zs_system_t *sys;
3699         zsd_system_t *dsys;
3700         zs_zone_t *zone = NULL;
3701         zsd_zone_t *dzone;
3702         zs_pset_t *pset = NULL;
3703         zsd_pset_t *dpset;
3704         zs_pset_zone_t *pusage;
3705         zsd_pset_usage_t *dpusage;
3706 
3707         char *next;
3708         uint_t size, i, j;
3709 
3710         size =
3711             sizeof (zs_usage_cache_t) +
3712             sizeof (zs_usage_t) +
3713             sizeof (zs_system_t) +
3714             sizeof (zs_zone_t) * ctl->zsctl_nzones +
3715             sizeof (zs_pset_t) *  ctl->zsctl_npsets +
3716             sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3717 
3718         cache = (zs_usage_cache_t *)malloc(size);
3719         if (cache == NULL) {
3720                 zsd_warn(gettext("Unable to allocate usage cache\n"));
3721                 return;
3722         }
3723 
3724         next = (char *)cache;
3725         cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3726         next += sizeof (zs_usage_cache_t);
3727 
3728         /* LINTED */
3729         usage = cache->zsuc_usage = (zs_usage_t *)next;
3730         next += sizeof (zs_usage_t);
3731         usage->zsu_start = g_start;
3732         usage->zsu_hrstart = g_hrstart;
3733         usage->zsu_time = g_now;
3734         usage->zsu_hrtime = g_hrnow;
3735         usage->zsu_nzones = ctl->zsctl_nzones;
3736         usage->zsu_npsets = ctl->zsctl_npsets;
3737         usage->zsu_system = NULL;
3738 
3739         /* LINTED */
3740         sys = (zs_system_t *)next;
3741         next += sizeof (zs_system_t);
3742         dsys = ctl->zsctl_system;
3743         sys->zss_ram_total = dsys->zss_ram_total;
3744         sys->zss_ram_kern = dsys->zss_ram_kern;
3745         sys->zss_ram_zones = dsys->zss_ram_zones;
3746         sys->zss_locked_kern = dsys->zss_locked_kern;
3747         sys->zss_locked_zones = dsys->zss_locked_zones;
3748         sys->zss_vm_total = dsys->zss_vm_total;
3749         sys->zss_vm_kern = dsys->zss_vm_kern;
3750         sys->zss_vm_zones = dsys->zss_vm_zones;
3751         sys->zss_swap_total = dsys->zss_swap_total;
3752         sys->zss_swap_used = dsys->zss_swap_used;
3753         sys->zss_ncpus = dsys->zss_ncpus;
3754         sys->zss_ncpus_online = dsys->zss_ncpus_online;
3755 
3756         sys->zss_processes_max = dsys->zss_maxpid;
3757         sys->zss_lwps_max = dsys->zss_lwps_max;
3758         sys->zss_shm_max = dsys->zss_shm_max;
3759         sys->zss_shmids_max = dsys->zss_shmids_max;
3760         sys->zss_semids_max = dsys->zss_semids_max;
3761         sys->zss_msgids_max = dsys->zss_msgids_max;
3762         sys->zss_lofi_max = dsys->zss_lofi_max;
3763 
3764         sys->zss_processes = dsys->zss_processes;
3765         sys->zss_lwps = dsys->zss_lwps;
3766         sys->zss_shm = dsys->zss_shm;
3767         sys->zss_shmids = dsys->zss_shmids;
3768         sys->zss_semids = dsys->zss_semids;
3769         sys->zss_msgids = dsys->zss_msgids;
3770         sys->zss_lofi = dsys->zss_lofi;
3771 
3772         sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3773         sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3774         sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
3775 
3776         for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3777             i < ctl->zsctl_nzones;
3778             i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3779                 /* LINTED */
3780                 zone = (zs_zone_t *)next;
3781                 next += sizeof (zs_zone_t);
3782                 list_link_init(&zone->zsz_next);
3783                 zone->zsz_system = NULL;
3784 
3785                 (void) strlcpy(zone->zsz_name, dzone->zsz_name,
3786                     sizeof (zone->zsz_name));
3787                 (void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3788                     sizeof (zone->zsz_pool));
3789                 (void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3790                     sizeof (zone->zsz_pset));
3791                 zone->zsz_id = dzone->zsz_id;
3792                 zone->zsz_cputype = dzone->zsz_cputype;
3793                 zone->zsz_iptype = dzone->zsz_iptype;
3794                 zone->zsz_start = dzone->zsz_start;
3795                 zone->zsz_hrstart = dzone->zsz_hrstart;
3796                 zone->zsz_scheds = dzone->zsz_scheds;
3797                 zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3798                 zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3799                 zone->zsz_ram_cap = dzone->zsz_ram_cap;
3800                 zone->zsz_vm_cap = dzone->zsz_vm_cap;
3801                 zone->zsz_locked_cap = dzone->zsz_locked_cap;
3802                 zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3803                 zone->zsz_cpus_online = dzone->zsz_cpus_online;
3804                 zone->zsz_pset_time = dzone->zsz_pset_time;
3805                 zone->zsz_cap_time = dzone->zsz_cap_time;
3806                 zone->zsz_share_time = dzone->zsz_share_time;
3807                 zone->zsz_usage_ram = dzone->zsz_usage_ram;
3808                 zone->zsz_usage_locked = dzone->zsz_usage_locked;
3809                 zone->zsz_usage_vm = dzone->zsz_usage_vm;
3810 
3811                 zone->zsz_processes_cap = dzone->zsz_processes_cap;
3812                 zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3813                 zone->zsz_shm_cap = dzone->zsz_shm_cap;
3814                 zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3815                 zone->zsz_semids_cap = dzone->zsz_semids_cap;
3816                 zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3817                 zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3818 
3819                 zone->zsz_processes = dzone->zsz_processes;
3820                 zone->zsz_lwps = dzone->zsz_lwps;
3821                 zone->zsz_shm = dzone->zsz_shm;
3822                 zone->zsz_shmids = dzone->zsz_shmids;
3823                 zone->zsz_semids = dzone->zsz_semids;
3824                 zone->zsz_msgids = dzone->zsz_msgids;
3825                 zone->zsz_lofi = dzone->zsz_lofi;
3826         }
3827 
3828         for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3829             i < ctl->zsctl_npsets;
3830             i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3831                 /* LINTED */
3832                 pset = (zs_pset_t *)next;
3833                 next += sizeof (zs_pset_t);
3834                 list_link_init(&pset->zsp_next);
3835                 (void) strlcpy(pset->zsp_name, dpset->zsp_name,
3836                     sizeof (pset->zsp_name));
3837                 pset->zsp_id = dpset->zsp_id;
3838                 pset->zsp_cputype = dpset->zsp_cputype;
3839                 pset->zsp_start = dpset->zsp_start;
3840                 pset->zsp_hrstart = dpset->zsp_hrstart;
3841                 pset->zsp_online = dpset->zsp_online;
3842                 pset->zsp_size = dpset->zsp_size;
3843                 pset->zsp_min = dpset->zsp_min;
3844                 pset->zsp_max = dpset->zsp_max;
3845                 pset->zsp_importance = dpset->zsp_importance;
3846                 pset->zsp_scheds = dpset->zsp_scheds;
3847                 pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3848                 pset->zsp_total_time = dpset->zsp_total_time;
3849                 pset->zsp_usage_kern = dpset->zsp_usage_kern;
3850                 pset->zsp_usage_zones = dpset->zsp_usage_zones;
3851                 pset->zsp_nusage = dpset->zsp_nusage;
3852                 /* Add pset usages for pset */
3853                 for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3854                     j < dpset->zsp_nusage;
3855                     j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3856                         /* LINTED */
3857                         pusage = (zs_pset_zone_t *)next;
3858                         next += sizeof (zs_pset_zone_t);
3859                         /* pointers are computed by client */
3860                         pusage->zspz_pset = NULL;
3861                         pusage->zspz_zone = NULL;
3862                         list_link_init(&pusage->zspz_next);
3863                         pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3864                         pusage->zspz_start = dpusage->zsu_start;
3865                         pusage->zspz_hrstart = dpusage->zsu_hrstart;
3866                         pusage->zspz_hrstart = dpusage->zsu_hrstart;
3867                         pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3868                         pusage->zspz_scheds = dpusage->zsu_scheds;
3869                         pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
3870                 }
3871         }
3872 
3873         /* Update the current cache pointer */
3874         (void) mutex_lock(&g_usage_cache_lock);
3875                 old = g_usage_cache;
3876                 cache->zsuc_ref = 1;
3877                 cache->zsuc_gen = g_gen_next;
3878                 usage->zsu_gen = g_gen_next;
3879                 usage->zsu_size = size;
3880                 g_usage_cache = cache;
3881                 if (old != NULL) {
3882                         old->zsuc_ref--;
3883                         if (old->zsuc_ref == 0)
3884                                 free(old);
3885                 }
3886                 g_gen_next++;
3887         /* Wake up any clients that are waiting for this calculation */
3888         if (g_usage_cache_kickers > 0) {
3889                 (void) cond_broadcast(&g_usage_cache_wait);
3890         }
3891         (void) mutex_unlock(&g_usage_cache_lock);
3892 }
3893 
3894 static zs_usage_cache_t *
3895 zsd_usage_cache_hold_locked()
3896 {
3897         zs_usage_cache_t *ret;
3898 
3899         ret = g_usage_cache;
3900         ret->zsuc_ref++;
3901         return (ret);
3902 }
3903 
3904 void
3905 zsd_usage_cache_rele(zs_usage_cache_t *cache)
3906 {
3907         (void) mutex_lock(&g_usage_cache_lock);
3908         cache->zsuc_ref--;
3909         if (cache->zsuc_ref == 0)
3910                 free(cache);
3911         (void) mutex_unlock(&g_usage_cache_lock);
3912 }
3913 
3914 /* Close the handles held by zsd_open() */
3915 void
3916 zsd_close(zsd_ctl_t *ctl)
3917 {
3918         zsd_zone_t *zone;
3919         zsd_pset_t *pset;
3920         zsd_pset_usage_t *usage;
3921         zsd_cpu_t *cpu;
3922         int id;
3923 
3924         if (ctl->zsctl_kstat_ctl) {
3925                 (void) kstat_close(ctl->zsctl_kstat_ctl);
3926                 ctl->zsctl_kstat_ctl = NULL;
3927         }
3928         if (ctl->zsctl_proc_open) {
3929                 (void) ea_close(&ctl->zsctl_proc_eaf);
3930                 ctl->zsctl_proc_open = 0;
3931                 ctl->zsctl_proc_fd = -1;
3932         }
3933         if (ctl->zsctl_pool_conf) {
3934                 if (ctl->zsctl_pool_status == POOL_ENABLED)
3935                         (void) pool_conf_close(ctl->zsctl_pool_conf);
3936                 ctl->zsctl_pool_status = POOL_DISABLED;
3937         }
3938 
3939         while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3940                 list_remove(&ctl->zsctl_zones, zone);
3941                 free(zone);
3942                 ctl->zsctl_nzones--;
3943         }
3944 
3945         while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3946                 while ((usage = list_head(&pset->zsp_usage_list))
3947                     != NULL) {
3948                         list_remove(&pset->zsp_usage_list, usage);
3949                         ctl->zsctl_npset_usages--;
3950                         free(usage);
3951                 }
3952                 list_remove(&ctl->zsctl_psets, pset);
3953                 free(pset);
3954                 ctl->zsctl_npsets--;
3955         }
3956 
3957         /* Release all cpus being tracked */
3958         while (cpu = list_head(&ctl->zsctl_cpus)) {
3959                 list_remove(&ctl->zsctl_cpus, cpu);
3960                 id = cpu->zsc_id;
3961                 bzero(cpu, sizeof (zsd_cpu_t));
3962                 cpu->zsc_id = id;
3963                 cpu->zsc_allocated = B_FALSE;
3964                 cpu->zsc_psetid = ZS_PSET_ERROR;
3965                 cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3966         }
3967 
3968         assert(ctl->zsctl_npset_usages == 0);
3969         assert(ctl->zsctl_npsets == 0);
3970         assert(ctl->zsctl_nzones == 0);
3971         (void) zsd_disable_cpu_stats();
3972 }
3973 
3974 
3975 /*
3976  * Update the utilization data for all zones and processor sets.
3977  */
3978 static int
3979 zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
3980 {
3981         (void) kstat_chain_update(ctl->zsctl_kstat_ctl);
3982         (void) gettimeofday(&(ctl->zsctl_timeofday), NULL);
3983 
3984         zsd_refresh_system(ctl);
3985 
3986         /*
3987          * Memory calculation is expensive.  Only update it on sample
3988          * intervals.
3989          */
3990         if (do_memory == B_TRUE)
3991                 zsd_refresh_memory(ctl, init);
3992         zsd_refresh_zones(ctl);
3993         zsd_refresh_psets(ctl);
3994         zsd_refresh_procs(ctl, init);
3995         zsd_refresh_cpu_stats(ctl, init);
3996 
3997         /*
3998          * Delete objects that no longer exist.
3999          * Pset usages must be deleted first as they point to zone and
4000          * pset objects.
4001          */
4002         zsd_mark_pset_usages_end(ctl);
4003         zsd_mark_psets_end(ctl);
4004         zsd_mark_cpus_end(ctl);
4005         zsd_mark_zones_end(ctl);
4006 
4007         /*
4008          * Save results for clients.
4009          */
4010         zsd_usage_cache_update(ctl);
4011 
4012         /*
4013          * Roll process accounting file.
4014          */
4015         (void) zsd_roll_exacct();
4016         return (0);
4017 }
4018 
4019 /*
4020  * Get the system rctl, which is the upper most limit
4021  */
4022 static uint64_t
4023 zsd_get_system_rctl(char *name)
4024 {
4025         rctlblk_t *rblk, *rblk_last;
4026 
4027         rblk = (rctlblk_t *)alloca(rctlblk_size());
4028         rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4029 
4030         if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4031                 return (ZS_LIMIT_NONE);
4032 
4033         while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4034                 (void) bcopy(rblk, rblk_last, rctlblk_size());
4035 
4036         return (rctlblk_get_value(rblk_last));
4037 }
4038 
4039 /*
4040  * Open any necessary subsystems for collecting utilization data,
4041  * allocate and initialize data structures, and get initial utilization.
4042  *
4043  * Errors:
4044  *      ENOMEM  out of memory
4045  *      EINVAL  other error
4046  */
4047 static zsd_ctl_t *
4048 zsd_open(zsd_ctl_t *ctl)
4049 {
4050         zsd_system_t *system;
4051 
4052         char path[MAXPATHLEN];
4053         long pathmax;
4054         struct statvfs svfs;
4055         int ret;
4056         int i;
4057         size_t size;
4058         int err;
4059 
4060         if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4061             sizeof (zsd_ctl_t))) == NULL) {
4062                         zsd_warn(gettext("Out of Memory"));
4063                         errno = ENOMEM;
4064                         goto err;
4065         }
4066         ctl->zsctl_proc_fd = -1;
4067 
4068         /* open kstats */
4069         if (ctl->zsctl_kstat_ctl == NULL &&
4070             (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4071                 err = errno;
4072                 zsd_warn(gettext("Unable to open kstats"));
4073                 errno = err;
4074                 if (errno != ENOMEM)
4075                         errno = EAGAIN;
4076                 goto err;
4077         }
4078 
4079         /*
4080          * These are set when the accounting file is opened by
4081          * zsd_update_procs()
4082          */
4083         ctl->zsctl_proc_fd = -1;
4084         ctl->zsctl_proc_fd_next = -1;
4085         ctl->zsctl_proc_open = 0;
4086         ctl->zsctl_proc_open_next = 0;
4087 
4088 check_exacct:
4089         (void) zsd_enable_cpu_stats();
4090 
4091         /* Create structures to track usage */
4092         if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4093             calloc(1, sizeof (zsd_system_t))) == NULL) {
4094                 ret = -1;
4095                 zsd_warn(gettext("Out of Memory"));
4096                 errno = ENOMEM;
4097                 goto err;
4098         }
4099         system = ctl->zsctl_system;
4100         /* get the kernel bitness to know structure layout for getvmusage */
4101         ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4102         if (ret < 0)
4103                 ctl->zsctl_kern_bits = 32;
4104         else
4105                 ctl->zsctl_kern_bits = 64;
4106         ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4107 
4108         size = sysconf(_SC_CPUID_MAX);
4109         ctl->zsctl_maxcpuid = size;
4110         if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4111             (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4112                 zsd_warn(gettext("Out of Memory"));
4113                 errno = ENOMEM;
4114                 goto err;
4115         }
4116         for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4117                 ctl->zsctl_cpu_array[i].zsc_id = i;
4118                 ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4119                 ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4120                 ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4121         }
4122         if (statvfs("/proc", &svfs) != 0 ||
4123             strcmp("/proc", svfs.f_fstr) != 0) {
4124                 zsd_warn(gettext("/proc not a procfs filesystem"));
4125                 errno = EINVAL;
4126                 goto err;
4127         }
4128 
4129         size = sysconf(_SC_MAXPID) + 1;
4130         ctl->zsctl_maxproc = size;
4131         if (ctl->zsctl_proc_array == NULL &&
4132             (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4133             sizeof (zsd_proc_t))) == NULL) {
4134                 zsd_warn(gettext("Out of Memory"));
4135                 errno = ENOMEM;
4136                 goto err;
4137         }
4138         for (i = 0; i <= ctl->zsctl_maxproc; i++) {
4139                 list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4140                 ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4141                 ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4142                 ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4143                 ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4144                 ctl->zsctl_proc_array[i].zspr_ppid = -1;
4145         }
4146 
4147         list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4148             offsetof(zsd_zone_t, zsz_next));
4149 
4150         list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4151             offsetof(zsd_pset_t, zsp_next));
4152 
4153         list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4154             offsetof(zsd_cpu_t, zsc_next));
4155 
4156         pathmax = pathconf("/proc", _PC_NAME_MAX);
4157         if (pathmax < 0) {
4158                 zsd_warn(gettext("Unable to determine max path of /proc"));
4159                 errno = EINVAL;
4160                 goto err;
4161         }
4162         size = sizeof (struct dirent) + pathmax + 1;
4163 
4164         ctl->zsctl_procfs_dent_size = size;
4165         if (ctl->zsctl_procfs_dent == NULL &&
4166             (ctl->zsctl_procfs_dent = (struct dirent *)calloc(1, size))
4167             == NULL) {
4168                 zsd_warn(gettext("Out of Memory"));
4169                 errno = ENOMEM;
4170                 goto err;
4171         }
4172 
4173         if (ctl->zsctl_pool_conf == NULL &&
4174             (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4175                 zsd_warn(gettext("Out of Memory"));
4176                 errno = ENOMEM;
4177                 goto err;
4178         }
4179         ctl->zsctl_pool_status = POOL_DISABLED;
4180         ctl->zsctl_pool_changed = 0;
4181 
4182         if (ctl->zsctl_pool_vals[0] == NULL &&
4183             (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4184                 zsd_warn(gettext("Out of Memory"));
4185                 errno = ENOMEM;
4186                 goto err;
4187         }
4188         if (ctl->zsctl_pool_vals[1] == NULL &&
4189             (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4190                 zsd_warn(gettext("Out of Memory"));
4191                 errno = ENOMEM;
4192                 goto err;
4193         }
4194         ctl->zsctl_pool_vals[2] = NULL;
4195 
4196         /*
4197          * get system limits
4198          */
4199         system->zss_maxpid = size = sysconf(_SC_MAXPID);
4200         system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4201         system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4202         system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4203         system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4204         system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4205         system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4206         system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4207 
4208         g_gen_next = 1;
4209 
4210         if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4211                 zsd_warn(gettext("Reading zone statistics failed"));
4212 
4213         return (ctl);
4214 err:
4215         if (ctl)
4216                 zsd_close(ctl);
4217 
4218         return (NULL);
4219 }
4220 
4221 /* Copy utilization data to buffer, filtering data if non-global zone. */
4222 static void
4223 zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
4224     boolean_t is_gz)
4225 {
4226         zs_usage_t *cusage;
4227         zs_system_t *sys, *csys;
4228         zs_zone_t *zone, *czone;
4229         zs_pset_t *pset, *cpset;
4230         zs_pset_zone_t *pz, *cpz, *foundpz;
4231         size_t size = 0, csize = 0;
4232         char *start, *cstart;
4233         int i, j;
4234         timestruc_t delta;
4235 
4236         /* Privileged users in the global zone get everything */
4237         if (is_gz) {
4238                 cusage = cache->zsuc_usage;
4239                 (void) bcopy(cusage, usage, cusage->zsu_size);
4240                 return;
4241         }
4242 
4243         /* Zones just get their own usage */
4244         cusage = cache->zsuc_usage;
4245 
4246         start = (char *)usage;
4247         cstart = (char *)cusage;
4248         size += sizeof (zs_usage_t);
4249         csize += sizeof (zs_usage_t);
4250 
4251         usage->zsu_start = cusage->zsu_start;
4252         usage->zsu_hrstart = cusage->zsu_hrstart;
4253         usage->zsu_time = cusage->zsu_time;
4254         usage->zsu_hrtime = cusage->zsu_hrtime;
4255         usage->zsu_gen = cusage->zsu_gen;
4256         usage->zsu_nzones = 1;
4257         usage->zsu_npsets = 0;
4258 
4259         /* LINTED */
4260         sys = (zs_system_t *)(start + size);
4261         /* LINTED */
4262         csys = (zs_system_t *)(cstart + csize);
4263         size += sizeof (zs_system_t);
4264         csize += sizeof (zs_system_t);
4265 
4266         /* Save system limits but not usage */
4267         *sys = *csys;
4268         sys->zss_ncpus = 0;
4269         sys->zss_ncpus_online = 0;
4270 
4271         /* LINTED */
4272         zone = (zs_zone_t *)(start + size);
4273         /* LINTED */
4274         czone = (zs_zone_t *)(cstart + csize);
4275         /* Find the matching zone */
4276         for (i = 0; i < cusage->zsu_nzones; i++) {
4277                 if (czone->zsz_id == zid) {
4278                         *zone = *czone;
4279                         size += sizeof (zs_zone_t);
4280                 }
4281                 csize += sizeof (zs_zone_t);
4282                 /* LINTED */
4283                 czone = (zs_zone_t *)(cstart + csize);
4284         }
4285         sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
4286         sys->zss_ram_zones = zone->zsz_usage_ram;
4287 
4288         sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
4289         sys->zss_vm_zones = zone->zsz_usage_vm;
4290 
4291         sys->zss_locked_kern += (sys->zss_locked_zones -
4292             zone->zsz_usage_locked);
4293         sys->zss_locked_zones = zone->zsz_usage_locked;
4294 
4295         TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
4296         TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
4297         sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;
4298 
4299         /* LINTED */
4300         pset = (zs_pset_t *)(start + size);
4301         /* LINTED */
4302         cpset = (zs_pset_t *)(cstart + csize);
4303         for (i = 0; i < cusage->zsu_npsets; i++) {
4304                 csize += sizeof (zs_pset_t);
4305                 /* LINTED */
4306                 cpz = (zs_pset_zone_t *)(csize + cstart);
4307                 foundpz = NULL;
4308                 for (j = 0; j < cpset->zsp_nusage; j++) {
4309                         if (cpz->zspz_zoneid == zid)
4310                                 foundpz = cpz;
4311 
4312                         csize += sizeof (zs_pset_zone_t);
4313                         /* LINTED */
4314                         cpz = (zs_pset_zone_t *)(csize + cstart);
4315                 }
4316                 if (foundpz != NULL) {
4317                         size += sizeof (zs_pset_t);
4318                         /* LINTED */
4319                         pz = (zs_pset_zone_t *)(start + size);
4320                         size += sizeof (zs_pset_zone_t);
4321 
4322                         *pset = *cpset;
4323                         *pz = *foundpz;
4324 
4325                         TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
4326                             pz->zspz_cpu_usage);
4327                         TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
4328                         pset->zsp_usage_zones = pz->zspz_cpu_usage;
4329                         pset->zsp_nusage = 1;
4330                         usage->zsu_npsets++;
4331                         sys->zss_ncpus += pset->zsp_size;
4332                         sys->zss_ncpus_online += pset->zsp_online;
4333                 }
4334                 /* LINTED */
4335                 cpset = (zs_pset_t *)(cstart + csize);
4336         }
4337         usage->zsu_size = size;
4338 }
4339 
4340 /*
4341  * Respond to new connections from libzonestat.so.  Also respond to zoneadmd,
4342  * which reports new zones.
4343  */
4344 /* ARGSUSED */
4345 static void
4346 zsd_server(void *cookie, char *argp, size_t arg_size,
4347     door_desc_t *dp, uint_t n_desc)
4348 {
4349         int *args, cmd;
4350         door_desc_t door;
4351         ucred_t *ucred;
4352         const priv_set_t *eset;
4353 
4354         if (argp == DOOR_UNREF_DATA) {
4355                 (void) door_return(NULL, 0, NULL, 0);
4356                 thr_exit(NULL);
4357         }
4358 
4359         if (arg_size != sizeof (cmd) * 2) {
4360                 (void) door_return(NULL, 0, NULL, 0);
4361                 thr_exit(NULL);
4362         }
4363 
4364         /* LINTED */
4365         args = (int *)argp;
4366         cmd = args[0];
4367 
4368         /* If connection, return door to stat server */
4369         if (cmd == ZSD_CMD_CONNECT) {
4370 
4371                 /* Verify client compilation version */
4372                 if (args[1] != ZS_VERSION) {
4373                         args[1] = ZSD_STATUS_VERSION_MISMATCH;
4374                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4375                         thr_exit(NULL);
4376                 }
4377                 ucred = alloca(ucred_size());
4378                 /* Verify client permission */
4379                 if (door_ucred(&ucred) != 0) {
4380                         args[1] = ZSD_STATUS_INTERNAL_ERROR;
4381                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4382                         thr_exit(NULL);
4383                 }
4384 
4385                 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4386                 if (eset == NULL) {
4387                         args[1] = ZSD_STATUS_INTERNAL_ERROR;
4388                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4389                         thr_exit(NULL);
4390                 }
4391                 if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4392                         args[1] = ZSD_STATUS_PERMISSION;
4393                         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4394                         thr_exit(NULL);
4395                 }
4396 
4397                 /* Return stat server door */
4398                 args[1] = ZSD_STATUS_OK;
4399                 door.d_attributes = DOOR_DESCRIPTOR;
4400                 door.d_data.d_desc.d_descriptor = g_stat_door;
4401                 (void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4402                 thr_exit(NULL);
4403         }
4404 
4405         /* Respond to zoneadmd informing zonestatd of a new zone */
4406         if (cmd == ZSD_CMD_NEW_ZONE) {
4407                 zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4408                 (void) door_return(NULL, 0, NULL, 0);
4409                 thr_exit(NULL);
4410         }
4411 
4412         args[1] = ZSD_STATUS_INTERNAL_ERROR;
4413         (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4414         thr_exit(NULL);
4415 }
4416 
4417 /*
4418  * Respond to libzonestat.so clients with the current utlilzation data.
4419  */
4420 /* ARGSUSED */
4421 static void
4422 zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4423     door_desc_t *dp, uint_t n_desc)
4424 {
4425         uint64_t *args, cmd;
4426         zs_usage_cache_t *cache;
4427         int ret;
4428         char *rvalp;
4429         size_t rvals;
4430         zs_usage_t *usage;
4431         ucred_t *ucred;
4432         zoneid_t zoneid;
4433         const priv_set_t *eset;
4434         boolean_t is_gz = B_FALSE;
4435 
4436         /* Tell stat thread there are no more clients */
4437         if (argp == DOOR_UNREF_DATA) {
4438                 (void) mutex_lock(&g_usage_cache_lock);
4439                 g_hasclient = B_FALSE;
4440                 (void) cond_signal(&g_usage_cache_kick);
4441                 (void) mutex_unlock(&g_usage_cache_lock);
4442                 (void) door_return(NULL, 0, NULL, 0);
4443                 thr_exit(NULL);
4444         }
4445         if (arg_size != sizeof (cmd) * 2) {
4446                 (void) door_return(NULL, 0, NULL, 0);
4447                 thr_exit(NULL);
4448         }
4449         /* LINTED */
4450         args = (uint64_t *)argp;
4451         cmd = args[0];
4452         if (cmd != ZSD_CMD_READ) {
4453                 (void) door_return(NULL, 0, NULL, 0);
4454                 thr_exit(NULL);
4455         }
4456         ucred = alloca(ucred_size());
4457         if (door_ucred(&ucred) != 0) {
4458                 (void) door_return(NULL, 0, NULL, 0);
4459                 thr_exit(NULL);
4460         }
4461         zoneid = ucred_getzoneid(ucred);
4462 
4463         if (zoneid == GLOBAL_ZONEID)
4464                 is_gz = B_TRUE;
4465 
4466         eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4467         if (eset == NULL) {
4468                 (void) door_return(NULL, 0, NULL, 0);
4469                 thr_exit(NULL);
4470         }
4471         if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4472                 (void) door_return(NULL, 0, NULL, 0);
4473                 thr_exit(NULL);
4474         }
4475         (void) mutex_lock(&g_usage_cache_lock);
4476         g_hasclient = B_TRUE;
4477 
4478         /*
4479          * Force a new cpu calculation for client.  This will force a
4480          * new memory calculation if the memory data is older than the
4481          * sample period.
4482          */
4483         g_usage_cache_kickers++;
4484         (void) cond_signal(&g_usage_cache_kick);
4485         ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4486         g_usage_cache_kickers--;
4487         if (ret != 0 && errno == EINTR) {
4488                 (void) mutex_unlock(&g_usage_cache_lock);
4489                 zsd_warn(gettext(
4490                     "Interrupted before writing usage size to client\n"));
4491                 (void) door_return(NULL, 0, NULL, 0);
4492                 thr_exit(NULL);
4493         }
4494         cache = zsd_usage_cache_hold_locked();
4495         if (cache == NULL) {
4496                 zsd_warn(gettext("Usage cache empty.\n"));
4497                 (void) door_return(NULL, 0, NULL, 0);
4498                 thr_exit(NULL);
4499         }
4500         (void) mutex_unlock(&g_usage_cache_lock);
4501 
4502         /* Copy current usage data to stack to send to client */
4503         usage = (zs_usage_t *)alloca(cache->zsuc_size);
4504 
4505         /* Filter out results if caller is non-global zone */
4506         zsd_usage_filter(zoneid, cache, usage, is_gz);
4507 
4508         rvalp = (void *)usage;
4509         rvals = usage->zsu_size;
4510         zsd_usage_cache_rele(cache);
4511 
4512         (void) door_return(rvalp, rvals, 0, NULL);
4513         thr_exit(NULL);
4514 }
4515 
4516 static volatile boolean_t g_quit;
4517 
4518 /* ARGSUSED */
4519 static void
4520 zonestat_quithandler(int sig)
4521 {
4522         g_quit = B_TRUE;
4523 }
4524 
4525 /*
4526  * The stat thread generates new utilization data when clients request
4527  * it.  It also manages opening and closing the subsystems used to gather
4528  * data depending on if clients exist.
4529  */
4530 /* ARGSUSED */
4531 void *
4532 stat_thread(void *arg)
4533 {
4534         time_t start;
4535         time_t now;
4536         time_t next_memory;
4537         boolean_t do_memory;
4538         boolean_t do_read;
4539         boolean_t do_close;
4540 
4541         start = time(NULL);
4542         if (start < 0) {
4543                 if (g_quit == B_TRUE)
4544                         goto quit;
4545                 zsd_warn(gettext("Unable to fetch current time"));
4546                 g_quit = B_TRUE;
4547                 goto quit;
4548         }
4549 
4550         next_memory = start;
4551         while (g_quit == B_FALSE) {
4552                 for (;;) {
4553                         /*
4554                          * These are used to decide if the most recent memory
4555                          * calculation was within a sample interval,
4556                          * and weather or not the usage collection needs to
4557                          * be opened or closed.
4558                          */
4559                         do_memory = B_FALSE;
4560                         do_read = B_FALSE;
4561                         do_close = B_FALSE;
4562 
4563                         /*
4564                          * If all clients have gone, close usage collecting
4565                          */
4566                         (void) mutex_lock(&g_usage_cache_lock);
4567                         if (!g_hasclient && g_open == B_TRUE) {
4568                                 do_close = B_TRUE;
4569                                 (void) mutex_unlock(&g_usage_cache_lock);
4570                                 break;
4571                         }
4572                         if (g_quit == B_TRUE) {
4573                                 (void) mutex_unlock(
4574                                     &g_usage_cache_lock);
4575                                 break;
4576                         }
4577                         /*
4578                          * Wait for a usage data request
4579                          */
4580                         if (g_usage_cache_kickers == 0) {
4581                                 (void) cond_wait(&g_usage_cache_kick,
4582                                     &g_usage_cache_lock);
4583                         }
4584                         now = time(NULL);
4585                         if (now < 0) {
4586                                 if (g_quit == B_TRUE) {
4587                                         (void) mutex_unlock(
4588                                             &g_usage_cache_lock);
4589                                         goto quit;
4590                                 }
4591                                 g_quit = B_TRUE;
4592                                 (void) mutex_unlock(&g_usage_cache_lock);
4593                                 zsd_warn(gettext(
4594                                     "Unable to fetch current time"));
4595                                 goto quit;
4596                         }
4597                         if (g_hasclient) {
4598                                 do_read = B_TRUE;
4599                                 if (now >= next_memory) {
4600                                         do_memory = B_TRUE;
4601                                         next_memory = now + g_interval;
4602                                 }
4603                         } else {
4604                                 do_close = B_TRUE;
4605                         }
4606                         (void) mutex_unlock(&g_usage_cache_lock);
4607                         if (do_read || do_close)
4608                                 break;
4609                 }
4610                 g_now = now;
4611                 g_hrnow = gethrtime();
4612                 if (g_hasclient && g_open == B_FALSE) {
4613                         g_start = g_now;
4614                         g_hrstart = g_hrnow;
4615                         g_ctl = zsd_open(g_ctl);
4616                         if (g_ctl == NULL)
4617                                 zsd_warn(gettext(
4618                                     "Unable to open zone statistics"));
4619                         else
4620                                 g_open = B_TRUE;
4621                 }
4622                 if (do_read && g_ctl) {
4623                         if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4624                                 zsd_warn(gettext(
4625                                     "Unable to read zone statistics"));
4626                                 g_quit = B_TRUE;
4627                                 return (NULL);
4628                         }
4629                 }
4630                 (void) mutex_lock(&g_usage_cache_lock);
4631                 if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4632                         (void) mutex_unlock(&g_usage_cache_lock);
4633                         zsd_close(g_ctl);
4634                         g_open = B_FALSE;
4635                 } else {
4636                         (void) mutex_unlock(&g_usage_cache_lock);
4637                 }
4638         }
4639 quit:
4640         if (g_open)
4641                 zsd_close(g_ctl);
4642 
4643         (void) thr_kill(g_main, SIGINT);
4644         thr_exit(NULL);
4645         return (NULL);
4646 }
4647 
4648 void
4649 zsd_set_fx()
4650 {
4651         pcinfo_t pcinfo;
4652         pcparms_t pcparms;
4653 
4654         (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4655         if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4656                 zsd_warn(gettext("cannot get FX class parameters"));
4657                 return;
4658         }
4659         pcparms.pc_cid = pcinfo.pc_cid;
4660         ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4661         ((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4662         ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4663         ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4664         if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4665                 zsd_warn(gettext("cannot enter the FX class"));
4666 }
4667 
/* Write end of the pipe to the waiting parent; set by daemonize_start() */
static int pipe_fd;

/*
 * Report the daemon's readiness to the parent by sending it a single
 * status byte, then close the pipe.  The result of the write is ignored.
 */
static void
daemonize_ready(char status)
{
        (void) write(pipe_fd, &status, sizeof (status));
        (void) close(pipe_fd);
}
4679 
4680 static int
4681 daemonize_start(void)
4682 {
4683         char data;
4684         int status;
4685 
4686         int filedes[2];
4687         pid_t pid;
4688 
4689         (void) close(0);
4690         (void) dup2(2, 1);
4691 
4692         if (pipe(filedes) < 0)
4693                 return (-1);
4694 
4695         (void) fflush(NULL);
4696 
4697         if ((pid = fork1()) < 0)
4698                 return (-1);
4699 
4700         if (pid != 0) {
4701                 /*
4702                  * parent
4703                  */
4704                 struct sigaction act;
4705 
4706                 act.sa_sigaction = SIG_DFL;
4707                 (void) sigemptyset(&act.sa_mask);
4708                 act.sa_flags = 0;
4709 
4710                 (void) sigaction(SIGPIPE, &act, NULL);  /* ignore SIGPIPE */
4711 
4712                 (void) close(filedes[1]);
4713                 if (read(filedes[0], &data, 1) == 1) {
4714                         /* forward ready code via exit status */
4715                         exit(data);
4716                 }
4717                 status = -1;
4718                 (void) wait4(pid, &status, 0, NULL);
4719                 /* daemon process exited before becoming ready */
4720                 if (WIFEXITED(status)) {
4721                         /* assume daemon process printed useful message */
4722                         exit(WEXITSTATUS(status));
4723                 } else {
4724                         zsd_warn(gettext("daemon process killed or died"));
4725                         exit(1);
4726                 }
4727         }
4728 
4729         /*
4730          * child
4731          */
4732         pipe_fd = filedes[1];
4733         (void) close(filedes[0]);
4734 
4735         /*
4736          * generic Unix setup
4737          */
4738         (void) setsid();
4739         (void) umask(0000);
4740 
4741         return (0);
4742 }
4743 
4744 static void
4745 fattach_all_zones(boolean_t detach_only)
4746 {
4747         zoneid_t *zids;
4748         uint_t nzids, nzids_last;
4749         int i;
4750 
4751 again:
4752         (void) zone_list(NULL, &nzids);
4753         nzids_last = nzids;
4754         zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4755         if (zids == NULL)
4756                 zsd_error(gettext("Out of memory"));
4757 
4758         (void) zone_list(zids, &nzids);
4759         if (nzids > nzids_last) {
4760                 free(zids);
4761                 goto again;
4762         }
4763         for (i = 0; i < nzids; i++)
4764                 zsd_fattach_zone(zids[i], g_server_door, detach_only);
4765 
4766         free(zids);
4767 }
4768 
4769 int
4770 main(int argc, char *argv[])
4771 {
4772 
4773         int arg;
4774         thread_t tid;
4775         scf_simple_prop_t *prop;
4776         uint64_t *intervalp;
4777         boolean_t opt_cleanup = B_FALSE;
4778 
4779         g_main = thr_self();
4780         g_quit = B_FALSE;
4781         (void) signal(SIGINT, zonestat_quithandler);
4782         (void) signal(SIGTERM, zonestat_quithandler);
4783         (void) signal(SIGHUP, zonestat_quithandler);
4784 /*      (void) sigignore(SIGCHLD); */
4785         (void) sigignore(SIGPIPE);
4786 
4787         if (getzoneid() != GLOBAL_ZONEID)
4788                 zsd_error(gettext("Must be run from global zone only"));
4789 
4790         while ((arg = getopt(argc, argv, "c"))
4791             != EOF) {
4792                 switch (arg) {
4793                 case 'c':
4794                         opt_cleanup = B_TRUE;
4795                         break;
4796                 default:
4797                         zsd_error(gettext("Invalid option"));
4798                 }
4799         }
4800 
4801         if (opt_cleanup) {
4802                 if (zsd_disable_cpu_stats() != 0)
4803                         exit(1);
4804                 else
4805                         exit(0);
4806         }
4807 
4808         /* Get the configured sample interval */
4809         prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
4810             "config", "sample_interval");
4811         if (prop == NULL)
4812                 zsd_error(gettext("Unable to fetch SMF property "
4813                     "\"config/sample_interval\""));
4814 
4815         if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
4816                 zsd_error(gettext("Malformed SMF property "
4817                     "\"config/sample_interval\".  Must be of type \"count\""));
4818 
4819         intervalp = scf_simple_prop_next_count(prop);
4820         g_interval = *intervalp;
4821         if (g_interval == 0)
4822                 zsd_error(gettext("Malformed SMF property "
4823                     "\"config/sample_interval\".  Must be greater than zero"));
4824 
4825         scf_simple_prop_free(prop);
4826 
4827         if (daemonize_start() < 0)
4828                 zsd_error(gettext("Unable to start daemon\n"));
4829 
4830         /* Run at high priority */
4831         zsd_set_fx();
4832 
4833         (void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
4834         (void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
4835         (void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);
4836 
4837         g_server_door = door_create(zsd_server, NULL,
4838             DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4839         if (g_server_door < 0)
4840                 zsd_error(gettext("Unable to create server door\n"));
4841 
4842 
4843         g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
4844             DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
4845         if (g_stat_door < 0)
4846                 zsd_error(gettext("Unable to create statistics door\n"));
4847 
4848         fattach_all_zones(B_FALSE);
4849 
4850         if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
4851                 zsd_error(gettext("Unable to create statistics thread\n"));
4852 
4853         daemonize_ready(0);
4854 
4855         /* Wait for signal to quit */
4856         while (g_quit == B_FALSE)
4857                 (void) pause();
4858 
4859         /* detach doors */
4860         fattach_all_zones(B_TRUE);
4861 
4862         (void) door_revoke(g_server_door);
4863         (void) door_revoke(g_stat_door);
4864 
4865         /* kick stat thread and wait for it to close the statistics */
4866         (void) mutex_lock(&g_usage_cache_lock);
4867         g_quit = B_TRUE;
4868         (void) cond_signal(&g_usage_cache_kick);
4869         (void) mutex_unlock(&g_usage_cache_lock);
4870 end:
4871         (void) thr_join(tid, NULL, NULL);
4872         return (0);
4873 }