/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2019, Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 *   callbacks have not yet run.  It is not possible to enter the zone, but
 *   attributes can be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, and the system
 *   is killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VFS_MOUNT() may
 *   check the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): All references have been dropped and
 *   the zone_t is no longer in the zone_active nor zone_deathrow lists.
 *   The zone_t is in the process of being freed.  This state exists
 *   only for publishing a sysevent to indicate that the zone by this
 *   name can be booted again.
 *
 *   Threads can wait for the zone to enter a requested state (other than
 *   ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait()
 *   with the desired state passed in as an argument.  Zone state transitions
 *   are uni-directional; it is not possible to move back to an earlier state.
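 *
 *   For example, a minimal sketch (illustrative only, not taken from this
 *   file) of a caller that already holds a reference on a zone and wants
 *   to wait for it to finish booting:
 *
 *      zone_status_wait(zone, ZONE_IS_RUNNING);
 *      if (zone_status_get(zone) > ZONE_IS_RUNNING)
 *              ... the zone moved on to shutdown instead; back out ...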
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
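 *
 *   A minimal lookup sketch (illustrative only; "myzone" is a hypothetical
 *   zone name, and error handling is elided):
 *
 *      zone_t *zp;
 *
 *      if ((zp = zone_find_by_name("myzone")) != NULL) {
 *              ... use zp; the hold keeps the zone from going away ...
 *              zone_rele(zp);
 *      }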
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just zone.max-lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
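 *
 *   For example, a hypothetical code path (a sketch, not taken from this
 *   file) that must examine both the zone lists and a zone's ZSD entries
 *   takes the locks in the order given above; locks not needed for the
 *   task at hand are simply skipped:
 *
 *      mutex_enter(&zonehash_lock);
 *      mutex_enter(&zone->zone_lock);
 *      ... examine zone_zsd, etc. ...
 *      mutex_exit(&zone->zone_lock);
 *      mutex_exit(&zonehash_lock);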
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */
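
/*
 * Illustration only: user code normally reaches these subcodes through
 * libc wrappers rather than invoking the "zone" system call directly.
 * A hedged sketch using the public wrapper for the zone_lookup subcode
 * ("myzone" is a hypothetical zone name):
 *
 *      #include <zone.h>
 *
 *      zoneid_t zid = getzoneidbyname("myzone");
 *      if (zid == -1)
 *              ... no running zone by that name ...
 */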

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
#include <sys/rt.h>
#include <sys/fx.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_zfs_io_t zone0_zp_zfs;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* This can be dynamically reduced if various subsystems hit internal limits. */
uint_t maxzones = MAX_ZONES;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
        ZONE_EVENT_FREE,                /* free */
};
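
/*
 * The table above is indexed directly by the kernel zone_status_t; a
 * hedged sketch of the intended use when publishing a state change
 * notification:
 *
 *      const char *ev_state = zone_status_table[zone_status_get(zone)];
 */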

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_cpu_baseline;
rctl_hndl_t rc_zone_cpu_burst_time;
rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);
static void zone_status_set(zone_t *, zone_status_t);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 * Version 7 adds the requested zoneid to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 7;

/*
 * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
 * data which can be referenced independently of the zone_t structure. This
 * data falls into two categories:
 *   1) pages and RSS data associated with processes inside a zone
 *   2) in-flight ZFS I/O data
 *
 * Each member of zone_persist_t stores the zone's current page usage, its page
 * limit, a flag indicating if the zone is over its physical memory cap and
 * various page-related statistics. The zpers_over flag is the interface for
 * the page scanner to use when reclaiming pages for zones that are over their
 * cap. The zone_persist_t structure also includes a mutex and a reference to a
 * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
 *
 * All zone physical memory cap data is stored in this array instead of within
 * the zone structure itself. This is because zone structures come and go, but
 * paging-related work can be asynchronous to any particular zone. In
 * particular:
 * 1) Page scanning to reclaim pages occurs from a kernel thread that is not
 *    associated with any zone.
 * 2) Freeing segkp pages can occur long after the zone which first
 *    instantiated those pages has gone away.
 * We want to be able to account for pages/zone without constantly having to
 * take extra locks and find the relevant zone structure, particularly during
 * page scanning.
 *
 * The page scanner can run when "zone_num_over_cap" is non-zero. It can
 * do a direct lookup of a zoneid into the "zone_pdata" array to determine
 * if that zone is over its cap.
 *
 * There is no locking for the page scanner to perform these two checks.
 * We cannot have the page scanner blocking normal paging activity for
 * running processes. Because the physical memory cap is a soft cap, it is
 * fine for the scanner to simply read the current state of the counter and
 * the zone's zpers_over entry in the array. The scanner should never modify
 * either of these items. Internally the entries and the counter are managed
 * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
 * take care to ensure that we only take the zone_physcap_lock mutex when a
 * zone is transitioning over/under its physical memory cap.
 *
 * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
 * the "zone_pdata" array and associated counter.
 *
 * The zone_persist_t structure tracks the zone's physical cap and physical
 * usage in terms of pages. These values are currently defined as uint32. Thus,
 * the maximum number of pages we can track is UINT_MAX - 1 (4,294,967,294),
 * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
 * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
 * In the future we may need to expand these counters to 64-bit, but for now
 * we're using 32-bit to conserve memory, since this array is statically
 * allocated within the kernel based on the maximum number of zones supported.
 *
 * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
 * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
 * had to continuously find the zone structure associated with an I/O that has
 * just completed. To avoid that overhead, we track the I/O data within the
 * zone_zfs_io_t instead. We can directly access that data without having to
 * look up the full zone_t structure.
 */
uint_t zone_num_over_cap;
zone_persist_t zone_pdata[MAX_ZONES];
static kmutex_t zone_physcap_lock;
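
/*
 * A minimal sketch (illustrative only, not part of this file) of the
 * lock-free check described above, as the page scanner might perform it;
 * the field name follows the zone_persist_t description, and no locks are
 * taken on this path:
 *
 *      if (zone_num_over_cap > 0 && zone_pdata[zoneid].zpers_over)
 *              ... this zone is a candidate for page reclamation ...
 */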

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created/destroyed such
 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong
 * zone's mount list. Since a zone can't reside on an NFS file system, we
 * don't have to worry about the zonepath itself.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone. This synchronization is on a per-zone basis, so
 * activity for one zone will not interfere with activity for another zone.
 *
 * The semantics are like a reader-reader lock: there may be multiple
 * mounts in progress at the same time, or multiple zone state transitions
 * (if those weren't already serialized by zonehash_lock), but not
 * mounts and state transitions at the same time.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is an unfair bias towards the
 * "current" operation.  This means that zone halt may starve if
 * there is a rapid succession of new mounts coming in to the zone.
 */
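/*
 * A hedged sketch of the two sides of this synchronization (illustrative
 * only).  A zone state transition brackets its work with
 * block_mounts()/resume_mounts():
 *
 *      if (block_mounts(zp)) {
 *              ... perform the state transition ...
 *              resume_mounts(zp);
 *      }
 *
 * while the VFS layer brackets each mount with
 * mount_in_progress()/mount_completed():
 *
 *      mount_in_progress(zp);
 *      error = VFS_MOUNT(vfsp, mvp, uap, cr);
 *      mount_completed(zp);
 */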
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(zone_t *zp)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress > 0) {
                if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers
         * (remotely possible if two threads enter zone_shutdown at the same
         * time).
         */
        zp->zone_mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&zp->zone_mount_lock);
        return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (++zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * The VFS layer is busy with a mount; this zone should wait until all
 * of its mounts are completed to progress.
 */
void
mount_in_progress(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress < 0)
                cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
        zp->zone_mounts_in_progress++;
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (--zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shut down, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys. Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock, and zsd_flags are used to ensure that the operations
 * have completed, so that when zone_key_create (and zone_create) is done,
 * as well as zone_key_delete (and zone_destroy), all the necessary
 * callbacks are completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
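
/*
 * A minimal usage sketch (hypothetical subsystem "foo"; not part of this
 * file).  The create callback allocates per-zone state, the destroy
 * callback frees it, and zone_getspecific() retrieves it:
 *
 *      static zone_key_t foo_zone_key;
 *
 *      static void *
 *      foo_zone_init(zoneid_t zoneid)
 *      {
 *              return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP));
 *      }
 *
 *      static void
 *      foo_zone_fini(zoneid_t zoneid, void *data)
 *      {
 *              if (data != NULL)
 *                      kmem_free(data, sizeof (foo_zone_data_t));
 *      }
 *
 *      ...
 *      zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 *      ...
 *      foo_zone_data_t *fzd = zone_getspecific(foo_zone_key, zone);
 *      ...
 *      (void) zone_key_delete(foo_zone_key);
 */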

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t  key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * A zone_zsd_configure() already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now; make it
         * globally visible.  Specifically, zone_getspecific() will
         * always successfully return the zone-specific data associated
         * with the key.
         */
        *keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
        struct zsd_entry *zsdp = NULL;
        zone_t *zone;

        mutex_enter(&zsd_key_lock);
        zsdp = zsd_find_mru(&zsd_registered_keys, key);
        if (zsdp == NULL) {
                mutex_exit(&zsd_key_lock);
                return (-1);
        }
        list_remove(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find_mru(&zone->zone_zsd, key);
                if (del == NULL) {
                        /*
                         * Somebody else got here first, e.g. the zone
                         * going away.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
                ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                if (del->zsd_shutdown != NULL &&
                    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                        del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(zsd__shutdown__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                if (del->zsd_destroy != NULL &&
                    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                        del->zsd_flags |= ZSD_DESTROY_NEEDED;
                        DTRACE_PROBE2(zsd__destroy__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);
        kmem_free(zsdp, sizeof (*zsdp));

        /* Now call the shutdown and destroy callback for this key */
        zsd_apply_all_zones(zsd_apply_shutdown, key);
        zsd_apply_all_zones(zsd_apply_destroy, key);

        /* Now we can free up the zsdp structures in each zone */
        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find(&zone->zone_zsd, key);
                if (del != NULL) {
                        list_remove(&zone->zone_zsd, del);
                        ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
                        kmem_free(del, sizeof (*del));
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        return (0);
}

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        if (t != NULL) {
                /*
                 * Replace old value with new
                 */
                t->zsd_data = (void *)data;
                mutex_exit(&zone->zone_lock);
                return (0);
        }
        mutex_exit(&zone->zone_lock);
        return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
        struct zsd_entry *t;
        void *data;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        data = (t == NULL ? NULL : t->zsd_data);
        mutex_exit(&zone->zone_lock);
        return (data);
}

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys). The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;

        ASSERT(MUTEX_HELD(&zonehash_lock));
        ASSERT(list_head(&zone->zone_zsd) == NULL);
        mutex_enter(&zone->zone_lock);
        mutex_enter(&zsd_key_lock);
        for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
            zsdp = list_next(&zsd_registered_keys, zsdp)) {
                /*
                 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
                 * should not have added anything to it.
                 */
                ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = zsdp->zsd_key;
                t->zsd_create = zsdp->zsd_create;
                t->zsd_shutdown = zsdp->zsd_shutdown;
                t->zsd_destroy = zsdp->zsd_destroy;
                if (zsdp->zsd_create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, zsdp->zsd_key);
                }
                list_insert_tail(&zone->zone_zsd, t);
        }
        mutex_exit(&zsd_key_lock);
        mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
        struct zsd_entry *t;

        ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
        ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
        ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

        /*
         * Run the callback solely based on what is registered for the zone
         * in zone_zsd. The global list can change independently of this
         * as keys are registered and unregistered and we don't register new
         * callbacks for a zone that is in the process of going away.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL;
            t = list_next(&zone->zone_zsd, t)) {
                zone_key_t key = t->zsd_key;

                /* Skip if no callbacks registered */

                if (ct == ZSD_SHUTDOWN) {
                        if (t->zsd_shutdown != NULL &&
                            (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                                t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                                DTRACE_PROBE2(zsd__shutdown__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                } else {
                        if (t->zsd_destroy != NULL &&
                            (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                                t->zsd_flags |= ZSD_DESTROY_NEEDED;
                                DTRACE_PROBE2(zsd__destroy__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                }
        }
        mutex_exit(&zone->zone_lock);

        /* Now call the shutdown and destroy callbacks for this zone */
        zsd_apply_all_keys(zsd_apply_shutdown, zone);
        zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
        struct zsd_entry *t, *next;

        /*
         * Free all the zsd_entry's we had on this zone.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
                next = list_next(&zone->zone_zsd, t);
                list_remove(&zone->zone_zsd, t);
                ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_zsd);
        mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        zone = list_head(&zone_active);
        while (zone != NULL) {
                if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
                        /* Lock dropped - restart at head */
                        zone = list_head(&zone_active);
                } else {
                        zone = list_next(&zone_active, zone);
                }
        }
        mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = list_head(&zone->zone_zsd);
        while (t != NULL) {
                if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
                        /* Lock dropped - restart at head */
                        t = list_head(&zone->zone_zsd);
                } else {
                        t = list_next(&zone->zone_zsd, t);
                }
        }
        mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        void *result;
        struct zsd_entry *t;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone
                 * going away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                t->zsd_flags &= ~ZSD_CREATE_NEEDED;
                t->zsd_flags |= ZSD_CREATE_INPROGRESS;
                DTRACE_PROBE2(zsd__create__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);

                dropped = B_TRUE;
                ASSERT(t->zsd_create != NULL);
                DTRACE_PROBE2(zsd__create__start,
                    zone_t *, zone, zone_key_t, key);

                result = (*t->zsd_create)(zone->zone_id);

                DTRACE_PROBE2(zsd__create__end,
                    zone_t *, zone, void *, result);

                ASSERT(result != NULL);
                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = result;
                t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
                t->zsd_flags |= ZSD_CREATE_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__create__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone
                 * going away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
                t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(zsd__shutdown__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_shutdown != NULL);
                data = t->zsd_data;

                DTRACE_PROBE2(zsd__shutdown__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_shutdown)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__shutdown__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
                t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__shutdown__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone
                 * going away.
                 */
1270                 if (!zone_lock_held)
1271                         mutex_exit(&zone->zone_lock);
1272                 return (B_FALSE);
1273         }
1274         dropped = B_FALSE;
1275         if (zsd_wait_for_creator(zone, t, lockp))
1276                 dropped = B_TRUE;
1277 
1278         if (zsd_wait_for_inprogress(zone, t, lockp))
1279                 dropped = B_TRUE;
1280 
1281         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1282                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1283                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1284                 DTRACE_PROBE2(zsd__destroy__inprogress,
1285                     zone_t *, zone, zone_key_t, key);
1286                 mutex_exit(&zone->zone_lock);
1287                 if (lockp != NULL)
1288                         mutex_exit(lockp);
1289                 dropped = B_TRUE;
1290 
1291                 ASSERT(t->zsd_destroy != NULL);
1292                 data = t->zsd_data;
1293                 DTRACE_PROBE2(zsd__destroy__start,
1294                     zone_t *, zone, zone_key_t, key);
1295 
1296                 (t->zsd_destroy)(zone->zone_id, data);
1297                 DTRACE_PROBE2(zsd__destroy__end,
1298                     zone_t *, zone, zone_key_t, key);
1299 
1300                 if (lockp != NULL)
1301                         mutex_enter(lockp);
1302                 mutex_enter(&zone->zone_lock);
1303                 t->zsd_data = NULL;
1304                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1305                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1306                 cv_broadcast(&t->zsd_cv);
1307                 DTRACE_PROBE2(zsd__destroy__completed,
1308                     zone_t *, zone, zone_key_t, key);
1309         }
1310         if (!zone_lock_held)
1311                 mutex_exit(&zone->zone_lock);
1312         return (dropped);
1313 }
1314 
1315 /*
1316  * Wait for any CREATE_NEEDED flag to be cleared.
1317  * Returns true if lockp was temporarily dropped while waiting.
1318  */
1319 static boolean_t
1320 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1321 {
1322         boolean_t dropped = B_FALSE;
1323 
1324         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1325                 DTRACE_PROBE2(zsd__wait__for__creator,
1326                     zone_t *, zone, struct zsd_entry *, t);
1327                 if (lockp != NULL) {
1328                         dropped = B_TRUE;
1329                         mutex_exit(lockp);
1330                 }
1331                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1332                 if (lockp != NULL) {
1333                         /* First drop zone_lock to preserve order */
1334                         mutex_exit(&zone->zone_lock);
1335                         mutex_enter(lockp);
1336                         mutex_enter(&zone->zone_lock);
1337                 }
1338         }
1339         return (dropped);
1340 }
1341 
1342 /*
1343  * Wait for any INPROGRESS flag to be cleared.
1344  * Returns true if lockp was temporarily dropped while waiting.
1345  */
1346 static boolean_t
1347 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1348 {
1349         boolean_t dropped = B_FALSE;
1350 
1351         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1352                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1353                     zone_t *, zone, struct zsd_entry *, t);
1354                 if (lockp != NULL) {
1355                         dropped = B_TRUE;
1356                         mutex_exit(lockp);
1357                 }
1358                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1359                 if (lockp != NULL) {
1360                         /* First drop zone_lock to preserve order */
1361                         mutex_exit(&zone->zone_lock);
1362                         mutex_enter(lockp);
1363                         mutex_enter(&zone->zone_lock);
1364                 }
1365         }
1366         return (dropped);
1367 }
1368 
1369 /*
1370  * Frees memory associated with the zone dataset list.
1371  */
1372 static void
1373 zone_free_datasets(zone_t *zone)
1374 {
1375         zone_dataset_t *t, *next;
1376 
1377         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1378                 next = list_next(&zone->zone_datasets, t);
1379                 list_remove(&zone->zone_datasets, t);
1380                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1381                 kmem_free(t, sizeof (*t));
1382         }
1383         list_destroy(&zone->zone_datasets);
1384 }
1385 
1386 /*
1387  * zone.cpu-shares resource control support.
1388  */
1389 /*ARGSUSED*/
1390 static rctl_qty_t
1391 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1392 {
1393         ASSERT(MUTEX_HELD(&p->p_lock));
1394         return (p->p_zone->zone_shares);
1395 }
1396 
1397 /*ARGSUSED*/
1398 static int
1399 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1400     rctl_qty_t nv)
1401 {
1402         ASSERT(MUTEX_HELD(&p->p_lock));
1403         ASSERT(e->rcep_t == RCENTITY_ZONE);
1404         if (e->rcep_p.zone == NULL)
1405                 return (0);
1406 
1407         e->rcep_p.zone->zone_shares = nv;
1408         return (0);
1409 }
1410 
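/*
 * The rctl_ops_t tables below list, in order, the action, usage, set, and
 * test operations for each control; the rcop_no_* entries are stubs for
 * operations a given control does not implement.
 */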
1411 static rctl_ops_t zone_cpu_shares_ops = {
1412         rcop_no_action,
1413         zone_cpu_shares_usage,
1414         zone_cpu_shares_set,
1415         rcop_no_test
1416 };
1417 
1418 /*
1419  * zone.cpu-cap resource control support.
1420  */
1421 /*ARGSUSED*/
1422 static rctl_qty_t
1423 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1424 {
1425         ASSERT(MUTEX_HELD(&p->p_lock));
1426         return (cpucaps_zone_get(p->p_zone));
1427 }
1428 
1429 /*ARGSUSED*/
1430 static int
1431 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1432     rctl_qty_t nv)
1433 {
1434         zone_t *zone = e->rcep_p.zone;
1435 
1436         ASSERT(MUTEX_HELD(&p->p_lock));
1437         ASSERT(e->rcep_t == RCENTITY_ZONE);
1438 
1439         if (zone == NULL)
1440                 return (0);
1441 
1442         /*
1443          * set cap to the new value.
1444          */
1445         return (cpucaps_zone_set(zone, nv));
1446 }
1447 
1448 static rctl_ops_t zone_cpu_cap_ops = {
1449         rcop_no_action,
1450         zone_cpu_cap_get,
1451         zone_cpu_cap_set,
1452         rcop_no_test
1453 };
1454 
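/*
 * zone.cpu-baseline resource control support.
 */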
1455 /*ARGSUSED*/
1456 static rctl_qty_t
1457 zone_cpu_base_get(rctl_t *rctl, struct proc *p)
1458 {
1459         ASSERT(MUTEX_HELD(&p->p_lock));
1460         return (cpucaps_zone_get_base(p->p_zone));
1461 }
1462 
1463 /*
1464  * The zone cpu base is used to set the baseline CPU for the zone
1465  * so we can track when the zone is bursting.
1466  */
1467 /*ARGSUSED*/
1468 static int
1469 zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1470     rctl_qty_t nv)
1471 {
1472         zone_t *zone = e->rcep_p.zone;
1473 
1474         ASSERT(MUTEX_HELD(&p->p_lock));
1475         ASSERT(e->rcep_t == RCENTITY_ZONE);
1476 
1477         if (zone == NULL)
1478                 return (0);
1479 
1480         return (cpucaps_zone_set_base(zone, nv));
1481 }
1482 
1483 static rctl_ops_t zone_cpu_base_ops = {
1484         rcop_no_action,
1485         zone_cpu_base_get,
1486         zone_cpu_base_set,
1487         rcop_no_test
1488 };
1489 
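/*
 * zone.cpu-burst-time resource control support.
 */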
1490 /*ARGSUSED*/
1491 static rctl_qty_t
1492 zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
1493 {
1494         ASSERT(MUTEX_HELD(&p->p_lock));
1495         return (cpucaps_zone_get_burst_time(p->p_zone));
1496 }
1497 
1498 /*
1499  * The zone cpu burst time is used to set the amount of time CPU(s) can be
1500  * bursting for the zone.
1501  */
1502 /*ARGSUSED*/
1503 static int
1504 zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1505     rctl_qty_t nv)
1506 {
1507         zone_t *zone = e->rcep_p.zone;
1508 
1509         ASSERT(MUTEX_HELD(&p->p_lock));
1510         ASSERT(e->rcep_t == RCENTITY_ZONE);
1511 
1512         if (zone == NULL)
1513                 return (0);
1514 
1515         return (cpucaps_zone_set_burst_time(zone, nv));
1516 }
1517 
1518 static rctl_ops_t zone_cpu_burst_time_ops = {
1519         rcop_no_action,
1520         zone_cpu_burst_time_get,
1521         zone_cpu_burst_time_set,
1522         rcop_no_test
1523 };
1524 
1525 /*
1526  * zone.zfs-io-pri resource control support (IO priority).
1527  */
1528 /*ARGSUSED*/
1529 static rctl_qty_t
1530 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1531 {
1532         zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
1533         rctl_qty_t r = 0;
1534 
1535         ASSERT(MUTEX_HELD(&p->p_lock));
1536         mutex_enter(&zp->zpers_zfs_lock);
1537         if (zp->zpers_zfsp != NULL)
1538                 r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
1539         mutex_exit(&zp->zpers_zfs_lock);
1540 
1541         return (r);
1542 }
1543 
1544 /*ARGSUSED*/
1545 static int
1546 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1547     rctl_qty_t nv)
1548 {
1549         zone_t *zone = e->rcep_p.zone;
1550         zone_persist_t *zp;
1551 
1552         ASSERT(MUTEX_HELD(&p->p_lock));
1553         ASSERT(e->rcep_t == RCENTITY_ZONE);
1554 
1555         if (zone == NULL)
1556                 return (0);
1557 
1558         /*
1559          * set priority to the new value.
1560          */
1561         zp = &zone_pdata[zone->zone_id];
1562         mutex_enter(&zp->zpers_zfs_lock);
1563         if (zp->zpers_zfsp != NULL)
1564                 zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
1565         mutex_exit(&zp->zpers_zfs_lock);
1566         return (0);
1567 }
1568 
1569 static rctl_ops_t zone_zfs_io_pri_ops = {
1570         rcop_no_action,
1571         zone_zfs_io_pri_get,
1572         zone_zfs_io_pri_set,
1573         rcop_no_test
1574 };
1575 
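/*
 * zone.max-lwps resource control support.
 */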
1576 /*ARGSUSED*/
1577 static rctl_qty_t
1578 zone_lwps_usage(rctl_t *r, proc_t *p)
1579 {
1580         rctl_qty_t nlwps;
1581         zone_t *zone = p->p_zone;
1582 
1583         ASSERT(MUTEX_HELD(&p->p_lock));
1584 
1585         mutex_enter(&zone->zone_nlwps_lock);
1586         nlwps = zone->zone_nlwps;
1587         mutex_exit(&zone->zone_nlwps_lock);
1588 
1589         return (nlwps);
1590 }
1591 
1592 /*ARGSUSED*/
1593 static int
1594 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1595     rctl_qty_t incr, uint_t flags)
1596 {
1597         rctl_qty_t nlwps;
1598 
1599         ASSERT(MUTEX_HELD(&p->p_lock));
1600         ASSERT(e->rcep_t == RCENTITY_ZONE);
1601         if (e->rcep_p.zone == NULL)
1602                 return (0);
1603         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1604         nlwps = e->rcep_p.zone->zone_nlwps;
1605 
1606         if (nlwps + incr > rcntl->rcv_value)
1607                 return (1);
1608 
1609         return (0);
1610 }
1611 
1612 /*ARGSUSED*/
1613 static int
1614 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1615 {
1616         ASSERT(MUTEX_HELD(&p->p_lock));
1617         ASSERT(e->rcep_t == RCENTITY_ZONE);
1618         if (e->rcep_p.zone == NULL)
1619                 return (0);
1620         e->rcep_p.zone->zone_nlwps_ctl = nv;
1621         return (0);
1622 }
1623 
1624 static rctl_ops_t zone_lwps_ops = {
1625         rcop_no_action,
1626         zone_lwps_usage,
1627         zone_lwps_set,
1628         zone_lwps_test,
1629 };
1630 
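/*
 * zone.max-processes resource control support.
 */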
1631 /*ARGSUSED*/
1632 static rctl_qty_t
1633 zone_procs_usage(rctl_t *r, proc_t *p)
1634 {
1635         rctl_qty_t nprocs;
1636         zone_t *zone = p->p_zone;
1637 
1638         ASSERT(MUTEX_HELD(&p->p_lock));
1639 
1640         mutex_enter(&zone->zone_nlwps_lock);
1641         nprocs = zone->zone_nprocs;
1642         mutex_exit(&zone->zone_nlwps_lock);
1643 
1644         return (nprocs);
1645 }
1646 
1647 /*ARGSUSED*/
1648 static int
1649 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1650     rctl_qty_t incr, uint_t flags)
1651 {
1652         rctl_qty_t nprocs;
1653 
1654         ASSERT(MUTEX_HELD(&p->p_lock));
1655         ASSERT(e->rcep_t == RCENTITY_ZONE);
1656         if (e->rcep_p.zone == NULL)
1657                 return (0);
1658         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1659         nprocs = e->rcep_p.zone->zone_nprocs;
1660 
1661         if (nprocs + incr > rcntl->rcv_value)
1662                 return (1);
1663 
1664         return (0);
1665 }
1666 
1667 /*ARGSUSED*/
1668 static int
1669 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1670 {
1671         ASSERT(MUTEX_HELD(&p->p_lock));
1672         ASSERT(e->rcep_t == RCENTITY_ZONE);
1673         if (e->rcep_p.zone == NULL)
1674                 return (0);
1675         e->rcep_p.zone->zone_nprocs_ctl = nv;
1676         return (0);
1677 }
1678 
1679 static rctl_ops_t zone_procs_ops = {
1680         rcop_no_action,
1681         zone_procs_usage,
1682         zone_procs_set,
1683         zone_procs_test,
1684 };
1685 
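/*
 * zone.max-shm-memory resource control support.
 */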
1686 /*ARGSUSED*/
1687 static rctl_qty_t
1688 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1689 {
1690         ASSERT(MUTEX_HELD(&p->p_lock));
1691         return (p->p_zone->zone_shmmax);
1692 }
1693 
1694 /*ARGSUSED*/
1695 static int
1696 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1697     rctl_qty_t incr, uint_t flags)
1698 {
1699         rctl_qty_t v;
1700         ASSERT(MUTEX_HELD(&p->p_lock));
1701         ASSERT(e->rcep_t == RCENTITY_ZONE);
1702         v = e->rcep_p.zone->zone_shmmax + incr;
1703         if (v > rval->rcv_value)
1704                 return (1);
1705         return (0);
1706 }
1707 
1708 static rctl_ops_t zone_shmmax_ops = {
1709         rcop_no_action,
1710         zone_shmmax_usage,
1711         rcop_no_set,
1712         zone_shmmax_test
1713 };
1714 
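/*
 * zone.max-shm-ids resource control support.
 */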
1715 /*ARGSUSED*/
1716 static rctl_qty_t
1717 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1718 {
1719         ASSERT(MUTEX_HELD(&p->p_lock));
1720         return (p->p_zone->zone_ipc.ipcq_shmmni);
1721 }
1722 
1723 /*ARGSUSED*/
1724 static int
1725 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1726     rctl_qty_t incr, uint_t flags)
1727 {
1728         rctl_qty_t v;
1729         ASSERT(MUTEX_HELD(&p->p_lock));
1730         ASSERT(e->rcep_t == RCENTITY_ZONE);
1731         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1732         if (v > rval->rcv_value)
1733                 return (1);
1734         return (0);
1735 }
1736 
1737 static rctl_ops_t zone_shmmni_ops = {
1738         rcop_no_action,
1739         zone_shmmni_usage,
1740         rcop_no_set,
1741         zone_shmmni_test
1742 };
1743 
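/*
 * zone.max-sem-ids resource control support.
 */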
1744 /*ARGSUSED*/
1745 static rctl_qty_t
1746 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1747 {
1748         ASSERT(MUTEX_HELD(&p->p_lock));
1749         return (p->p_zone->zone_ipc.ipcq_semmni);
1750 }
1751 
1752 /*ARGSUSED*/
1753 static int
1754 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1755     rctl_qty_t incr, uint_t flags)
1756 {
1757         rctl_qty_t v;
1758         ASSERT(MUTEX_HELD(&p->p_lock));
1759         ASSERT(e->rcep_t == RCENTITY_ZONE);
1760         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1761         if (v > rval->rcv_value)
1762                 return (1);
1763         return (0);
1764 }
1765 
1766 static rctl_ops_t zone_semmni_ops = {
1767         rcop_no_action,
1768         zone_semmni_usage,
1769         rcop_no_set,
1770         zone_semmni_test
1771 };
1772 
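/*
 * zone.max-msg-ids resource control support.
 */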
1773 /*ARGSUSED*/
1774 static rctl_qty_t
1775 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1776 {
1777         ASSERT(MUTEX_HELD(&p->p_lock));
1778         return (p->p_zone->zone_ipc.ipcq_msgmni);
1779 }
1780 
1781 /*ARGSUSED*/
1782 static int
1783 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1784     rctl_qty_t incr, uint_t flags)
1785 {
1786         rctl_qty_t v;
1787         ASSERT(MUTEX_HELD(&p->p_lock));
1788         ASSERT(e->rcep_t == RCENTITY_ZONE);
1789         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1790         if (v > rval->rcv_value)
1791                 return (1);
1792         return (0);
1793 }
1794 
1795 static rctl_ops_t zone_msgmni_ops = {
1796         rcop_no_action,
1797         zone_msgmni_usage,
1798         rcop_no_set,
1799         zone_msgmni_test
1800 };
1801 
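/*
 * zone.max-locked-memory resource control support.
 */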
1802 /*ARGSUSED*/
1803 static rctl_qty_t
1804 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1805 {
1806         rctl_qty_t q;
1807         ASSERT(MUTEX_HELD(&p->p_lock));
1808         mutex_enter(&p->p_zone->zone_mem_lock);
1809         q = p->p_zone->zone_locked_mem;
1810         mutex_exit(&p->p_zone->zone_mem_lock);
1811         return (q);
1812 }
1813 
1814 /*ARGSUSED*/
1815 static int
1816 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1817     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1818 {
1819         rctl_qty_t q;
1820         zone_t *z;
1821 
1822         z = e->rcep_p.zone;
1823         ASSERT(MUTEX_HELD(&p->p_lock));
1824         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1825         q = z->zone_locked_mem;
1826         if (q + incr > rcntl->rcv_value)
1827                 return (1);
1828         return (0);
1829 }
1830 
1831 /*ARGSUSED*/
1832 static int
1833 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1834     rctl_qty_t nv)
1835 {
1836         ASSERT(MUTEX_HELD(&p->p_lock));
1837         ASSERT(e->rcep_t == RCENTITY_ZONE);
1838         if (e->rcep_p.zone == NULL)
1839                 return (0);
1840         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1841         return (0);
1842 }
1843 
1844 static rctl_ops_t zone_locked_mem_ops = {
1845         rcop_no_action,
1846         zone_locked_mem_usage,
1847         zone_locked_mem_set,
1848         zone_locked_mem_test
1849 };
1850 
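/*
 * zone.max-swap resource control support.
 */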
1851 /*ARGSUSED*/
1852 static rctl_qty_t
1853 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1854 {
1855         rctl_qty_t q;
1856         zone_t *z = p->p_zone;
1857 
1858         ASSERT(MUTEX_HELD(&p->p_lock));
1859         mutex_enter(&z->zone_mem_lock);
1860         q = z->zone_max_swap;
1861         mutex_exit(&z->zone_mem_lock);
1862         return (q);
1863 }
1864 
1865 /*ARGSUSED*/
1866 static int
1867 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1868     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1869 {
1870         rctl_qty_t q;
1871         zone_t *z;
1872 
1873         z = e->rcep_p.zone;
1874         ASSERT(MUTEX_HELD(&p->p_lock));
1875         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1876         q = z->zone_max_swap;
1877         if (q + incr > rcntl->rcv_value)
1878                 return (1);
1879         return (0);
1880 }
1881 
1882 /*ARGSUSED*/
1883 static int
1884 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1885     rctl_qty_t nv)
1886 {
1887         ASSERT(MUTEX_HELD(&p->p_lock));
1888         ASSERT(e->rcep_t == RCENTITY_ZONE);
1889         if (e->rcep_p.zone == NULL)
1890                 return (0);
1891         e->rcep_p.zone->zone_max_swap_ctl = nv;
1892         return (0);
1893 }
1894 
1895 static rctl_ops_t zone_max_swap_ops = {
1896         rcop_no_action,
1897         zone_max_swap_usage,
1898         zone_max_swap_set,
1899         zone_max_swap_test
1900 };
1901 
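/*
 * zone.max-physical-memory resource control support.
 */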
1902 /*ARGSUSED*/
1903 static rctl_qty_t
1904 zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1905 {
1906         rctl_qty_t q;
1907         zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
1908 
1909         ASSERT(MUTEX_HELD(&p->p_lock));
1910         q = ptob(zp->zpers_pg_cnt);
1911         return (q);
1912 }
1913 
1914 /*ARGSUSED*/
1915 static int
1916 zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1917     rctl_qty_t nv)
1918 {
1919         zoneid_t zid;
1920         uint_t pg_val;
1921 
1922         ASSERT(MUTEX_HELD(&p->p_lock));
1923         ASSERT(e->rcep_t == RCENTITY_ZONE);
1924         if (e->rcep_p.zone == NULL)
1925                 return (0);
1926         zid = e->rcep_p.zone->zone_id;
1927         if (nv == UINT64_MAX) {
1928                 pg_val = UINT32_MAX;
1929         } else {
1930                 uint64_t pages = btop(nv);
1931 
1932                 /*
1933                  * Return from RCTLOP_SET is always ignored so just clamp an
1934                  * out-of-range value to our largest "limited" value.
1935                  */
1936                 if (pages >= UINT32_MAX) {
1937                         pg_val = UINT32_MAX - 1;
1938                 } else {
1939                         pg_val = (uint_t)pages;
1940                 }
1941         }
1942         zone_pdata[zid].zpers_pg_limit = pg_val;
1943         return (0);
1944 }
1945 
1946 static rctl_ops_t zone_phys_mem_ops = {
1947         rcop_no_action,
1948         zone_phys_mem_usage,
1949         zone_phys_mem_set,
1950         rcop_no_test
1951 };
1952 
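/*
 * zone.max-lofi resource control support.
 */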
1953 /*ARGSUSED*/
1954 static rctl_qty_t
1955 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1956 {
1957         rctl_qty_t q;
1958         zone_t *z = p->p_zone;
1959 
1960         ASSERT(MUTEX_HELD(&p->p_lock));
1961         mutex_enter(&z->zone_rctl_lock);
1962         q = z->zone_max_lofi;
1963         mutex_exit(&z->zone_rctl_lock);
1964         return (q);
1965 }
1966 
1967 /*ARGSUSED*/
1968 static int
1969 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1970     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1971 {
1972         rctl_qty_t q;
1973         zone_t *z;
1974 
1975         z = e->rcep_p.zone;
1976         ASSERT(MUTEX_HELD(&p->p_lock));
1977         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1978         q = z->zone_max_lofi;
1979         if (q + incr > rcntl->rcv_value)
1980                 return (1);
1981         return (0);
1982 }
1983 
1984 /*ARGSUSED*/
1985 static int
1986 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1987     rctl_qty_t nv)
1988 {
1989         ASSERT(MUTEX_HELD(&p->p_lock));
1990         ASSERT(e->rcep_t == RCENTITY_ZONE);
1991         if (e->rcep_p.zone == NULL)
1992                 return (0);
1993         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1994         return (0);
1995 }
1996 
1997 static rctl_ops_t zone_max_lofi_ops = {
1998         rcop_no_action,
1999         zone_max_lofi_usage,
2000         zone_max_lofi_set,
2001         zone_max_lofi_test
2002 };
2003 
2004 /*
2005  * Helper function to brand the zone with a unique ID.
2006  */
2007 static void
2008 zone_uniqid(zone_t *zone)
2009 {
2010         static uint64_t uniqid = 0;
2011 
2012         ASSERT(MUTEX_HELD(&zonehash_lock));
2013         zone->zone_uniqid = uniqid++;
2014 }
2015 
2016 /*
2017  * Returns a held pointer to the "kcred" for the specified zone.
2018  */
2019 struct cred *
2020 zone_get_kcred(zoneid_t zoneid)
2021 {
2022         zone_t *zone;
2023         cred_t *cr;
2024 
2025         if ((zone = zone_find_by_id(zoneid)) == NULL)
2026                 return (NULL);
2027         cr = zone->zone_kcred;
2028         crhold(cr);
2029         zone_rele(zone);
2030         return (cr);
2031 }
2032 
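/*
 * kstat update callbacks for the zone rctl kstats created by
 * zone_rctl_kstat_create_common() below; each reports the zone's current
 * usage and control value for the rctl, and rejects writes.
 */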
2033 static int
2034 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
2035 {
2036         zone_t *zone = ksp->ks_private;
2037         zone_kstat_t *zk = ksp->ks_data;
2038 
2039         if (rw == KSTAT_WRITE)
2040                 return (EACCES);
2041 
2042         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
2043         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
2044         return (0);
2045 }
2046 
2047 static int
2048 zone_physmem_kstat_update(kstat_t *ksp, int rw)
2049 {
2050         zone_t *zone = ksp->ks_private;
2051         zone_kstat_t *zk = ksp->ks_data;
2052         zone_persist_t *zp = &zone_pdata[zone->zone_id];
2053 
2054         if (rw == KSTAT_WRITE)
2055                 return (EACCES);
2056 
2057         zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
2058         zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
2059         return (0);
2060 }
2061 
2062 static int
2063 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
2064 {
2065         zone_t *zone = ksp->ks_private;
2066         zone_kstat_t *zk = ksp->ks_data;
2067 
2068         if (rw == KSTAT_WRITE)
2069                 return (EACCES);
2070 
2071         zk->zk_usage.value.ui64 = zone->zone_nprocs;
2072         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
2073         return (0);
2074 }
2075 
2076 static int
2077 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
2078 {
2079         zone_t *zone = ksp->ks_private;
2080         zone_kstat_t *zk = ksp->ks_data;
2081 
2082         if (rw == KSTAT_WRITE)
2083                 return (EACCES);
2084 
2085         zk->zk_usage.value.ui64 = zone->zone_max_swap;
2086         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
2087         return (0);
2088 }
2089 
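/*
 * Create and install a named kstat for one of the zone rctls, exporting
 * the zone name plus "usage" and "value" entries filled in by updatefunc.
 */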
2090 static kstat_t *
2091 zone_rctl_kstat_create_common(zone_t *zone, char *name,
2092     int (*updatefunc) (kstat_t *, int))
2093 {
2094         kstat_t *ksp;
2095         zone_kstat_t *zk;
2096 
2097         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
2098             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
2099             KSTAT_FLAG_VIRTUAL);
2100 
2101         if (ksp == NULL)
2102                 return (NULL);
2103 
2104         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
2105         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2106         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
2107         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
2108         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
2109         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
2110         ksp->ks_update = updatefunc;
2111         ksp->ks_private = zone;
2112         kstat_install(ksp);
2113         return (ksp);
2114 }
2115 
2116 static int
2117 zone_vfs_kstat_update(kstat_t *ksp, int rw)
2118 {
2119         zone_t *zone = ksp->ks_private;
2120         zone_vfs_kstat_t *zvp = ksp->ks_data;
2121         kstat_io_t *kiop = &zone->zone_vfs_rwstats;
2122 
2123         if (rw == KSTAT_WRITE)
2124                 return (EACCES);
2125 
2126         /*
2127          * Extract the VFS statistics from the kstat_io_t structure used by
2128          * kstat_runq_enter() and related functions.  Since the slow ops
2129          * counters are updated directly by the VFS layer, there's no need to
2130          * copy those statistics here.
2131          *
2132          * Note that kstat_runq_enter() and the related functions use
2133          * gethrtime_unscaled(), so scale the time here.
2134          */
2135         zvp->zv_nread.value.ui64 = kiop->nread;
2136         zvp->zv_reads.value.ui64 = kiop->reads;
2137         zvp->zv_rtime.value.ui64 = kiop->rtime;
2138         zvp->zv_rcnt.value.ui64 = kiop->rcnt;
2139         zvp->zv_rlentime.value.ui64 = kiop->rlentime;
2140         zvp->zv_nwritten.value.ui64 = kiop->nwritten;
2141         zvp->zv_writes.value.ui64 = kiop->writes;
2142         zvp->zv_wtime.value.ui64 = kiop->wtime;
2143         zvp->zv_wcnt.value.ui64 = kiop->wcnt;
2144         zvp->zv_wlentime.value.ui64 = kiop->wlentime;
2145 
2146         scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
2147         scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
2148         scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
2149         scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
2150 
2151         return (0);
2152 }
2153 
2154 static kstat_t *
2155 zone_vfs_kstat_create(zone_t *zone)
2156 {
2157         kstat_t *ksp;
2158         zone_vfs_kstat_t *zvp;
2159 
2160         if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
2161             zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
2162             sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
2163             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2164                 return (NULL);
2165 
2166         if (zone->zone_id != GLOBAL_ZONEID)
2167                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2168 
2169         zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
2170         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2171         ksp->ks_lock = &zone->zone_vfs_lock;
2172         zone->zone_vfs_stats = zvp;
2173 
2174         /* The kstat "name" field is not large enough for a full zonename */
2175         kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
2176         kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
2177         kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
2178         kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
2179         kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
2180         kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
2181         kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
2182         kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
2183         kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
2184         kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
2185         kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
2186         kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
2187         kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
2188         kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
2189         kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
2190         kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
2191         kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
2192         kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
2193 
2194         ksp->ks_update = zone_vfs_kstat_update;
2195         ksp->ks_private = zone;
2196 
2197         kstat_install(ksp);
2198         return (ksp);
2199 }
2200 
2201 static int
2202 zone_zfs_kstat_update(kstat_t *ksp, int rw)
2203 {
2204         zone_t *zone = ksp->ks_private;
2205         zone_zfs_kstat_t *zzp = ksp->ks_data;
2206         zone_persist_t *zp = &zone_pdata[zone->zone_id];
2207 
2208         if (rw == KSTAT_WRITE)
2209                 return (EACCES);
2210 
2211         mutex_enter(&zp->zpers_zfs_lock);
2212         if (zp->zpers_zfsp == NULL) {
2213                 zzp->zz_nread.value.ui64 = 0;
2214                 zzp->zz_reads.value.ui64 = 0;
2215                 zzp->zz_rtime.value.ui64 = 0;
2216                 zzp->zz_rlentime.value.ui64 = 0;
2217                 zzp->zz_nwritten.value.ui64 = 0;
2218                 zzp->zz_writes.value.ui64 = 0;
2219                 zzp->zz_waittime.value.ui64 = 0;
2220         } else {
2221                 kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
2222 
2223                 /*
2224                  * Extract the ZFS statistics from the kstat_io_t structure
2225                  * used by kstat_runq_enter() and related functions. Since the
2226                  * I/O throttle counters are updated directly by the ZFS layer,
2227                  * there's no need to copy those statistics here.
2228                  *
2229                  * Note that kstat_runq_enter() and the related functions use
2230                  * gethrtime_unscaled(), so scale the time here.
2231                  */
2232                 zzp->zz_nread.value.ui64 = kiop->nread;
2233                 zzp->zz_reads.value.ui64 = kiop->reads;
2234                 zzp->zz_rtime.value.ui64 = kiop->rtime;
2235                 zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2236                 zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2237                 zzp->zz_writes.value.ui64 = kiop->writes;
2238                 zzp->zz_waittime.value.ui64 =
2239                     zp->zpers_zfsp->zpers_zfs_rd_waittime;
2240         }
2241         mutex_exit(&zp->zpers_zfs_lock);
2242 
2243         scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2244         scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2245 
2246         return (0);
2247 }
2248 
2249 static kstat_t *
2250 zone_zfs_kstat_create(zone_t *zone)
2251 {
2252         kstat_t *ksp;
2253         zone_zfs_kstat_t *zzp;
2254 
2255         if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
2256             zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
2257             sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
2258             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2259                 return (NULL);
2260 
2261         if (zone->zone_id != GLOBAL_ZONEID)
2262                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2263 
2264         zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
2265         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2266         ksp->ks_lock = &zone->zone_zfs_lock;
2267         zone->zone_zfs_stats = zzp;
2268 
2269         /* The kstat "name" field is not large enough for a full zonename */
2270         kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
2271         kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
2272         kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
2273         kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
2274         kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
2275         kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
2276         kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
2277         kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
2278         kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
2279 
2280         ksp->ks_update = zone_zfs_kstat_update;
2281         ksp->ks_private = zone;
2282 
2283         kstat_install(ksp);
2284         return (ksp);
2285 }
2286 
2287 static int
2288 zone_mcap_kstat_update(kstat_t *ksp, int rw)
2289 {
2290         zone_t *zone = ksp->ks_private;
2291         zone_mcap_kstat_t *zmp = ksp->ks_data;
2292         zone_persist_t *zp;
2293 
2294         if (rw == KSTAT_WRITE)
2295                 return (EACCES);
2296 
2297         zp = &zone_pdata[zone->zone_id];
2298 
2299         zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
2300         zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
2301         zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2302         zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2303         zmp->zm_nover.value.ui64 = zp->zpers_nover;
2304 #ifndef DEBUG
2305         zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
2306 #else
2307         zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
2308             zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
2309 #endif
2310         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2311         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2312         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2313         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2314         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2315 
2316         return (0);
2317 }
2318 
2319 static kstat_t *
2320 zone_mcap_kstat_create(zone_t *zone)
2321 {
2322         kstat_t *ksp;
2323         zone_mcap_kstat_t *zmp;
2324 
2325         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2326             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2327             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2328             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2329                 return (NULL);
2330 
2331         if (zone->zone_id != GLOBAL_ZONEID)
2332                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2333 
2334         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2335         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2336         ksp->ks_lock = &zone->zone_mcap_lock;
2337         zone->zone_mcap_stats = zmp;
2338 
2339         /* The kstat "name" field is not large enough for a full zonename */
2340         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2341         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2342         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2343         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2344         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2345         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2346         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2347         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2348         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2349         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2350         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2351         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2352         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2353             KSTAT_DATA_UINT64);
2354 
2355         ksp->ks_update = zone_mcap_kstat_update;
2356         ksp->ks_private = zone;
2357 
2358         kstat_install(ksp);
2359         return (ksp);
2360 }
2361 
2362 static int
2363 zone_misc_kstat_update(kstat_t *ksp, int rw)
2364 {
2365         zone_t *zone = ksp->ks_private;
2366         zone_misc_kstat_t *zmp = ksp->ks_data;
2367         hrtime_t hrtime;
2368         uint64_t tmp;
2369 
2370         if (rw == KSTAT_WRITE)
2371                 return (EACCES);
2372 
2373         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
2374         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
2375         scalehrtime(&hrtime);
2376         zmp->zm_stime.value.ui64 = hrtime;
2377 
2378         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
2379         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
2380         scalehrtime(&hrtime);
2381         zmp->zm_utime.value.ui64 = hrtime;
2382 
2383         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
2384         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
2385         scalehrtime(&hrtime);
2386         zmp->zm_wtime.value.ui64 = hrtime;
2387 
2388         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2389         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2390         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2391 
2392         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2393         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2394         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2395         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2396 
2397         zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
2398 
2399         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2400 
2401         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2402         zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts;
2403         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2404 
2405         return (0);
2406 }
2407 
2408 static kstat_t *
2409 zone_misc_kstat_create(zone_t *zone)
2410 {
2411         kstat_t *ksp;
2412         zone_misc_kstat_t *zmp;
2413 
2414         if ((ksp = kstat_create_zone("zones", zone->zone_id,
2415             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2416             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2417             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2418                 return (NULL);
2419 
2420         if (zone->zone_id != GLOBAL_ZONEID)
2421                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2422 
2423         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2424         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2425         ksp->ks_lock = &zone->zone_misc_lock;
2426         zone->zone_misc_stats = zmp;
2427 
2428         /* The kstat "name" field is not large enough for a full zonename */
2429         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2430         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2431         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2432         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2433         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2434         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2435         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2436         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2437             KSTAT_DATA_UINT32);
2438         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2439         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2440             KSTAT_DATA_UINT32);
2441         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2442         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2443         kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
2444             KSTAT_DATA_UINT32);
2445         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2446             KSTAT_DATA_UINT32);
2447         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2448         kstat_named_init(&zmp->zm_init_restarts, "init_restarts",
2449             KSTAT_DATA_UINT32);
2450         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2451 
2452         ksp->ks_update = zone_misc_kstat_update;
2453         ksp->ks_private = zone;
2454 
2455         kstat_install(ksp);
2456         return (ksp);
2457 }
2458 
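/*
 * Create all of the zone's kstats.  If a given kstat cannot be created,
 * fall back to a zeroed private buffer so the corresponding statistics
 * can still be maintained, just not externally observed.
 */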
2459 static void
2460 zone_kstat_create(zone_t *zone)
2461 {
2462         zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
2463             "lockedmem", zone_lockedmem_kstat_update);
2464         zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
2465             "swapresv", zone_swapresv_kstat_update);
2466         zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
2467             "physicalmem", zone_physmem_kstat_update);
2468         zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
2469             "nprocs", zone_nprocs_kstat_update);
2470 
2471         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2472                 zone->zone_vfs_stats = kmem_zalloc(
2473                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
2474         }
2475 
2476         if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
2477                 zone->zone_zfs_stats = kmem_zalloc(
2478                     sizeof (zone_zfs_kstat_t), KM_SLEEP);
2479         }
2480 
2481         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2482                 zone->zone_mcap_stats = kmem_zalloc(
2483                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2484         }
2485 
2486         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2487                 zone->zone_misc_stats = kmem_zalloc(
2488                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2489         }
2490 }
2491 
2492 static void
2493 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2494 {
2495         void *data;
2496 
2497         if (*pkstat != NULL) {
2498                 data = (*pkstat)->ks_data;
2499                 kstat_delete(*pkstat);
2500                 kmem_free(data, datasz);
2501                 *pkstat = NULL;
2502         }
2503 }
2504 
2505 static void
2506 zone_kstat_delete(zone_t *zone)
2507 {
2508         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2509             sizeof (zone_kstat_t));
2510         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2511             sizeof (zone_kstat_t));
2512         zone_kstat_delete_common(&zone->zone_physmem_kstat,
2513             sizeof (zone_kstat_t));
2514         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2515             sizeof (zone_kstat_t));
2516 
2517         zone_kstat_delete_common(&zone->zone_vfs_ksp,
2518             sizeof (zone_vfs_kstat_t));
2519         zone_kstat_delete_common(&zone->zone_zfs_ksp,
2520             sizeof (zone_zfs_kstat_t));
2521         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2522             sizeof (zone_mcap_kstat_t));
2523         zone_kstat_delete_common(&zone->zone_misc_ksp,
2524             sizeof (zone_misc_kstat_t));
2525 }
2526 
2527 /*
2528  * Called very early on in boot to initialize the ZSD list so that
2529  * zone_key_create() can be called before zone_init().  It also initializes
2530  * portions of zone0 which may be used before zone_init() is called.  The
2531  * variable "global_zone" will be set when zone0 is fully initialized by
2532  * zone_init().
2533  */
2534 void
2535 zone_zsd_init(void)
2536 {
2537         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2538         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2539         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2540             offsetof(struct zsd_entry, zsd_linkage));
2541         list_create(&zone_active, sizeof (zone_t),
2542             offsetof(zone_t, zone_linkage));
2543         list_create(&zone_deathrow, sizeof (zone_t),
2544             offsetof(zone_t, zone_linkage));
2545 
2546         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2547         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2548         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2549         zone0.zone_shares = 1;
2550         zone0.zone_nlwps = 0;
2551         zone0.zone_nlwps_ctl = INT_MAX;
2552         zone0.zone_nprocs = 0;
2553         zone0.zone_nprocs_ctl = INT_MAX;
2554         zone0.zone_locked_mem = 0;
2555         zone0.zone_locked_mem_ctl = UINT64_MAX;
2556         ASSERT(zone0.zone_max_swap == 0);
2557         zone0.zone_max_swap_ctl = UINT64_MAX;
2558         zone0.zone_max_lofi = 0;
2559         zone0.zone_max_lofi_ctl = UINT64_MAX;
2560         zone0.zone_shmmax = 0;
2561         zone0.zone_ipc.ipcq_shmmni = 0;
2562         zone0.zone_ipc.ipcq_semmni = 0;
2563         zone0.zone_ipc.ipcq_msgmni = 0;
2564         zone0.zone_name = GLOBAL_ZONENAME;
2565         zone0.zone_nodename = utsname.nodename;
2566         zone0.zone_domain = srpc_domain;
2567         zone0.zone_hostid = HW_INVALID_HOSTID;
2568         zone0.zone_fs_allowed = NULL;
2569         psecflags_default(&zone0.zone_secflags);
2570         zone0.zone_ref = 1;
2571         zone0.zone_id = GLOBAL_ZONEID;
2572         zone0.zone_status = ZONE_IS_RUNNING;
2573         zone0.zone_rootpath = "/";
2574         zone0.zone_rootpathlen = 2;
2575         zone0.zone_psetid = ZONE_PS_INVAL;
2576         zone0.zone_ncpus = 0;
2577         zone0.zone_ncpus_online = 0;
2578         zone0.zone_proc_initpid = 1;
2579         zone0.zone_initname = initname;
2580         zone0.zone_lockedmem_kstat = NULL;
2581         zone0.zone_swapresv_kstat = NULL;
2582         zone0.zone_physmem_kstat = NULL;
2583         zone0.zone_nprocs_kstat = NULL;
2584 
2585         zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
2586         zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
2587 
2588         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2589             offsetof(zone_ref_t, zref_linkage));
2590         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2591             offsetof(struct zsd_entry, zsd_linkage));
2592         list_insert_head(&zone_active, &zone0);
2593 
2594         /*
2595          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2596          * to anything meaningful.  It is assigned to be 'rootdir' in
2597          * vfs_mountroot().
2598          */
2599         zone0.zone_rootvp = NULL;
2600         zone0.zone_vfslist = NULL;
2601         zone0.zone_bootargs = initargs;
2602         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2603         /*
2604          * The global zone has all privileges
2605          */
2606         priv_fillset(zone0.zone_privset);
2607         /*
2608          * Add p0 to the global zone
2609          */
2610         zone0.zone_zsched = &p0;
2611         p0.p_zone = &zone0;
2612 }
2613 
2614 /*
2615  * Compute a hash value based on the contents of the label and the DOI.  The
2616  * hash algorithm is somewhat arbitrary, but is based on the observation that
2617  * humans will likely pick labels that differ by amounts that work out to be
2618  * multiples of the number of hash chains, and thus stirring in some primes
2619  * should help.
2620  */
2621 static uint_t
2622 hash_bylabel(void *hdata, mod_hash_key_t key)
2623 {
2624         const ts_label_t *lab = (ts_label_t *)key;
2625         const uint32_t *up, *ue;
2626         uint_t hash;
2627         int i;
2628 
2629         _NOTE(ARGUNUSED(hdata));
2630 
2631         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2632         /* we depend on alignment of label, but not representation */
2633         up = (const uint32_t *)&lab->tsl_label;
2634         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2635         i = 1;
2636         while (up < ue) {
                /*
                 * Use 2^n + 1, 1 <= n <= 16, as a source of many primes;
                 * it is prime for n = 1, 2, 4, 8, 16 (3, 5, 17, 257, 65537).
                 */
2638                 hash += *up + (*up << ((i % 16) + 1));
2639                 up++;
2640                 i++;
2641         }
2642         return (hash);
2643 }
2644 
2645 /*
2646  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2647  * equal).  This may need to be changed if less than / greater than is ever
2648  * needed.
2649  */
2650 static int
2651 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2652 {
2653         ts_label_t *lab1 = (ts_label_t *)key1;
2654         ts_label_t *lab2 = (ts_label_t *)key2;
2655 
2656         return (label_equal(lab1, lab2) ? 0 : 1);
2657 }
2658 
2659 /*
2660  * Called by main() to initialize the zones framework.
2661  */
2662 void
2663 zone_init(void)
2664 {
2665         rctl_dict_entry_t *rde;
2666         rctl_val_t *dval;
2667         rctl_set_t *set;
2668         rctl_alloc_gp_t *gp;
2669         rctl_entity_p_t e;
2670         int res;
2671 
2672         ASSERT(curproc == &p0);
2673 
2674         /*
2675          * Create ID space for zone IDs.  ID 0 is reserved for the
2676          * global zone.
2677          */
2678         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2679 
2680         /*
2681          * Initialize generic zone resource controls, if any.
2682          */
2683         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2684             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2685             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2686             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2687 
2688         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2689             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
            RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2691             RCTL_GLOBAL_INFINITE,
2692             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2693 
2694         rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
2695             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2696             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2697             MAXCAP, MAXCAP, &zone_cpu_base_ops);
2698 
2699         rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
2700             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2701             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2702             INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
2703 
2704         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2705             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2706             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2707             16384, 16384, &zone_zfs_io_pri_ops);
2708 
2709         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2710             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2711             INT_MAX, INT_MAX, &zone_lwps_ops);
2712 
2713         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2714             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2715             INT_MAX, INT_MAX, &zone_procs_ops);
2716 
2717         /*
2718          * System V IPC resource controls
2719          */
2720         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2721             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2722             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2723 
2724         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2725             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2726             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2727 
2728         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2729             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2730             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2731 
2732         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2733             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2734             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2735 
2736         /*
2737          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2738          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2739          */
2740         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2741         bzero(dval, sizeof (rctl_val_t));
2742         dval->rcv_value = 1;
2743         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2744         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2745         dval->rcv_action_recip_pid = -1;
2746 
2747         rde = rctl_dict_lookup("zone.cpu-shares");
2748         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2749 
2750         /*
2751          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
2753          */
2754         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2755         bzero(dval, sizeof (rctl_val_t));
2756         dval->rcv_value = 1;
2757         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2758         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2759         dval->rcv_action_recip_pid = -1;
2760 
2761         rde = rctl_dict_lookup("zone.zfs-io-priority");
2762         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2763 
2764         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2765             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2766             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2767             &zone_locked_mem_ops);
2768 
2769         rc_zone_max_swap = rctl_register("zone.max-swap",
2770             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2771             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2772             &zone_max_swap_ops);
2773 
2774         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2775             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2776             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2777             &zone_phys_mem_ops);
2778 
2779         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2780             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2781             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2782             &zone_max_lofi_ops);
2783 
2784         /*
2785          * Initialize the ``global zone''.
2786          */
2787         set = rctl_set_create();
2788         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2789         mutex_enter(&p0.p_lock);
2790         e.rcep_p.zone = &zone0;
2791         e.rcep_t = RCENTITY_ZONE;
2792         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2793             gp);
2794 
2795         zone0.zone_nlwps = p0.p_lwpcnt;
2796         zone0.zone_nprocs = 1;
2797         zone0.zone_ntasks = 1;
2798         mutex_exit(&p0.p_lock);
2799         zone0.zone_restart_init = B_TRUE;
2800         zone0.zone_reboot_on_init_exit = B_FALSE;
2801         zone0.zone_restart_init_0 = B_FALSE;
2802         zone0.zone_init_status = -1;
2803         zone0.zone_brand = &native_brand;
2804         rctl_prealloc_destroy(gp);
2805         /*
2806          * pool_default hasn't been initialized yet, so we let pool_init()
2807          * take care of making sure the global zone is in the default pool.
2808          */
2809 
2810         /*
2811          * Initialize global zone kstats
2812          */
2813         zone_kstat_create(&zone0);
2814 
2815         /*
2816          * Initialize zone label.
2817          * mlp are initialized when tnzonecfg is loaded.
2818          */
2819         zone0.zone_slabel = l_admin_low;
2820         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2821         label_hold(l_admin_low);
2822 
2823         /*
2824          * Initialise the lock for the database structure used by mntfs.
2825          */
2826         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2827 
2828         zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2829 
2830         mutex_enter(&zonehash_lock);
2831         zone_uniqid(&zone0);
2832         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2833 
2834         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2835             mod_hash_null_valdtor);
2836         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2837             zone_hash_size, mod_hash_null_valdtor);
2838         /*
2839          * maintain zonehashbylabel only for labeled systems
2840          */
2841         if (is_system_labeled())
2842                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2843                     zone_hash_size, mod_hash_null_keydtor,
2844                     mod_hash_null_valdtor, hash_bylabel, NULL,
2845                     hash_labelkey_cmp, KM_SLEEP);
2846         zonecount = 1;
2847 
2848         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2849             (mod_hash_val_t)&zone0);
2850         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2851             (mod_hash_val_t)&zone0);
2852         if (is_system_labeled()) {
2853                 zone0.zone_flags |= ZF_HASHED_LABEL;
2854                 (void) mod_hash_insert(zonehashbylabel,
2855                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2856         }
2857         mutex_exit(&zonehash_lock);
2858 
2859         /*
2860          * We avoid setting zone_kcred until now, since kcred is initialized
2861          * sometime after zone_zsd_init() and before zone_init().
2862          */
2863         zone0.zone_kcred = kcred;
2864         /*
2865          * The global zone is fully initialized (except for zone_rootvp which
2866          * will be set when the root filesystem is mounted).
2867          */
2868         global_zone = &zone0;
2869 
2870         /*
2871          * Set up an event channel for sending zone status change notifications
2872          */
2873         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2874             EVCH_CREAT);
2875 
2876         if (res)
2877                 panic("sysevent_evc_bind failed during zone setup.\n");
2878 
2879 }
2880 
2881 static void
2882 zone_free(zone_t *zone)
2883 {
2884         zone_dl_t *zdl;
2885 
2886         ASSERT(zone != global_zone);
2887         ASSERT(zone->zone_ntasks == 0);
2888         ASSERT(zone->zone_nlwps == 0);
2889         ASSERT(zone->zone_nprocs == 0);
2890         ASSERT(zone->zone_cred_ref == 0);
2891         ASSERT(zone->zone_kcred == NULL);
2892         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2893             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2894         ASSERT(list_is_empty(&zone->zone_ref_list));
2895 
2896         /*
2897          * Remove any zone caps.
2898          */
2899         cpucaps_zone_remove(zone);
2900 
2901         /* Clear physical memory capping data. */
2902         bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
2903 
2904         ASSERT(zone->zone_cpucap == NULL);
2905 
2906         /* remove from deathrow list */
2907         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2908                 ASSERT(zone->zone_ref == 0);
2909                 mutex_enter(&zone_deathrow_lock);
2910                 list_remove(&zone_deathrow, zone);
2911                 mutex_exit(&zone_deathrow_lock);
2912         }
2913 
2914         list_destroy(&zone->zone_ref_list);
2915         zone_free_zsd(zone);
2916         zone_free_datasets(zone);
2917 
2918         /*
2919          * While dlmgmtd should have removed all of these, it could have left
2920          * something behind or crashed.  In that case it's not safe for us to
2921          * assume that the list is empty, which list_destroy() will ASSERT.  We
2922          * clean up for our userland comrades which may have crashed, or worse,
2923          * been disabled by SMF.
2924          */
2925         while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2926                 if (zdl->zdl_net != NULL)
2927                         nvlist_free(zdl->zdl_net);
2928                 kmem_free(zdl, sizeof (zone_dl_t));
2929         }
2930         list_destroy(&zone->zone_dl_list);
2931 
2932         /*
2933          * This zone_t can no longer inhibit creation of another zone_t
2934          * with the same name or debug ID.  Generate a sysevent so that
2935          * userspace tools know it is safe to carry on.
2936          */
2937         mutex_enter(&zone_status_lock);
2938         zone_status_set(zone, ZONE_IS_FREE);
2939         mutex_exit(&zone_status_lock);
2940 
2941         cpu_uarray_free(zone->zone_ustate);
2942 
2943         if (zone->zone_rootvp != NULL) {
2944                 vnode_t *vp = zone->zone_rootvp;
2945 
2946                 mutex_enter(&vp->v_lock);
2947                 vp->v_flag &= ~VZONEROOT;
2948                 mutex_exit(&vp->v_lock);
2949                 VN_RELE(vp);
2950                 /* No need to worry about NULL-ing out zone_rootvp. */
2951         }
2952         if (zone->zone_rootpath)
2953                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2954         if (zone->zone_name != NULL)
2955                 kmem_free(zone->zone_name, ZONENAME_MAX);
2956         if (zone->zone_slabel != NULL)
2957                 label_rele(zone->zone_slabel);
2958         if (zone->zone_nodename != NULL)
2959                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2960         if (zone->zone_domain != NULL)
2961                 kmem_free(zone->zone_domain, _SYS_NMLN);
2962         if (zone->zone_privset != NULL)
2963                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2964         if (zone->zone_rctls != NULL)
2965                 rctl_set_free(zone->zone_rctls);
2966         if (zone->zone_bootargs != NULL)
2967                 strfree(zone->zone_bootargs);
2968         if (zone->zone_initname != NULL)
2969                 strfree(zone->zone_initname);
2970         if (zone->zone_fs_allowed != NULL)
2971                 strfree(zone->zone_fs_allowed);
2972         if (zone->zone_pfexecd != NULL)
2973                 klpd_freelist(&zone->zone_pfexecd);
2974         id_free(zoneid_space, zone->zone_id);
2975         mutex_destroy(&zone->zone_lock);
2976         cv_destroy(&zone->zone_cv);
2977         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2978         rw_destroy(&zone->zone_mntfs_db_lock);
2979         kmem_free(zone, sizeof (zone_t));
2980 }
2981 
2982 /*
2983  * See block comment at the top of this file for information about zone
2984  * status values.
2985  */
2986 /*
2987  * Convenience function for setting zone status.
2988  */
2989 static void
2990 zone_status_set(zone_t *zone, zone_status_t status)
2991 {
2992         timestruc_t now;
2993         uint64_t t;
2994 
2995         nvlist_t *nvl = NULL;
2996         ASSERT(MUTEX_HELD(&zone_status_lock));
2997         ASSERT(((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE) ||
2998             status == ZONE_IS_FREE) && status >= zone_status_get(zone));
2999 
3000         /* Current time since Jan 1 1970, in nanoseconds as consumers expect */
3001         gethrestime(&now);
3002         t = (now.tv_sec * NANOSEC) + now.tv_nsec;
3003 
3004         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
3005             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
3006             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
3007             zone_status_table[status]) ||
3008             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
3009             zone_status_table[zone->zone_status]) ||
3010             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
3011             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
3012             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
3013             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
3014 #ifdef DEBUG
3015                 (void) printf(
3016                     "Failed to allocate and send zone state change event.\n");
3017 #else
3018                 /* EMPTY */
3019 #endif
3020         }
3021         nvlist_free(nvl);
3022 
3023         zone->zone_status = status;
3024 
3025         cv_broadcast(&zone->zone_cv);
3026 }
3027 
3028 /*
3029  * Public function to retrieve the zone status.  The zone status may
3030  * change after it is retrieved.
3031  */
3032 zone_status_t
3033 zone_status_get(zone_t *zone)
3034 {
3035         return (zone->zone_status);
3036 }
3037 
3038 /*
3039  * Publish a zones-related sysevent for purposes other than zone state changes.
3040  * While it is unfortunate that zone_event_chan is associated with
3041  * "com.sun:zones:status" (rather than "com.sun:zones"), state changes should
3042  * be the only ones with class "status" and subclass "change".
3043  */
3044 void
3045 zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass,
3046     nvlist_t *ev_nvl)
3047 {
3048         nvlist_t *nvl = NULL;
3049         timestruc_t now;
3050         uint64_t t;
3051 
3052         gethrestime(&now);
3053         t = (now.tv_sec * NANOSEC) + now.tv_nsec;
3054 
3055         if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 ||
3056             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 ||
3057             nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 ||
3058             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 ||
3059             sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com",
3060             "kernel", nvl, EVCH_SLEEP) != 0) {
3061 #ifdef DEBUG
3062                 (void) printf("Failed to allocate and send zone misc event.\n");
3063 #else
3064                 /* EMPTY */
3065 #endif
3066         }
3067         nvlist_free(nvl);
3068 }
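
/*
 * A minimal usage sketch: the "config"/"change" strings and the "reason"
 * payload are hypothetical, while the name, zoneid and timestamp pairs are
 * added by zone_sysevent_publish() itself:
 *
 *	nvlist_t *nvl;
 *
 *	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nvl, "reason", "capped") == 0);
 *	zone_sysevent_publish(zone, "config", "change", nvl);
 *	nvlist_free(nvl);
 */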
3069 
3070 static int
3071 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
3072 {
3073         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
3074         int err = 0;
3075 
3076         ASSERT(zone != global_zone);
3077         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
3078                 goto done;      /* EFAULT or ENAMETOOLONG */
3079 
3080         if (zone->zone_bootargs != NULL)
3081                 strfree(zone->zone_bootargs);
3082 
3083         zone->zone_bootargs = strdup(buf);
3084 
3085 done:
3086         kmem_free(buf, BOOTARGS_MAX);
3087         return (err);
3088 }
3089 
3090 static int
3091 zone_set_brand(zone_t *zone, const char *brand)
3092 {
3093         struct brand_attr *attrp;
3094         brand_t *bp;
3095 
3096         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
3097         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
3098                 kmem_free(attrp, sizeof (struct brand_attr));
3099                 return (EFAULT);
3100         }
3101 
3102         bp = brand_register_zone(attrp);
3103         kmem_free(attrp, sizeof (struct brand_attr));
3104         if (bp == NULL)
3105                 return (EINVAL);
3106 
3107         /*
3108          * This is the only place where a zone can change its brand.
3109          * We already need to hold zone_status_lock to check the zone
3110          * status, so we'll just use that lock to serialize zone
3111          * branding requests as well.
3112          */
3113         mutex_enter(&zone_status_lock);
3114 
3115         /* Re-branding is not allowed, and the zone must not have booted yet */
3116         if ((ZONE_IS_BRANDED(zone)) ||
3117             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
3118                 mutex_exit(&zone_status_lock);
3119                 brand_unregister_zone(bp);
3120                 return (EINVAL);
3121         }
3122 
3123         /*
3124          * Set up the brand specific data.
3125          * Note that it's possible that the hook has to drop the
3126          * zone_status_lock and reacquire it before returning, so we can't
3127          * assume the lock has been held the entire time.
3128          */
3129         zone->zone_brand = bp;
3130         ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
3131 
3132         mutex_exit(&zone_status_lock);
3133         return (0);
3134 }
3135 
3136 static int
3137 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
3138 {
3139         int err = 0;
3140         psecflags_t psf;
3141 
3142         ASSERT(zone != global_zone);
3143 
3144         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
3145                 return (err);
3146 
3147         if (zone_status_get(zone) > ZONE_IS_READY)
3148                 return (EINVAL);
3149 
3150         if (!psecflags_validate(&psf))
3151                 return (EINVAL);
3152 
3153         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
3154 
3155         /* Set security flags on the zone's zsched */
3156         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
3157             sizeof (zone->zone_zsched->p_secflags));
3158 
3159         return (0);
3160 }
3161 
3162 static int
3163 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
3164 {
3165         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
3166         int err = 0;
3167 
3168         ASSERT(zone != global_zone);
3169         if ((err = copyinstr(zone_fs_allowed, buf,
3170             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
3171                 goto done;
3172 
3173         if (zone->zone_fs_allowed != NULL)
3174                 strfree(zone->zone_fs_allowed);
3175 
3176         zone->zone_fs_allowed = strdup(buf);
3177 
3178 done:
3179         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
3180         return (err);
3181 }
3182 
3183 static int
3184 zone_set_initname(zone_t *zone, const char *zone_initname)
3185 {
3186         char initname[INITNAME_SZ];
3187         size_t len;
3188         int err = 0;
3189 
3190         ASSERT(zone != global_zone);
3191         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
3192                 return (err);   /* EFAULT or ENAMETOOLONG */
3193 
3194         if (zone->zone_initname != NULL)
3195                 strfree(zone->zone_initname);
3196 
3197         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
3198         (void) strcpy(zone->zone_initname, initname);
3199         return (0);
3200 }
3201 
3202 static int
3203 zone_set_sched_class(zone_t *zone, const char *new_class)
3204 {
3205         char sched_class[PC_CLNMSZ];
3206         id_t classid;
3207         int err;
3208 
3209         ASSERT(zone != global_zone);
3210         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
3211                 return (err);   /* EFAULT or ENAMETOOLONG */
3212 
3213         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
3214                 return (set_errno(EINVAL));
3215         zone->zone_defaultcid = classid;
3216         ASSERT(zone->zone_defaultcid > 0 &&
3217             zone->zone_defaultcid < loaded_classes);
3218 
3219         return (0);
3220 }
3221 
3222 /*
3223  * Block indefinitely waiting for (zone_status >= status)
3224  */
3225 void
3226 zone_status_wait(zone_t *zone, zone_status_t status)
3227 {
3228         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3229 
3230         mutex_enter(&zone_status_lock);
3231         while (zone->zone_status < status) {
3232                 cv_wait(&zone->zone_cv, &zone_status_lock);
3233         }
3234         mutex_exit(&zone_status_lock);
3235 }
3236 
3237 /*
3238  * Private CPR-safe version of zone_status_wait().
3239  */
3240 static void
3241 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
3242 {
3243         callb_cpr_t cprinfo;
3244 
3245         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3246 
3247         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
3248             str);
3249         mutex_enter(&zone_status_lock);
3250         while (zone->zone_status < status) {
3251                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
3252                 cv_wait(&zone->zone_cv, &zone_status_lock);
3253                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
3254         }
3255         /*
3256          * zone_status_lock is implicitly released by the following.
3257          */
3258         CALLB_CPR_EXIT(&cprinfo);
3259 }
3260 
3261 /*
3262  * Block until zone enters requested state or signal is received.  Return (0)
3263  * if signaled, non-zero otherwise.
3264  */
3265 int
3266 zone_status_wait_sig(zone_t *zone, zone_status_t status)
3267 {
3268         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3269 
3270         mutex_enter(&zone_status_lock);
3271         while (zone->zone_status < status) {
3272                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
3273                         mutex_exit(&zone_status_lock);
3274                         return (0);
3275                 }
3276         }
3277         mutex_exit(&zone_status_lock);
3278         return (1);
3279 }
3280 
3281 /*
3282  * Block until the zone enters the requested state or the timeout expires,
3283  * whichever happens first.  Return (-1) if operation timed out, time remaining
3284  * otherwise.
3285  */
3286 clock_t
3287 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
3288 {
3289         clock_t timeleft = 0;
3290 
3291         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3292 
3293         mutex_enter(&zone_status_lock);
3294         while (zone->zone_status < status && timeleft != -1) {
3295                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
3296         }
3297         mutex_exit(&zone_status_lock);
3298         return (timeleft);
3299 }
3300 
3301 /*
3302  * Block until the zone enters the requested state, the current process is
3303  * signaled, or the timeout expires, whichever happens first.  Return (-1) if
3304  * operation timed out, 0 if signaled, time remaining otherwise.
3305  */
3306 clock_t
3307 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
3308 {
3309         clock_t timeleft = tim - ddi_get_lbolt();
3310 
3311         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3312 
3313         mutex_enter(&zone_status_lock);
3314         while (zone->zone_status < status) {
3315                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
3316                     tim);
3317                 if (timeleft <= 0)
3318                         break;
3319         }
3320         mutex_exit(&zone_status_lock);
3321         return (timeleft);
3322 }
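
/*
 * Typical use of the wait family, as a sketch: hold the zone so it can't be
 * freed, block for the desired state, then release.  Note that the 'tim'
 * argument to the timedwait variants is an absolute lbolt value, not a
 * delta.
 *
 *	zone_hold(zone);
 *	zone_status_wait(zone, ZONE_IS_RUNNING);
 *	if (zone_status_wait_sig(zone, ZONE_IS_DOWN) == 0)
 *		...interrupted by a signal...
 *	zone_rele(zone);
 */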
3323 
3324 /*
3325  * Zones have two reference counts: one for references from credential
3326  * structures (zone_cred_ref), and one (zone_ref) for everything else.
3327  * This is so we can allow a zone to be rebooted while there are still
3328  * outstanding cred references, since certain drivers cache dblks (which
3329  * implicitly results in cached creds).  We wait for zone_ref to drop to
3330  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
3331  * later freed when the zone_cred_ref drops to 0, though nothing other
3332  * than the zone id and privilege set should be accessed once the zone
3333  * is "dead".
3334  *
3335  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
3336  * to force halt/reboot to block waiting for the zone_cred_ref to drop
3337  * to 0.  This can be useful to flush out other sources of cached creds
3338  * that may be less innocuous than the driver case.
3339  *
3340  * Zones also provide a tracked reference counting mechanism in which zone
3341  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
3342  * debuggers determine the sources of leaked zone references.  See
3343  * zone_hold_ref() and zone_rele_ref() below for more information.
3344  */
3345 
3346 int zone_wait_for_cred = 0;
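
/*
 * Illustrative sketch (not part of this file's logic) of how a subsystem
 * takes a tracked reference; ZONE_REF_NFS stands in for the caller's own
 * zone_ref_subsys_t constant:
 *
 *	zone_ref_t ref;
 *
 *	zone_init_ref(&ref);
 *	zone_hold_ref(zone, &ref, ZONE_REF_NFS);
 *	...use the zone...
 *	zone_rele_ref(&ref, ZONE_REF_NFS);
 */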
3347 
3348 static void
3349 zone_hold_locked(zone_t *z)
3350 {
3351         ASSERT(MUTEX_HELD(&z->zone_lock));
3352         z->zone_ref++;
3353         ASSERT(z->zone_ref != 0);
3354 }
3355 
3356 /*
3357  * Increment the specified zone's reference count.  The zone's zone_t structure
3358  * will not be freed as long as the zone's reference count is nonzero.
3359  * Decrement the zone's reference count via zone_rele().
3360  *
3361  * NOTE: This function should only be used to hold zones for short periods of
3362  * time.  Use zone_hold_ref() if the zone must be held for a long time.
3363  */
3364 void
3365 zone_hold(zone_t *z)
3366 {
3367         mutex_enter(&z->zone_lock);
3368         zone_hold_locked(z);
3369         mutex_exit(&z->zone_lock);
3370 }
3371 
3372 /*
3373  * If the non-cred ref count drops to 1 and either the cred ref count
3374  * is 0 or we aren't waiting for cred references, the zone is ready to
3375  * be destroyed.
3376  */
3377 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
3378             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
3379 
3380 /*
3381  * Common zone reference release function invoked by zone_rele() and
3382  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
3383  * zone's subsystem-specific reference counters are not affected by the
3384  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
3385  * removed from the specified zone's reference list.  ref must be non-NULL iff
3386  * subsys is not ZONE_REF_NUM_SUBSYS.
3387  */
3388 static void
3389 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3390 {
3391         boolean_t wakeup;
3392 
3393         mutex_enter(&z->zone_lock);
3394         ASSERT(z->zone_ref != 0);
3395         z->zone_ref--;
3396         if (subsys != ZONE_REF_NUM_SUBSYS) {
3397                 ASSERT(z->zone_subsys_ref[subsys] != 0);
3398                 z->zone_subsys_ref[subsys]--;
3399                 list_remove(&z->zone_ref_list, ref);
3400         }
3401         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3402                 /* no more refs, free the structure */
3403                 mutex_exit(&z->zone_lock);
3404                 zone_free(z);
3405                 return;
3406         }
3407         /* signal zone_destroy so the zone can finish halting */
3408         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
3409         mutex_exit(&z->zone_lock);
3410 
3411         if (wakeup) {
3412                 /*
3413                  * Grabbing zonehash_lock here effectively synchronizes with
3414                  * zone_destroy() to avoid missed signals.
3415                  */
3416                 mutex_enter(&zonehash_lock);
3417                 cv_broadcast(&zone_destroy_cv);
3418                 mutex_exit(&zonehash_lock);
3419         }
3420 }
3421 
3422 /*
3423  * Decrement the specified zone's reference count.  The specified zone will
3424  * cease to exist after this function returns if the reference count drops to
3425  * zero.  This function should be paired with zone_hold().
3426  */
3427 void
3428 zone_rele(zone_t *z)
3429 {
3430         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
3431 }
3432 
3433 /*
3434  * Initialize a zone reference structure.  This function must be invoked for
3435  * a reference structure before the structure is passed to zone_hold_ref().
3436  */
3437 void
3438 zone_init_ref(zone_ref_t *ref)
3439 {
3440         ref->zref_zone = NULL;
3441         list_link_init(&ref->zref_linkage);
3442 }
3443 
3444 /*
3445  * Acquire a reference to zone z.  The caller must specify the
3446  * zone_ref_subsys_t constant associated with its subsystem.  The specified
3447  * zone_ref_t structure will represent a reference to the specified zone.  Use
3448  * zone_rele_ref() to release the reference.
3449  *
3450  * The referenced zone_t structure will not be freed as long as the zone_t's
3451  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
3452  * references.
3453  *
3454  * NOTE: The zone_ref_t structure must be initialized before it is used.
3455  * See zone_init_ref() above.
3456  */
3457 void
3458 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3459 {
3460         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
3461 
3462         /*
3463          * Prevent consumers from reusing a reference structure before
3464          * releasing it.
3465          */
3466         VERIFY(ref->zref_zone == NULL);
3467 
3468         ref->zref_zone = z;
3469         mutex_enter(&z->zone_lock);
3470         zone_hold_locked(z);
3471         z->zone_subsys_ref[subsys]++;
3472         ASSERT(z->zone_subsys_ref[subsys] != 0);
3473         list_insert_head(&z->zone_ref_list, ref);
3474         mutex_exit(&z->zone_lock);
3475 }
3476 
3477 /*
3478  * Release the zone reference represented by the specified zone_ref_t.
3479  * The reference is invalid after it's released; however, the zone_ref_t
3480  * structure can be reused without having to invoke zone_init_ref().
3481  * subsys should be the same value that was passed to zone_hold_ref()
3482  * when the reference was acquired.
3483  */
3484 void
3485 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
3486 {
3487         zone_rele_common(ref->zref_zone, ref, subsys);
3488 
3489         /*
3490          * Set the zone_ref_t's zref_zone field to NULL to generate panics
3491          * when consumers dereference the reference.  This helps us catch
3492          * consumers who use released references.  Furthermore, this lets
3493          * consumers reuse the zone_ref_t structure without having to
3494          * invoke zone_init_ref().
3495          */
3496         ref->zref_zone = NULL;
3497 }
3498 
3499 void
3500 zone_cred_hold(zone_t *z)
3501 {
3502         mutex_enter(&z->zone_lock);
3503         z->zone_cred_ref++;
3504         ASSERT(z->zone_cred_ref != 0);
3505         mutex_exit(&z->zone_lock);
3506 }
3507 
3508 void
3509 zone_cred_rele(zone_t *z)
3510 {
3511         boolean_t wakeup;
3512 
3513         mutex_enter(&z->zone_lock);
3514         ASSERT(z->zone_cred_ref != 0);
3515         z->zone_cred_ref--;
3516         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3517                 /* no more refs, free the structure */
3518                 mutex_exit(&z->zone_lock);
3519                 zone_free(z);
3520                 return;
3521         }
3522         /*
3523          * If zone_destroy is waiting for the cred references to drain
3524          * out, and they have, signal it.
3525          */
3526         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
3527             zone_status_get(z) >= ZONE_IS_DEAD);
3528         mutex_exit(&z->zone_lock);
3529 
3530         if (wakeup) {
3531                 /*
3532                  * Grabbing zonehash_lock here effectively synchronizes with
3533                  * zone_destroy() to avoid missed signals.
3534                  */
3535                 mutex_enter(&zonehash_lock);
3536                 cv_broadcast(&zone_destroy_cv);
3537                 mutex_exit(&zonehash_lock);
3538         }
3539 }
3540 
3541 void
3542 zone_task_hold(zone_t *z)
3543 {
3544         mutex_enter(&z->zone_lock);
3545         z->zone_ntasks++;
3546         ASSERT(z->zone_ntasks != 0);
3547         mutex_exit(&z->zone_lock);
3548 }
3549 
3550 void
3551 zone_task_rele(zone_t *zone)
3552 {
3553         uint_t refcnt;
3554 
3555         mutex_enter(&zone->zone_lock);
3556         ASSERT(zone->zone_ntasks != 0);
3557         refcnt = --zone->zone_ntasks;
3558         if (refcnt > 1) {       /* Common case */
3559                 mutex_exit(&zone->zone_lock);
3560                 return;
3561         }
3562         zone_hold_locked(zone); /* so we can use the zone_t later */
3563         mutex_exit(&zone->zone_lock);
3564         if (refcnt == 1) {
3565                 /*
3566                  * See if the zone is shutting down.
3567                  */
3568                 mutex_enter(&zone_status_lock);
3569                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
3570                         goto out;
3571                 }
3572 
3573                 /*
3574                  * Make sure the ntasks didn't change since we
3575                  * dropped zone_lock.
3576                  */
3577                 mutex_enter(&zone->zone_lock);
3578                 if (refcnt != zone->zone_ntasks) {
3579                         mutex_exit(&zone->zone_lock);
3580                         goto out;
3581                 }
3582                 mutex_exit(&zone->zone_lock);
3583 
3584                 /*
3585                  * No more user processes in the zone.  The zone is empty.
3586                  */
3587                 zone_status_set(zone, ZONE_IS_EMPTY);
3588                 goto out;
3589         }
3590 
3591         ASSERT(refcnt == 0);
3592         /*
3593          * zsched has exited; the zone is dead.
3594          */
3595         zone->zone_zsched = NULL;            /* paranoia */
3596         mutex_enter(&zone_status_lock);
3597         zone_status_set(zone, ZONE_IS_DEAD);
3598 out:
3599         mutex_exit(&zone_status_lock);
3600         zone_rele(zone);
3601 }
3602 
3603 zoneid_t
3604 getzoneid(void)
3605 {
3606         return (curproc->p_zone->zone_id);
3607 }
3608 
3609 zoneid_t
3610 getzonedid(void)
3611 {
3612         return (curproc->p_zone->zone_did);
3613 }
3614 
3615 /*
3616  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3617  * check the validity of a zone's state.
3618  */
3619 static zone_t *
3620 zone_find_all_by_id(zoneid_t zoneid)
3621 {
3622         mod_hash_val_t hv;
3623         zone_t *zone = NULL;
3624 
3625         ASSERT(MUTEX_HELD(&zonehash_lock));
3626 
3627         if (mod_hash_find(zonehashbyid,
3628             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3629                 zone = (zone_t *)hv;
3630         return (zone);
3631 }
3632 
3633 static zone_t *
3634 zone_find_all_by_label(const ts_label_t *label)
3635 {
3636         mod_hash_val_t hv;
3637         zone_t *zone = NULL;
3638 
3639         ASSERT(MUTEX_HELD(&zonehash_lock));
3640 
3641         /*
3642          * zonehashbylabel is not maintained for unlabeled systems
3643          */
3644         if (!is_system_labeled())
3645                 return (NULL);
3646         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3647                 zone = (zone_t *)hv;
3648         return (zone);
3649 }
3650 
3651 static zone_t *
3652 zone_find_all_by_name(char *name)
3653 {
3654         mod_hash_val_t hv;
3655         zone_t *zone = NULL;
3656 
3657         ASSERT(MUTEX_HELD(&zonehash_lock));
3658 
3659         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3660                 zone = (zone_t *)hv;
3661         return (zone);
3662 }
3663 
3664 /*
3665  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3666  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3667  * Caller must call zone_rele() once it is done with the zone.
3668  *
3669  * The zone may begin the zone_destroy() sequence immediately after this
3670  * function returns, but may be safely used until zone_rele() is called.
3671  */
3672 zone_t *
3673 zone_find_by_id(zoneid_t zoneid)
3674 {
3675         zone_t *zone;
3676         zone_status_t status;
3677 
3678         mutex_enter(&zonehash_lock);
3679         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3680                 mutex_exit(&zonehash_lock);
3681                 return (NULL);
3682         }
3683         status = zone_status_get(zone);
3684         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3685                 /*
3686                  * For all practical purposes the zone doesn't exist.
3687                  */
3688                 mutex_exit(&zonehash_lock);
3689                 return (NULL);
3690         }
3691         zone_hold(zone);
3692         mutex_exit(&zonehash_lock);
3693         return (zone);
3694 }
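
/*
 * Sketch of the lookup/hold/release pattern shared by all of the
 * zone_find_by_*() consumers (the ESRCH choice here is the caller's):
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) == NULL)
 *		return (ESRCH);
 *	...use zone; it can't be freed until the release below...
 *	zone_rele(zone);
 */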
3695 
3696 /*
3697  * Similar to zone_find_by_id, but using zone label as the key.
3698  */
3699 zone_t *
3700 zone_find_by_label(const ts_label_t *label)
3701 {
3702         zone_t *zone;
3703         zone_status_t status;
3704 
3705         mutex_enter(&zonehash_lock);
3706         if ((zone = zone_find_all_by_label(label)) == NULL) {
3707                 mutex_exit(&zonehash_lock);
3708                 return (NULL);
3709         }
3710 
3711         status = zone_status_get(zone);
3712         if (status > ZONE_IS_DOWN) {
3713                 /*
3714                  * For all practical purposes the zone doesn't exist.
3715                  */
3716                 mutex_exit(&zonehash_lock);
3717                 return (NULL);
3718         }
3719         zone_hold(zone);
3720         mutex_exit(&zonehash_lock);
3721         return (zone);
3722 }
3723 
3724 /*
3725  * Similar to zone_find_by_id, but using zone name as the key.
3726  */
3727 zone_t *
3728 zone_find_by_name(char *name)
3729 {
3730         zone_t *zone;
3731         zone_status_t status;
3732 
3733         mutex_enter(&zonehash_lock);
3734         if ((zone = zone_find_all_by_name(name)) == NULL) {
3735                 mutex_exit(&zonehash_lock);
3736                 return (NULL);
3737         }
3738         status = zone_status_get(zone);
3739         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3740                 /*
3741                  * For all practical purposes the zone doesn't exist.
3742                  */
3743                 mutex_exit(&zonehash_lock);
3744                 return (NULL);
3745         }
3746         zone_hold(zone);
3747         mutex_exit(&zonehash_lock);
3748         return (zone);
3749 }
3750 
3751 /*
3752  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3753  * if there is a zone "foo" rooted at /foo/root, and the path argument
3754  * is "/foo/root/proc", it will return the held zone_t corresponding to
3755  * zone "foo".
3756  *
3757  * zone_find_by_path() always returns a non-NULL value, since at the
3758  * very least every path will be contained in the global zone.
3759  *
3760  * As with the other zone_find_by_*() functions, the caller is
3761  * responsible for zone_rele()ing the return value of this function.
3762  */
3763 zone_t *
3764 zone_find_by_path(const char *path)
3765 {
3766         zone_t *zone;
3767         zone_t *zret = NULL;
3768         zone_status_t status;
3769 
3770         if (path == NULL) {
3771                 /*
3772                  * Call from rootconf().
3773                  */
3774                 zone_hold(global_zone);
3775                 return (global_zone);
3776         }
3777         ASSERT(*path == '/');
3778         mutex_enter(&zonehash_lock);
3779         for (zone = list_head(&zone_active); zone != NULL;
3780             zone = list_next(&zone_active, zone)) {
3781                 if (ZONE_PATH_VISIBLE(path, zone))
3782                         zret = zone;
3783         }
3784         ASSERT(zret != NULL);
3785         status = zone_status_get(zret);
3786         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3787                 /*
3788                  * Zone practically doesn't exist.
3789                  */
3790                 zret = global_zone;
3791         }
3792         zone_hold(zret);
3793         mutex_exit(&zonehash_lock);
3794         return (zret);
3795 }
3796 
3797 /*
3798  * Public interface for updating per-zone load averages.  Called once per
3799  * second.
3800  *
3801  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3802  */
3803 void
3804 zone_loadavg_update(void)
3805 {
3806         zone_t *zp;
3807         zone_status_t status;
3808         struct loadavg_s *lavg;
3809         hrtime_t zone_total;
3810         uint64_t tmp;
3811         int i;
3812         hrtime_t hr_avg;
3813         int nrun;
3814         static int64_t f[3] = { 135, 27, 9 };
3815         int64_t q, r;
3816 
3817         mutex_enter(&zonehash_lock);
3818         for (zp = list_head(&zone_active); zp != NULL;
3819             zp = list_next(&zone_active, zp)) {
3820                 mutex_enter(&zp->zone_lock);
3821 
3822                 /* Skip zones that are on the way down or not yet up */
3823                 status = zone_status_get(zp);
3824                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3825                         /* For all practical purposes the zone doesn't exist. */
3826                         mutex_exit(&zp->zone_lock);
3827                         continue;
3828                 }
3829 
3830                 /*
3831                  * Update the 10 second moving average data in zone_loadavg.
3832                  */
3833                 lavg = &zp->zone_loadavg;
3834 
3835                 tmp = cpu_uarray_sum_all(zp->zone_ustate);
3836                 zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3837 
3838                 scalehrtime(&zone_total);
3839 
3840                 /* The zone_total should always be increasing. */
3841                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3842                     zone_total - lavg->lg_total : 0;
3843                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3844                 /* lg_total holds the prev. 1 sec. total */
3845                 lavg->lg_total = zone_total;
3846 
3847                 /*
3848          * To simplify the calculation, we don't calculate the load average
3849                  * until the zone has been up for at least 10 seconds and our
3850                  * moving average is thus full.
3851                  */
3852                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3853                         lavg->lg_len++;
3854                         mutex_exit(&zp->zone_lock);
3855                         continue;
3856                 }
3857 
3858                 /* Now calculate the 1min, 5min and 15min load averages. */
3859                 hr_avg = 0;
3860                 for (i = 0; i < S_LOADAVG_SZ; i++)
3861                         hr_avg += lavg->lg_loads[i];
3862                 hr_avg = hr_avg / S_LOADAVG_SZ;
3863                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3864 
3865                 /* Compute load avg. See comment in calcloadavg() */
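                /*
                 * A sketch of the fixed-point update below: q + r/2^16
                 * reconstructs zone_hp_avenrun[i] / 2^9, so each pass is
                 * approximately
                 *
                 *	avenrun += (f[i] / 16) * (nrun - avenrun / 512)
                 *
                 * i.e. exponential decay toward 512 * nrun with per-second
                 * coefficient f[i] / 8192, giving time constants of roughly
                 * 61s, 303s and 910s for the 1, 5 and 15 minute averages.
                 */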
3866                 for (i = 0; i < 3; i++) {
3867                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3868                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3869                         zp->zone_hp_avenrun[i] +=
3870                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3871 
3872                         /* avenrun[] can only hold 31 bits of load avg. */
3873                         if (zp->zone_hp_avenrun[i] <
3874                             ((uint64_t)1<<(31+16-FSHIFT)))
3875                                 zp->zone_avenrun[i] = (int32_t)
3876                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3877                         else
3878                                 zp->zone_avenrun[i] = 0x7fffffff;
3879                 }
3880 
3881                 mutex_exit(&zp->zone_lock);
3882         }
3883         mutex_exit(&zonehash_lock);
3884 }
3885 
3886 /*
3887  * Get the number of cpus visible to this zone.  The system-wide global
3888  * 'ncpus' is returned if pools are disabled, the caller is in the
3889  * global zone, or a NULL zone argument is passed in.
3890  */
3891 int
3892 zone_ncpus_get(zone_t *zone)
3893 {
3894         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3895 
3896         return (myncpus != 0 ? myncpus : ncpus);
3897 }
3898 
3899 /*
3900  * Get the number of online cpus visible to this zone.  The system-wide
3901  * global 'ncpus_online' is returned if pools are disabled, the caller
3902  * is in the global zone, or a NULL zone argument is passed in.
3903  */
3904 int
3905 zone_ncpus_online_get(zone_t *zone)
3906 {
3907         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3908 
3909         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3910 }
3911 
3912 /*
3913  * Return the pool to which the zone is currently bound.
3914  */
3915 pool_t *
3916 zone_pool_get(zone_t *zone)
3917 {
3918         ASSERT(pool_lock_held());
3919 
3920         return (zone->zone_pool);
3921 }
3922 
3923 /*
3924  * Set the zone's pool pointer and update the zone's visibility to match
3925  * the resources in the new pool.
3926  */
3927 void
3928 zone_pool_set(zone_t *zone, pool_t *pool)
3929 {
3930         ASSERT(pool_lock_held());
3931         ASSERT(MUTEX_HELD(&cpu_lock));
3932 
3933         zone->zone_pool = pool;
3934         zone_pset_set(zone, pool->pool_pset->pset_id);
3935 }
3936 
3937 /*
3938  * Return the cached value of the id of the processor set to which the
3939  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3940  * facility is disabled.
3941  */
3942 psetid_t
3943 zone_pset_get(zone_t *zone)
3944 {
3945         ASSERT(MUTEX_HELD(&cpu_lock));
3946 
3947         return (zone->zone_psetid);
3948 }
3949 
3950 /*
3951  * Set the cached value of the id of the processor set to which the zone
3952  * is currently bound.  Also update the zone's visibility to match the
3953  * resources in the new processor set.
3954  */
3955 void
3956 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3957 {
3958         psetid_t oldpsetid;
3959 
3960         ASSERT(MUTEX_HELD(&cpu_lock));
3961         oldpsetid = zone_pset_get(zone);
3962 
3963         if (oldpsetid == newpsetid)
3964                 return;
3965         /*
3966          * Global zone sees all.
3967          */
3968         if (zone != global_zone) {
3969                 zone->zone_psetid = newpsetid;
3970                 if (newpsetid != ZONE_PS_INVAL)
3971                         pool_pset_visibility_add(newpsetid, zone);
3972                 if (oldpsetid != ZONE_PS_INVAL)
3973                         pool_pset_visibility_remove(oldpsetid, zone);
3974         }
3975         /*
3976          * Disabling pools, so we should start using the global values
3977          * for ncpus and ncpus_online.
3978          */
3979         if (newpsetid == ZONE_PS_INVAL) {
3980                 zone->zone_ncpus = 0;
3981                 zone->zone_ncpus_online = 0;
3982         }
3983 }
3984 
3985 /*
3986  * Walk the list of active zones and issue the provided callback for
3987  * each of them.
3988  *
3989  * Caller must not be holding any locks that may be acquired under
3990  * zonehash_lock.  See comment at the beginning of the file for a list of
3991  * common locks and their interactions with zones.
3992  */
3993 int
3994 zone_walk(int (*cb)(zone_t *, void *), void *data)
3995 {
3996         zone_t *zone;
3997         int ret = 0;
3998         zone_status_t status;
3999 
4000         mutex_enter(&zonehash_lock);
4001         for (zone = list_head(&zone_active); zone != NULL;
4002             zone = list_next(&zone_active, zone)) {
4003                 /*
4004                  * Skip zones that shouldn't be externally visible.
4005                  */
4006                 status = zone_status_get(zone);
4007                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
4008                         continue;
4009                 /*
4010                  * Bail immediately if any callback invocation returns a
4011                  * non-zero value.
4012                  */
4013                 ret = (*cb)(zone, data);
4014                 if (ret != 0)
4015                         break;
4016         }
4017         mutex_exit(&zonehash_lock);
4018         return (ret);
4019 }
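
/*
 * Example callback shape for zone_walk(); zone_count_cb and its counter
 * argument are hypothetical.  Returning non-zero stops the walk early:
 *
 *	static int
 *	zone_count_cb(zone_t *zp, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t n = 0;
 *	(void) zone_walk(zone_count_cb, &n);
 */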
4020 
4021 static int
4022 zone_set_root(zone_t *zone, const char *upath)
4023 {
4024         vnode_t *vp;
4025         int trycount;
4026         int error = 0;
4027         char *path;
4028         struct pathname upn, pn;
4029         size_t pathlen;
4030 
4031         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
4032                 return (error);
4033 
4034         pn_alloc(&pn);
4035 
4036         /* prevent infinite loop */
4037         trycount = 10;
4038         for (;;) {
4039                 if (--trycount <= 0) {
4040                         error = ESTALE;
4041                         goto out;
4042                 }
4043 
4044                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
4045                         /*
4046                          * VOP_ACCESS() may cover 'vp' with a new
4047                          * filesystem, if 'vp' is an autoFS vnode.
4048                          * Get the new 'vp' if so.
4049                          */
4050                         if ((error =
4051                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
4052                             (!vn_ismntpt(vp) ||
4053                             (error = traverse(&vp)) == 0)) {
4054                                 pathlen = pn.pn_pathlen + 2;
4055                                 path = kmem_alloc(pathlen, KM_SLEEP);
4056                                 (void) strncpy(path, pn.pn_path,
4057                                     pn.pn_pathlen + 1);
4058                                 path[pathlen - 2] = '/';
4059                                 path[pathlen - 1] = '\0';
4060                                 pn_free(&pn);
4061                                 pn_free(&upn);
4062 
4063                                 /* Success! */
4064                                 break;
4065                         }
4066                         VN_RELE(vp);
4067                 }
4068                 if (error != ESTALE)
4069                         goto out;
4070         }
4071 
4072         ASSERT(error == 0);
4073         mutex_enter(&vp->v_lock);
4074         if (vp->v_flag & VZONEROOT) {
4075                 /* Wow, someone's already using this zone root! */
4076                 error = EEXIST; /* XXX KEBE ASKS, better errno? */
4077                 mutex_exit(&vp->v_lock);
4078                 VN_RELE(vp);
4079                 goto out;
4080         }
4081         vp->v_flag |= VZONEROOT;
4082         mutex_exit(&vp->v_lock);
4083         zone->zone_rootvp = vp;              /* we hold a reference to vp */
4084         zone->zone_rootpath = path;
4085         zone->zone_rootpathlen = pathlen;
4086         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
4087                 zone->zone_flags |= ZF_IS_SCRATCH;
4088         return (0);
4089 
4090 out:
4091         pn_free(&pn);
4092         pn_free(&upn);
4093         return (error);
4094 }
4095 
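/* A local ASCII-only isalnum(); zone names are validated bytewise below. */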
4096 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
4097                         ((c) >= 'a' && (c) <= 'z') || \
4098                         ((c) >= 'A' && (c) <= 'Z'))
4099 
4100 static int
4101 zone_set_name(zone_t *zone, const char *uname)
4102 {
4103         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
4104         size_t len;
4105         int i, err;
4106 
4107         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
4108                 kmem_free(kname, ZONENAME_MAX);
4109                 return (err);   /* EFAULT or ENAMETOOLONG */
4110         }
4111 
4112         /* must be less than ZONENAME_MAX */
4113         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
4114                 kmem_free(kname, ZONENAME_MAX);
4115                 return (EINVAL);
4116         }
4117 
4118         /*
4119          * Name must start with an alphanumeric and must contain only
4120          * alphanumerics, '-', '_' and '.'.
4121          */
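        /* E.g. "web01" and "db-1.test" pass; "-zone" and "zone!" do not. */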
4122         if (!isalnum(kname[0])) {
4123                 kmem_free(kname, ZONENAME_MAX);
4124                 return (EINVAL);
4125         }
4126         for (i = 1; i < len - 1; i++) {
4127                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
4128                     kname[i] != '.') {
4129                         kmem_free(kname, ZONENAME_MAX);
4130                         return (EINVAL);
4131                 }
4132         }
4133 
4134         zone->zone_name = kname;
4135         return (0);
4136 }
4137 
4138 /*
4139  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
4140  * is NULL or it points to a zone with no hostid emulation, then the machine's
4141  * hostid (i.e., the global zone's hostid) is returned.  This function returns
4142  * zero if neither the zone nor the host machine (global zone) has a hostid.  It
4143  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
4144  * hostid and the machine's hostid is invalid.
4145  */
4146 uint32_t
4147 zone_get_hostid(zone_t *zonep)
4148 {
4149         unsigned long machine_hostid;
4150 
4151         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
4152                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
4153                         return (HW_INVALID_HOSTID);
4154                 return ((uint32_t)machine_hostid);
4155         }
4156         return (zonep->zone_hostid);
4157 }
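
/*
 * E.g. a caller fetching the hostid its own zone should observe (sketch):
 *
 *	uint32_t hostid = zone_get_hostid(curproc->p_zone);
 */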
4158 
4159 /*
4160  * Similar to thread_create(), but makes sure the thread is in the appropriate
4161  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
4162  */
4163 /*ARGSUSED*/
4164 kthread_t *
4165 zthread_create(
4166     caddr_t stk,
4167     size_t stksize,
4168     void (*proc)(),
4169     void *arg,
4170     size_t len,
4171     pri_t pri)
4172 {
4173         kthread_t *t;
4174         zone_t *zone = curproc->p_zone;
4175         proc_t *pp = zone->zone_zsched;
4176 
4177         zone_hold(zone);        /* Reference to be dropped when thread exits */
4178 
4179         /*
4180          * No one should be trying to create threads if the zone is shutting
4181          * down and there aren't any kernel threads around.  See comment
4182          * in zthread_exit().
4183          */
4184         ASSERT(!(zone->zone_kthreads == NULL &&
4185             zone_status_get(zone) >= ZONE_IS_EMPTY));
4186         /*
4187          * Create a thread, but don't let it run until we've finished setting
4188          * things up.
4189          */
4190         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
4191         ASSERT(t->t_forw == NULL);
4192         mutex_enter(&zone_status_lock);
4193         if (zone->zone_kthreads == NULL) {
4194                 t->t_forw = t->t_back = t;
4195         } else {
4196                 kthread_t *tx = zone->zone_kthreads;
4197 
4198                 t->t_forw = tx;
4199                 t->t_back = tx->t_back;
4200                 tx->t_back->t_forw = t;
4201                 tx->t_back = t;
4202         }
4203         zone->zone_kthreads = t;
4204         mutex_exit(&zone_status_lock);
4205 
4206         mutex_enter(&pp->p_lock);
4207         t->t_proc_flag |= TP_ZTHREAD;
4208         project_rele(t->t_proj);
4209         t->t_proj = project_hold(pp->p_task->tk_proj);
4210 
4211         /*
4212          * Setup complete, let it run.
4213          */
4214         thread_lock(t);
4215         t->t_schedflag |= TS_ALLSTART;
4216         setrun_locked(t);
4217         thread_unlock(t);
4218 
4219         mutex_exit(&pp->p_lock);
4220 
4221         return (t);
4222 }
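
/*
 * Sketch of the expected pairing; zone_worker is hypothetical.  A thread
 * created with zthread_create() must terminate itself with zthread_exit()
 * so the zone's kernel-thread list and the hold taken above are released:
 *
 *	static void
 *	zone_worker(void *arg)
 *	{
 *		...do per-zone work...
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, zone_worker, arg, 0, minclsyspri);
 */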
4223 
4224 /*
4225  * Similar to thread_exit().  Must be called by threads created via
4226  * zthread_create().
4227  */
4228 void
4229 zthread_exit(void)
4230 {
4231         kthread_t *t = curthread;
4232         proc_t *pp = curproc;
4233         zone_t *zone = pp->p_zone;
4234 
4235         mutex_enter(&zone_status_lock);
4236 
4237         /*
4238          * Reparent to p0
4239          */
4240         kpreempt_disable();
4241         mutex_enter(&pp->p_lock);
4242         t->t_proc_flag &= ~TP_ZTHREAD;
4243         t->t_procp = &p0;
4244         hat_thread_exit(t);
4245         mutex_exit(&pp->p_lock);
4246         kpreempt_enable();
4247 
4248         if (t->t_back == t) {
4249                 ASSERT(t->t_forw == t);
4250                 /*
4251                  * If the zone is empty, once the thread count
4252                  * goes to zero no further kernel threads can be
4253                  * created.  This is because if the creator is a process
4254                  * in the zone, then it must have exited before the zone
4255                  * state could be set to ZONE_IS_EMPTY.
4256                  * Otherwise, if the creator is a kernel thread in the
4257                  * zone, the thread count is non-zero.
4258                  *
4259                  * This really means that non-zone kernel threads should
4260                  * not create zone kernel threads.
4261                  */
4262                 zone->zone_kthreads = NULL;
4263                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
4264                         zone_status_set(zone, ZONE_IS_DOWN);
4265                         /*
4266                          * Remove any CPU caps on this zone.
4267                          */
4268                         cpucaps_zone_remove(zone);
4269                 }
4270         } else {
4271                 t->t_forw->t_back = t->t_back;
4272                 t->t_back->t_forw = t->t_forw;
4273                 if (zone->zone_kthreads == t)
4274                         zone->zone_kthreads = t->t_forw;
4275         }
4276         mutex_exit(&zone_status_lock);
4277         zone_rele(zone);
4278         thread_exit();
4279         /* NOTREACHED */
4280 }
4281 
4282 static void
4283 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
4284 {
4285         vnode_t *oldvp;
4286 
4287         /* we're going to hold a reference here to the directory */
4288         VN_HOLD(vp);
4289 
4290         /* update abs cwd/root path; see c2/audit.c */
4291         if (AU_AUDITING())
4292                 audit_chdirec(vp, vpp);
4293 
4294         mutex_enter(&pp->p_lock);
4295         oldvp = *vpp;
4296         *vpp = vp;
4297         mutex_exit(&pp->p_lock);
4298         if (oldvp != NULL)
4299                 VN_RELE(oldvp);
4300 }
4301 
4302 /*
4303  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
4304  */
4305 static int
4306 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
4307 {
4308         nvpair_t *nvp = NULL;
4309         boolean_t priv_set = B_FALSE;
4310         boolean_t limit_set = B_FALSE;
4311         boolean_t action_set = B_FALSE;
4312 
4313         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4314                 const char *name;
4315                 uint64_t ui64;
4316 
4317                 name = nvpair_name(nvp);
4318                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
4319                         return (EINVAL);
4320                 (void) nvpair_value_uint64(nvp, &ui64);
4321                 if (strcmp(name, "privilege") == 0) {
4322                         /*
4323                          * Currently only privileged values are allowed, but
4324                          * this may change in the future.
4325                          */
4326                         if (ui64 != RCPRIV_PRIVILEGED)
4327                                 return (EINVAL);
4328                         rv->rcv_privilege = ui64;
4329                         priv_set = B_TRUE;
4330                 } else if (strcmp(name, "limit") == 0) {
4331                         rv->rcv_value = ui64;
4332                         limit_set = B_TRUE;
4333                 } else if (strcmp(name, "action") == 0) {
4334                         if (ui64 != RCTL_LOCAL_NOACTION &&
4335                             ui64 != RCTL_LOCAL_DENY)
4336                                 return (EINVAL);
4337                         rv->rcv_flagaction = ui64;
4338                         action_set = B_TRUE;
4339                 } else {
4340                         return (EINVAL);
4341                 }
4342         }
4343 
4344         if (!(priv_set && limit_set && action_set))
4345                 return (EINVAL);
4346         rv->rcv_action_signal = 0;
4347         rv->rcv_action_recipient = NULL;
4348         rv->rcv_action_recip_pid = -1;
4349         rv->rcv_firing_time = 0;
4350 
4351         return (0);
4352 }
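
/*
 * The nvlist shape a caller is expected to hand us, as a sketch; per the
 * checks above, all three uint64 pairs are mandatory and the limit value
 * (100 here) is arbitrary:
 *
 *	nvlist_t *nvl;
 *
 *	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvl, "privilege", RCPRIV_PRIVILEGED) == 0);
 *	VERIFY(nvlist_add_uint64(nvl, "limit", 100) == 0);
 *	VERIFY(nvlist_add_uint64(nvl, "action", RCTL_LOCAL_DENY) == 0);
 */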
4353 
4354 /*
4355  * Non-global zone version of start_init.
4356  */
4357 void
4358 zone_start_init(void)
4359 {
4360         proc_t *p = ttoproc(curthread);
4361         zone_t *z = p->p_zone;
4362 
4363         ASSERT(!INGLOBALZONE(curproc));
4364 
4365         /*
4366          * For all purposes (ZONE_ATTR_INITPID and restart_init),
4367          * storing just the pid of init is sufficient.
4368          */
4369         z->zone_proc_initpid = p->p_pid;
4370 
4371         if (z->zone_setup_app_contract == B_TRUE) {
4372                 /*
4373                  * Normally a process cannot modify its own contract, but we're
4374                  * just starting the zone's init process and its contract is
4375                  * always initialized from the sys_process_tmpl template, so
4376          * this is the simplest way to set up init's contract to kill
4377                  * the process if any other process in the contract exits.
4378                  */
4379                 p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
4380         }
4381 
4382         /*
4383          * We maintain zone_boot_err so that we can return the cause of the
4384          * failure back to the caller of the zone_boot syscall.
4385          */
4386         p->p_zone->zone_boot_err = start_init_common();
4387 
4388         /*
4389          * We will prevent booting zones from becoming running zones if the
4390          * global zone is shutting down.
4391          */
4392         mutex_enter(&zone_status_lock);
4393         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
4394             ZONE_IS_SHUTTING_DOWN) {
4395                 /*
4396                  * Make sure we are still in the booting state; we could have
4397                  * raced and already be shutting down, or even further along.
4398                  */
4399                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
4400                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
4401                 }
4402                 mutex_exit(&zone_status_lock);
4403                 /* It's gone bad, dispose of the process */
4404                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
4405                         mutex_enter(&p->p_lock);
4406                         ASSERT(p->p_flag & SEXITLWPS);
4407                         lwp_exit();
4408                 }
4409         } else {
4410                 id_t cid = curthread->t_cid;
4411 
4412                 if (zone_status_get(z) == ZONE_IS_BOOTING)
4413                         zone_status_set(z, ZONE_IS_RUNNING);
4414                 mutex_exit(&zone_status_lock);
4415 
4416                 mutex_enter(&class_lock);
4417                 ASSERT(cid < loaded_classes);
4418                 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
4419                     z->zone_fixed_hipri) {
4420                         /*
4421                          * If the zone is using FX then by default all
4422                          * processes start at the lowest priority and stay
4423                          * there. We provide a mechanism for the zone to
4424                          * indicate that it should run at "high priority". In
                         * this case we set up init to run at the highest FX
4426                          * priority (which is one level higher than the
4427                          * non-fixed scheduling classes can use).
4428                          */
4429                         pcparms_t pcparms;
4430 
4431                         pcparms.pc_cid = cid;
4432                         ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
4433                         ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
4434                             FXMAXUPRI;
4435                         ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
4436                             FX_DOUPRILIM | FX_DOUPRI;
4437 
4438                         mutex_enter(&pidlock);
4439                         mutex_enter(&curproc->p_lock);
4440 
4441                         (void) parmsset(&pcparms, curthread);
4442 
4443                         mutex_exit(&curproc->p_lock);
4444                         mutex_exit(&pidlock);
4445                 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
4446                         /*
4447                          * zsched always starts the init lwp at priority
4448                          * minclsyspri - 1. This priority gets set in t_pri and
4449                          * is invalid for RT, but RT never uses t_pri. However
4450                          * t_pri is used by procfs, so we always see processes
4451                          * within an RT zone with an invalid priority value.
4452                          * We fix that up now.
4453                          */
4454                         curthread->t_pri = RTGPPRIO0;
4455                 }
4456                 mutex_exit(&class_lock);
4457 
4458                 /* cause the process to return to userland. */
4459                 lwp_rtt();
4460         }
4461 }
4462 
4463 struct zsched_arg {
4464         zone_t *zone;
4465         nvlist_t *nvlist;
4466 };
4467 
4468 /*
4469  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
4470  * anything to do with scheduling, but rather with the fact that
4471  * per-zone kernel threads are parented to zsched, just like regular
4472  * kernel threads are parented to sched (p0).
4473  *
4474  * zsched is also responsible for launching init for the zone.
4475  */
4476 static void
4477 zsched(void *arg)
4478 {
4479         struct zsched_arg *za = arg;
4480         proc_t *pp = curproc;
4481         proc_t *initp = proc_init;
4482         zone_t *zone = za->zone;
4483         cred_t *cr, *oldcred;
4484         rctl_set_t *set;
4485         rctl_alloc_gp_t *gp;
4486         contract_t *ct = NULL;
4487         task_t *tk, *oldtk;
4488         rctl_entity_p_t e;
4489         kproject_t *pj;
4490 
4491         nvlist_t *nvl = za->nvlist;
4492         nvpair_t *nvp = NULL;
4493 
4494         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4495         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4496         PTOU(pp)->u_argc = 0;
4497         PTOU(pp)->u_argv = 0;
4498         PTOU(pp)->u_envp = 0;
4499         PTOU(pp)->u_commpagep = 0;
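        /*
         * Close any file descriptors inherited from our global-zone
         * creator, so that zsched holds no references to files there.
         */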
4500         closeall(P_FINFO(pp));
4501 
4502         /*
         * We are this zone's "zsched" process.  As the zone isn't generally
         * visible yet, we don't need to grab any locks before initializing
         * its zone_zsched pointer.
4506          */
4507         zone_hold(zone);  /* this hold is released by zone_destroy() */
4508         zone->zone_zsched = pp;
4509         mutex_enter(&pp->p_lock);
4510         pp->p_zone = zone;
4511         mutex_exit(&pp->p_lock);
4512 
4513         /*
4514          * Disassociate process from its 'parent'; parent ourselves to init
4515          * (pid 1) and change other values as needed.
4516          */
4517         sess_create();
4518 
4519         mutex_enter(&pidlock);
4520         proc_detach(pp);
4521         pp->p_ppid = 1;
4522         pp->p_flag |= SZONETOP;
4523         pp->p_ancpid = 1;
4524         pp->p_parent = initp;
4525         pp->p_psibling = NULL;
4526         if (initp->p_child)
4527                 initp->p_child->p_psibling = pp;
4528         pp->p_sibling = initp->p_child;
4529         initp->p_child = pp;
4530 
4531         /* Decrement what newproc() incremented. */
4532         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
4533         /*
4534          * Our credentials are about to become kcred-like, so we don't care
4535          * about the caller's ruid.
4536          */
4537         upcount_inc(crgetruid(kcred), zone->zone_id);
4538         mutex_exit(&pidlock);
4539 
4540         /*
         * getting out of the global zone, so decrement lwp and process counts
4542          */
4543         pj = pp->p_task->tk_proj;
4544         mutex_enter(&global_zone->zone_nlwps_lock);
4545         pj->kpj_nlwps -= pp->p_lwpcnt;
4546         global_zone->zone_nlwps -= pp->p_lwpcnt;
4547         pj->kpj_nprocs--;
4548         global_zone->zone_nprocs--;
4549         mutex_exit(&global_zone->zone_nlwps_lock);
4550 
4551         /*
4552          * Decrement locked memory counts on old zone and project.
4553          */
4554         mutex_enter(&global_zone->zone_mem_lock);
4555         global_zone->zone_locked_mem -= pp->p_locked_mem;
4556         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4557         mutex_exit(&global_zone->zone_mem_lock);
4558 
4559         /*
4560          * Create and join a new task in project '0' of this zone.
4561          *
4562          * We don't need to call holdlwps() since we know we're the only lwp in
4563          * this process.
4564          *
4565          * task_join() returns with p_lock held.
4566          */
4567         tk = task_create(0, zone);
4568         mutex_enter(&cpu_lock);
4569         oldtk = task_join(tk, 0);
4570 
4571         pj = pp->p_task->tk_proj;
4572 
4573         mutex_enter(&zone->zone_mem_lock);
4574         zone->zone_locked_mem += pp->p_locked_mem;
4575         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4576         mutex_exit(&zone->zone_mem_lock);
4577 
4578         /*
         * Add lwp and process counts to zsched's zone, and increment the
         * project's task and process counts for the task created by
         * task_create() above.
4582          */
4583         mutex_enter(&zone->zone_nlwps_lock);
4584         pj->kpj_nlwps += pp->p_lwpcnt;
4585         pj->kpj_ntasks += 1;
4586         zone->zone_nlwps += pp->p_lwpcnt;
4587         pj->kpj_nprocs++;
4588         zone->zone_nprocs++;
4589         mutex_exit(&zone->zone_nlwps_lock);
4590 
4591         mutex_exit(&curproc->p_lock);
4592         mutex_exit(&cpu_lock);
4593         task_rele(oldtk);
4594 
4595         /*
4596          * The process was created by a process in the global zone, hence the
4597          * credentials are wrong.  We might as well have kcred-ish credentials.
4598          */
4599         cr = zone->zone_kcred;
4600         crhold(cr);
4601         mutex_enter(&pp->p_crlock);
4602         oldcred = pp->p_cred;
4603         pp->p_cred = cr;
4604         mutex_exit(&pp->p_crlock);
4605         crfree(oldcred);
4606 
4607         /*
4608          * Hold credentials again (for thread)
4609          */
4610         crhold(cr);
4611 
4612         /*
4613          * p_lwpcnt can't change since this is a kernel process.
4614          */
4615         crset(pp, cr);
4616 
4617         /*
4618          * Chroot
4619          */
4620         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
4621         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
4622 
4623         /*
4624          * Initialize zone's rctl set.
4625          */
4626         set = rctl_set_create();
4627         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
4628         mutex_enter(&pp->p_lock);
4629         e.rcep_p.zone = zone;
4630         e.rcep_t = RCENTITY_ZONE;
4631         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
4632         mutex_exit(&pp->p_lock);
4633         rctl_prealloc_destroy(gp);
4634 
4635         /*
4636          * Apply the rctls passed in to zone_create().  This is basically a list
4637          * assignment: all of the old values are removed and the new ones
4638          * inserted.  That is, if an empty list is passed in, all values are
4639          * removed.
4640          */
4641         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4642                 rctl_dict_entry_t *rde;
4643                 rctl_hndl_t hndl;
4644                 char *name;
4645                 nvlist_t **nvlarray;
4646                 uint_t i, nelem;
4647                 int error;      /* For ASSERT()s */
4648 
4649                 name = nvpair_name(nvp);
4650                 hndl = rctl_hndl_lookup(name);
4651                 ASSERT(hndl != -1);
4652                 rde = rctl_dict_lookup_hndl(hndl);
4653                 ASSERT(rde != NULL);
4654 
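                /*
                 * First delete any existing local values for this control;
                 * rctl_local_get() with a NULL cursor returns the first
                 * value, and we stop once only the undeletable system
                 * value remains.
                 */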
4655                 for (; /* ever */; ) {
4656                         rctl_val_t oval;
4657 
4658                         mutex_enter(&pp->p_lock);
4659                         error = rctl_local_get(hndl, NULL, &oval, pp);
4660                         mutex_exit(&pp->p_lock);
4661                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4662                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4663                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4664                                 break;
4665                         mutex_enter(&pp->p_lock);
4666                         error = rctl_local_delete(hndl, &oval, pp);
4667                         mutex_exit(&pp->p_lock);
4668                         ASSERT(error == 0);
4669                 }
4670                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4671                 ASSERT(error == 0);
4672                 for (i = 0; i < nelem; i++) {
4673                         rctl_val_t *nvalp;
4674 
4675                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4676                         error = nvlist2rctlval(nvlarray[i], nvalp);
4677                         ASSERT(error == 0);
4678                         /*
4679                          * rctl_local_insert can fail if the value being
4680                          * inserted is a duplicate; this is OK.
4681                          */
4682                         mutex_enter(&pp->p_lock);
4683                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4684                                 kmem_cache_free(rctl_val_cache, nvalp);
4685                         mutex_exit(&pp->p_lock);
4686                 }
4687         }
4688 
4689         /*
4690          * Tell the world that we're done setting up.
4691          *
4692          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4693          * and atomically set the zone's processor set visibility.  Once
4694          * we drop pool_lock() this zone will automatically get updated
4695          * to reflect any future changes to the pools configuration.
4696          *
4697          * Note that after we drop the locks below (zonehash_lock in
4698          * particular) other operations such as a zone_getattr call can
4699          * now proceed and observe the zone. That is the reason for doing a
4700          * state transition to the INITIALIZED state.
4701          */
4702         pool_lock();
4703         mutex_enter(&cpu_lock);
4704         mutex_enter(&zonehash_lock);
4705         zone_uniqid(zone);
4706         zone_zsd_configure(zone);
4707         if (pool_state == POOL_ENABLED)
4708                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4709         mutex_enter(&zone_status_lock);
4710         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4711         zone_status_set(zone, ZONE_IS_INITIALIZED);
4712         mutex_exit(&zone_status_lock);
4713         mutex_exit(&zonehash_lock);
4714         mutex_exit(&cpu_lock);
4715         pool_unlock();
4716 
        /* Now call the ZSD create callbacks for all keys */
4718         zsd_apply_all_keys(zsd_apply_create, zone);
4719 
4720         /* The callbacks are complete. Mark ZONE_IS_READY */
4721         mutex_enter(&zone_status_lock);
4722         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4723         zone_status_set(zone, ZONE_IS_READY);
4724         mutex_exit(&zone_status_lock);
4725 
4726         /*
4727          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4728          * we launch init, and set the state to running.
4729          */
4730         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4731 
4732         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4733                 id_t cid;
4734 
4735                 /*
4736                  * Ok, this is a little complicated.  We need to grab the
4737                  * zone's pool's scheduling class ID; note that by now, we
4738                  * are already bound to a pool if we need to be (zoneadmd
4739                  * will have done that to us while we're in the READY
4740                  * state).  *But* the scheduling class for the zone's 'init'
4741                  * must be explicitly passed to newproc, which doesn't
4742                  * respect pool bindings.
4743                  *
4744                  * We hold the pool_lock across the call to newproc() to
4745                  * close the obvious race: the pool's scheduling class
4746                  * could change before we manage to create the LWP with
4747                  * classid 'cid'.
4748                  */
4749                 pool_lock();
4750                 if (zone->zone_defaultcid > 0)
4751                         cid = zone->zone_defaultcid;
4752                 else
4753                         cid = pool_get_class(zone->zone_pool);
4754                 if (cid == -1)
4755                         cid = defaultcid;
4756 
4757                 /*
4758                  * If this fails, zone_boot will ultimately fail.  The
                 * state of the zone will be set to SHUTTING_DOWN -- userland
                 * will then have to tear down the zone and either fail or
                 * try again.
4761                  */
4762                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4763                     minclsyspri - 1, &ct, 0)) != 0) {
4764                         mutex_enter(&zone_status_lock);
4765                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4766                         mutex_exit(&zone_status_lock);
4767                 } else {
4768                         zone->zone_boot_time = gethrestime_sec();
4769                 }
4770 
4771                 pool_unlock();
4772         }
4773 
4774         /*
4775          * Wait for zone_destroy() to be called.  This is what we spend
4776          * most of our life doing.
4777          */
4778         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4779 
4780         if (ct)
4781                 /*
4782                  * At this point the process contract should be empty.
4783                  * (Though if it isn't, it's not the end of the world.)
4784                  */
4785                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4786 
4787         /*
4788          * Allow kcred to be freed when all referring processes
4789          * (including this one) go away.  We can't just do this in
4790          * zone_free because we need to wait for the zone_cred_ref to
4791          * drop to 0 before calling zone_free, and the existence of
4792          * zone_kcred will prevent that.  Thus, we call crfree here to
4793          * balance the crdup in zone_create.  The crhold calls earlier
4794          * in zsched will be dropped when the thread and process exit.
4795          */
4796         crfree(zone->zone_kcred);
4797         zone->zone_kcred = NULL;
4798 
4799         exit(CLD_EXITED, 0);
4800 }
4801 
4802 /*
4803  * Helper function to determine if there are any submounts of the
4804  * provided path.  Used to make sure the zone doesn't "inherit" any
4805  * mounts from before it is created.
4806  */
4807 static uint_t
4808 zone_mount_count(const char *rootpath)
4809 {
4810         vfs_t *vfsp;
4811         uint_t count = 0;
4812         size_t rootpathlen = strlen(rootpath);
4813 
4814         /*
4815          * Holding zonehash_lock prevents race conditions with
4816          * vfs_list_add()/vfs_list_remove() since we serialize with
4817          * zone_find_by_path().
4818          */
4819         ASSERT(MUTEX_HELD(&zonehash_lock));
4820         /*
4821          * The rootpath must end with a '/'
4822          */
4823         ASSERT(rootpath[rootpathlen - 1] == '/');
4824 
4825         /*
4826          * This intentionally does not count the rootpath itself if that
4827          * happens to be a mount point.
4828          */
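        /*
         * For example (hypothetical paths): with a rootpath of
         * "/zones/z1/root/", a mount at "/zones/z1/root/proc" is counted,
         * while a mount at "/zones/z1/root" itself is not, since the
         * trailing '/' keeps the prefix comparison from matching it.
         */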
4829         vfs_list_read_lock();
4830         vfsp = rootvfs;
4831         do {
4832                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4833                     rootpathlen) == 0)
4834                         count++;
4835                 vfsp = vfsp->vfs_next;
4836         } while (vfsp != rootvfs);
4837         vfs_list_unlock();
4838         return (count);
4839 }
4840 
4841 /*
4842  * Helper function to make sure that a zone created on 'rootpath'
4843  * wouldn't end up containing other zones' rootpaths.
4844  */
4845 static boolean_t
4846 zone_is_nested(const char *rootpath)
4847 {
4848         zone_t *zone;
4849         size_t rootpathlen = strlen(rootpath);
4850         size_t len;
4851 
4852         ASSERT(MUTEX_HELD(&zonehash_lock));
4853 
        /*
         * zone_set_root() appended '/' and '\0' at the end of rootpath, so a
         * rootpath of "/" arrives here as "//"; such a zone would contain
         * every other zone's rootpath, so reject it outright.
         */
4857         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4858             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4859                 return (B_TRUE);
4860 
4861         for (zone = list_head(&zone_active); zone != NULL;
4862             zone = list_next(&zone_active, zone)) {
4863                 if (zone == global_zone)
4864                         continue;
4865                 len = strlen(zone->zone_rootpath);
4866                 if (strncmp(rootpath, zone->zone_rootpath,
4867                     MIN(rootpathlen, len)) == 0)
4868                         return (B_TRUE);
4869         }
4870         return (B_FALSE);
4871 }
4872 
4873 static int
4874 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4875     size_t zone_privssz)
4876 {
4877         priv_set_t *privs;
4878 
4879         if (zone_privssz < sizeof (priv_set_t))
4880                 return (ENOMEM);
4881 
4882         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4883 
4884         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4885                 kmem_free(privs, sizeof (priv_set_t));
4886                 return (EFAULT);
4887         }
4888 
4889         zone->zone_privset = privs;
4890         return (0);
4891 }
4892 
4893 /*
4894  * We make creative use of nvlists to pass in rctls from userland.  The list is
4895  * a list of the following structures:
4896  *
4897  * (name = rctl_name, value = nvpair_list_array)
4898  *
4899  * Where each element of the nvpair_list_array is of the form:
4900  *
4901  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4902  *      (name = "limit", value = uint64_t),
4903  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4904  */
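/*
 * As a minimal sketch (hypothetical userland code; zoneadmd is the real
 * consumer of this interface), such a buffer could be built with libnvpair
 * along these lines:
 *
 *	nvlist_t *nvl, *rv;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&rv, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(rv, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(rv, "limit", 20);
 *	(void) nvlist_add_uint64(rv, "action", RCTL_LOCAL_NOACTION);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &rv, 1);
 *	(void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 */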
4905 static int
4906 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4907 {
4908         nvpair_t *nvp = NULL;
4909         nvlist_t *nvl = NULL;
4910         char *kbuf;
4911         int error;
4912         rctl_val_t rv;
4913 
4914         *nvlp = NULL;
4915 
4916         if (buflen == 0)
4917                 return (0);
4918 
4919         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4920                 return (ENOMEM);
4921         if (copyin(ubuf, kbuf, buflen)) {
4922                 error = EFAULT;
4923                 goto out;
4924         }
4925         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
                /*
                 * nvlist_unpack() may have allocated and freed nvl while
                 * leaving the pointer set to a non-NULL value, so we reset
                 * it here.
                 */
4930                 nvl = NULL;
4931                 error = EINVAL;
4932                 goto out;
4933         }
4934         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4935                 rctl_dict_entry_t *rde;
4936                 rctl_hndl_t hndl;
4937                 nvlist_t **nvlarray;
4938                 uint_t i, nelem;
4939                 char *name;
4940 
4941                 error = EINVAL;
4942                 name = nvpair_name(nvp);
4943                 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4944                     strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4945                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4946                         goto out;
4947                 }
4948                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4949                         goto out;
4950                 }
4951                 rde = rctl_dict_lookup_hndl(hndl);
4952                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4953                 ASSERT(error == 0);
                for (i = 0; i < nelem; i++) {
                        if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
                                goto out;
                        /* Reject values that are invalid for this control. */
                        if (rctl_invalid_value(rde, &rv)) {
                                error = EINVAL;
                                goto out;
                        }
                }
4962         }
4963         error = 0;
4964         *nvlp = nvl;
4965 out:
4966         kmem_free(kbuf, buflen);
4967         if (error && nvl != NULL)
4968                 nvlist_free(nvl);
4969         return (error);
4970 }
4971 
4972 int
4973 zone_create_error(int er_error, int er_ext, int *er_out)
4974 {
4975         if (er_out != NULL) {
4976                 if (copyout(&er_ext, er_out, sizeof (int))) {
4977                         return (set_errno(EFAULT));
4978                 }
4979         }
4980         return (set_errno(er_error));
4981 }
4982 
4983 static int
4984 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4985 {
4986         ts_label_t *tsl;
4987         bslabel_t blab;
4988 
4989         /* Get label from user */
4990         if (copyin(lab, &blab, sizeof (blab)) != 0)
4991                 return (EFAULT);
4992         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4993         if (tsl == NULL)
4994                 return (ENOMEM);
4995 
4996         zone->zone_slabel = tsl;
4997         return (0);
4998 }
4999 
5000 /*
 * Parses a comma-separated list of ZFS datasets into the zone's dataset list.
5002  */
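/*
 * For example (hypothetical dataset names), a buffer containing
 * "rpool/zones/z1,rpool/export" yields two zone_dataset_t entries,
 * "rpool/zones/z1" and "rpool/export", on the zone's zone_datasets list.
 */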
5003 static int
5004 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
5005 {
5006         char *kbuf;
5007         char *dataset, *next;
5008         zone_dataset_t *zd;
5009         size_t len;
5010 
5011         if (ubuf == NULL || buflen == 0)
5012                 return (0);
5013 
5014         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
5015                 return (ENOMEM);
5016 
5017         if (copyin(ubuf, kbuf, buflen) != 0) {
5018                 kmem_free(kbuf, buflen);
5019                 return (EFAULT);
5020         }
5021 
5022         dataset = next = kbuf;
5023         for (;;) {
5024                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
5025 
5026                 next = strchr(dataset, ',');
5027 
5028                 if (next == NULL)
5029                         len = strlen(dataset);
5030                 else
5031                         len = next - dataset;
5032 
5033                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
5034                 bcopy(dataset, zd->zd_dataset, len);
5035                 zd->zd_dataset[len] = '\0';
5036 
5037                 list_insert_head(&zone->zone_datasets, zd);
5038 
5039                 if (next == NULL)
5040                         break;
5041 
5042                 dataset = next + 1;
5043         }
5044 
5045         kmem_free(kbuf, buflen);
5046         return (0);
5047 }
5048 
5049 /*
5050  * System call to create/initialize a new zone named 'zone_name', rooted
5051  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
5052  * and initialized with the zone-wide rctls described in 'rctlbuf', and
5053  * with labeling set by 'match', 'doi', and 'label'.
5054  *
5055  * If extended error is non-null, we may use it to return more detailed
5056  * error information.
5057  */
5058 static zoneid_t
5059 zone_create(const char *zone_name, const char *zone_root,
5060     const priv_set_t *zone_privs, size_t zone_privssz,
5061     caddr_t rctlbuf, size_t rctlbufsz,
5062     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
5063     int match, uint32_t doi, const bslabel_t *label,
5064     int flags, zoneid_t zone_did)
5065 {
5066         struct zsched_arg zarg;
5067         nvlist_t *rctls = NULL;
5068         proc_t *pp = curproc;
5069         zone_t *zone, *ztmp;
5070         zoneid_t zoneid, start = GLOBAL_ZONEID;
5071         int error;
5072         int error2 = 0;
5073         char *str;
5074         cred_t *zkcr;
5075         boolean_t insert_label_hash;
5076 
5077         if (secpolicy_zone_config(CRED()) != 0)
5078                 return (set_errno(EPERM));
5079 
5080         /* can't boot zone from within chroot environment */
5081         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
5082                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
5083                     extended_error));
5084         /*
5085          * As the first step of zone creation, we want to allocate a zoneid.
5086          * This allocation is complicated by the fact that netstacks use the
5087          * zoneid to determine their stackid, but netstacks themselves are
5088          * freed asynchronously with respect to zone destruction.  This means
5089          * that a netstack reference leak (or in principle, an extraordinarily
5090          * long netstack reference hold) could result in a zoneid being
5091          * allocated that in fact corresponds to a stackid from an active
5092          * (referenced) netstack -- unleashing all sorts of havoc when that
5093          * netstack is actually (re)used.  (In the abstract, we might wish a
5094          * zoneid to not be deallocated until its last referencing netstack
5095          * has been released, but netstacks lack a backpointer into their
5096          * referencing zone -- and changing them to have such a pointer would
5097          * be substantial, to put it euphemistically.)  To avoid this, we
5098          * detect this condition on allocation: if we have allocated a zoneid
5099          * that corresponds to a netstack that's still in use, we warn about
5100          * it (as it is much more likely to be a reference leak than an actual
5101          * netstack reference), free it, and allocate another.  That these
         * identifiers are allocated out of an ID space assures that we won't
5103          * see the identifier we just allocated.
5104          */
5105         for (;;) {
5106                 zoneid = id_alloc(zoneid_space);
5107 
5108                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
5109                         break;
5110 
5111                 id_free(zoneid_space, zoneid);
5112 
5113                 if (start == GLOBAL_ZONEID) {
5114                         start = zoneid;
5115                 } else if (zoneid == start) {
5116                         /*
5117                          * We have managed to iterate over the entire available
5118                          * zoneid space -- there are no identifiers available,
5119                          * presumably due to some number of leaked netstack
5120                          * references.  While it's in principle possible for us
5121                          * to continue to try, it seems wiser to give up at
5122                          * this point to warn and fail explicitly with a
5123                          * distinctive error.
5124                          */
5125                         cmn_err(CE_WARN, "zone_create() failed: all available "
5126                             "zone IDs have netstacks still in use");
5127                         return (set_errno(ENFILE));
5128                 }
5129 
5130                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
5131                     "netstack still in use", zoneid);
5132         }
5133 
5134         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
5135         zone->zone_id = zoneid;
5136         zone->zone_did = zone_did;
5137         zone->zone_status = ZONE_IS_UNINITIALIZED;
5138         zone->zone_pool = pool_default;
5139         zone->zone_pool_mod = gethrtime();
5140         zone->zone_psetid = ZONE_PS_INVAL;
5141         zone->zone_ncpus = 0;
5142         zone->zone_ncpus_online = 0;
5143         zone->zone_restart_init = B_TRUE;
5144         zone->zone_reboot_on_init_exit = B_FALSE;
5145         zone->zone_restart_init_0 = B_FALSE;
5146         zone->zone_init_status = -1;
5147         zone->zone_brand = &native_brand;
5148         zone->zone_initname = NULL;
5149         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
5150         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
5151         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
5152         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
5153         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
5154             offsetof(zone_ref_t, zref_linkage));
5155         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
5156             offsetof(struct zsd_entry, zsd_linkage));
5157         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
5158             offsetof(zone_dataset_t, zd_linkage));
5159         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
5160             offsetof(zone_dl_t, zdl_linkage));
5161         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
5162         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
5163 
5164         if (flags & ZCF_NET_EXCL) {
5165                 zone->zone_flags |= ZF_NET_EXCL;
5166         }
5167 
5168         if ((error = zone_set_name(zone, zone_name)) != 0) {
5169                 zone_free(zone);
5170                 return (zone_create_error(error, 0, extended_error));
5171         }
5172 
5173         if ((error = zone_set_root(zone, zone_root)) != 0) {
5174                 zone_free(zone);
5175                 return (zone_create_error(error, 0, extended_error));
5176         }
5177         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
5178                 zone_free(zone);
5179                 return (zone_create_error(error, 0, extended_error));
5180         }
5181 
5182         /* initialize node name to be the same as zone name */
5183         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5184         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
5185         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
5186 
5187         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5188         zone->zone_domain[0] = '\0';
5189         zone->zone_hostid = HW_INVALID_HOSTID;
5190         zone->zone_shares = 1;
5191         zone->zone_shmmax = 0;
5192         zone->zone_ipc.ipcq_shmmni = 0;
5193         zone->zone_ipc.ipcq_semmni = 0;
5194         zone->zone_ipc.ipcq_msgmni = 0;
5195         zone->zone_bootargs = NULL;
5196         zone->zone_fs_allowed = NULL;
5197 
5198         psecflags_default(&zone->zone_secflags);
5199 
5200         zone->zone_initname =
5201             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
5202         (void) strcpy(zone->zone_initname, zone_default_initname);
5203         zone->zone_nlwps = 0;
5204         zone->zone_nlwps_ctl = INT_MAX;
5205         zone->zone_nprocs = 0;
5206         zone->zone_nprocs_ctl = INT_MAX;
5207         zone->zone_locked_mem = 0;
5208         zone->zone_locked_mem_ctl = UINT64_MAX;
5209         zone->zone_max_swap = 0;
5210         zone->zone_max_swap_ctl = UINT64_MAX;
5211         zone->zone_max_lofi = 0;
5212         zone->zone_max_lofi_ctl = UINT64_MAX;
5213         zone->zone_lockedmem_kstat = NULL;
5214         zone->zone_swapresv_kstat = NULL;
5215         zone->zone_physmem_kstat = NULL;
5216 
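        /*
         * Allocate the persistent per-zone ZFS I/O accounting data and
         * start the zone at the default ZFS I/O priority of 1.
         */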
5217         zone_pdata[zoneid].zpers_zfsp =
5218             kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
5219         zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
5220 
5221         zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
5222 
5223         /*
5224          * Zsched initializes the rctls.
5225          */
5226         zone->zone_rctls = NULL;
5227 
5228         /*
5229          * Ensure page count is 0 (in case zoneid has wrapped).
5230          * Initialize physical memory cap as unlimited.
5231          */
5232         zone_pdata[zoneid].zpers_pg_cnt = 0;
5233         zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
5234 
5235         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
5236                 zone_free(zone);
5237                 return (zone_create_error(error, 0, extended_error));
5238         }
5239 
5240         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
5241                 zone_free(zone);
5242                 return (set_errno(error));
5243         }
5244 
5245         /*
5246          * Read in the trusted system parameters:
5247          * match flag and sensitivity label.
5248          */
5249         zone->zone_match = match;
5250         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5251                 /* Fail if requested to set doi to anything but system's doi */
5252                 if (doi != 0 && doi != default_doi) {
5253                         zone_free(zone);
5254                         return (set_errno(EINVAL));
5255                 }
5256                 /* Always apply system's doi to the zone */
5257                 error = zone_set_label(zone, label, default_doi);
5258                 if (error != 0) {
5259                         zone_free(zone);
5260                         return (set_errno(error));
5261                 }
5262                 insert_label_hash = B_TRUE;
5263         } else {
5264                 /* all zones get an admin_low label if system is not labeled */
5265                 zone->zone_slabel = l_admin_low;
5266                 label_hold(l_admin_low);
5267                 insert_label_hash = B_FALSE;
5268         }
5269 
5270         /*
5271          * Stop all lwps since that's what normally happens as part of fork().
5272          * This needs to happen before we grab any locks to avoid deadlock
5273          * (another lwp in the process could be waiting for the held lock).
5274          */
5275         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
5276                 zone_free(zone);
5277                 nvlist_free(rctls);
                return (zone_create_error(EINTR, 0, extended_error));
5279         }
5280 
5281         if (block_mounts(zone) == 0) {
5282                 mutex_enter(&pp->p_lock);
5283                 if (curthread != pp->p_agenttp)
5284                         continuelwps(pp);
5285                 mutex_exit(&pp->p_lock);
5286                 zone_free(zone);
5287                 nvlist_free(rctls);
                return (zone_create_error(EINTR, 0, extended_error));
5289         }
5290 
5291         /*
5292          * Set up credential for kernel access.  After this, any errors
5293          * should go through the dance in errout rather than calling
5294          * zone_free directly.
5295          */
5296         zone->zone_kcred = crdup(kcred);
5297         crsetzone(zone->zone_kcred, zone);
5298         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
5299         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
5300         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
5301         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
5302 
5303         mutex_enter(&zonehash_lock);
5304         /*
5305          * Make sure zone doesn't already exist.
5306          *
5307          * If the system and zone are labeled,
5308          * make sure no other zone exists that has the same label.
5309          */
5310         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
5311             (insert_label_hash &&
5312             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
5313                 zone_status_t status;
5314 
5315                 status = zone_status_get(ztmp);
5316                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
5317                         error = EEXIST;
5318                 else
5319                         error = EBUSY;
5320 
5321                 if (insert_label_hash)
5322                         error2 = ZE_LABELINUSE;
5323 
5324                 goto errout;
5325         }
5326 
5327         /*
5328          * Don't allow zone creations which would cause one zone's rootpath to
5329          * be accessible from that of another (non-global) zone.
5330          */
5331         if (zone_is_nested(zone->zone_rootpath)) {
5332                 error = EBUSY;
5333                 goto errout;
5334         }
5335 
5336         ASSERT(zonecount != 0);         /* check for leaks */
5337         if (zonecount + 1 > maxzones) {
5338                 error = ENOMEM;
5339                 goto errout;
5340         }
5341 
5342         if (zone_mount_count(zone->zone_rootpath) != 0) {
5343                 error = EBUSY;
5344                 error2 = ZE_AREMOUNTS;
5345                 goto errout;
5346         }
5347 
5348         /*
5349          * Zone is still incomplete, but we need to drop all locks while
5350          * zsched() initializes this zone's kernel process.  We
5351          * optimistically add the zone to the hashtable and associated
5352          * lists so a parallel zone_create() doesn't try to create the
5353          * same zone.
5354          */
5355         zonecount++;
5356         (void) mod_hash_insert(zonehashbyid,
5357             (mod_hash_key_t)(uintptr_t)zone->zone_id,
5358             (mod_hash_val_t)(uintptr_t)zone);
5359         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
5360         (void) strcpy(str, zone->zone_name);
5361         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
5362             (mod_hash_val_t)(uintptr_t)zone);
5363         if (insert_label_hash) {
5364                 (void) mod_hash_insert(zonehashbylabel,
5365                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5366                 zone->zone_flags |= ZF_HASHED_LABEL;
5367         }
5368 
5369         /*
5370          * Insert into active list.  At this point there are no 'hold's
5371          * on the zone, but everyone else knows not to use it, so we can
5372          * continue to use it.  zsched() will do a zone_hold() if the
5373          * newproc() is successful.
5374          */
5375         list_insert_tail(&zone_active, zone);
5376         mutex_exit(&zonehash_lock);
5377 
5378         zarg.zone = zone;
5379         zarg.nvlist = rctls;
5380         /*
5381          * The process, task, and project rctls are probably wrong;
5382          * we need an interface to get the default values of all rctls,
5383          * and initialize zsched appropriately. However, we allow zoneadmd
5384          * to pass down both zone and project rctls for the zone's init.
5385          */
5386         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5387         if (error != 0) {
5388                 /*
5389                  * We need to undo all globally visible state.
5390                  */
5391                 mutex_enter(&zonehash_lock);
5392                 list_remove(&zone_active, zone);
5393                 if (zone->zone_flags & ZF_HASHED_LABEL) {
5394                         ASSERT(zone->zone_slabel != NULL);
5395                         (void) mod_hash_destroy(zonehashbylabel,
5396                             (mod_hash_key_t)zone->zone_slabel);
5397                 }
5398                 (void) mod_hash_destroy(zonehashbyname,
5399                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
5400                 (void) mod_hash_destroy(zonehashbyid,
5401                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
5402                 ASSERT(zonecount > 1);
5403                 zonecount--;
5404                 goto errout;
5405         }
5406 
5407         /*
5408          * Zone creation can't fail from now on.
5409          */
5410 
5411         /*
5412          * Create zone kstats
5413          */
5414         zone_kstat_create(zone);
5415 
5416         /*
5417          * Let the other lwps continue.
5418          */
5419         mutex_enter(&pp->p_lock);
5420         if (curthread != pp->p_agenttp)
5421                 continuelwps(pp);
5422         mutex_exit(&pp->p_lock);
5423 
5424         /*
5425          * Wait for zsched to finish initializing the zone.
5426          */
5427         zone_status_wait(zone, ZONE_IS_READY);
5428         /*
5429          * The zone is fully visible, so we can let mounts progress.
5430          */
5431         resume_mounts(zone);
5432         nvlist_free(rctls);
5433 
5434         return (zoneid);
5435 
5436 errout:
5437         mutex_exit(&zonehash_lock);
5438         /*
5439          * Let the other lwps continue.
5440          */
5441         mutex_enter(&pp->p_lock);
5442         if (curthread != pp->p_agenttp)
5443                 continuelwps(pp);
5444         mutex_exit(&pp->p_lock);
5445 
5446         resume_mounts(zone);
5447         nvlist_free(rctls);
5448         /*
5449          * There is currently one reference to the zone, a cred_ref from
5450          * zone_kcred.  To free the zone, we call crfree, which will call
5451          * zone_cred_rele, which will call zone_free.
5452          */
5453         ASSERT(zone->zone_cred_ref == 1);
5454         ASSERT(zone->zone_kcred->cr_ref == 1);
5455         ASSERT(zone->zone_ref == 0);
5456         zkcr = zone->zone_kcred;
5457         zone->zone_kcred = NULL;
5458         crfree(zkcr);                           /* triggers call to zone_free */
5459         return (zone_create_error(error, error2, extended_error));
5460 }
5461 
/*
 * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
 * the heavy lifting.  The program launched at the "top" of the zone is the
 * zone's configured init (zone_initname); if none was set, the system
 * default stored in zone_default_initname is used.
 */
5468 static int
5469 zone_boot(zoneid_t zoneid)
5470 {
5471         int err;
5472         zone_t *zone;
5473 
5474         if (secpolicy_zone_config(CRED()) != 0)
5475                 return (set_errno(EPERM));
5476         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5477                 return (set_errno(EINVAL));
5478 
5479         mutex_enter(&zonehash_lock);
5480         /*
5481          * Look for zone under hash lock to prevent races with calls to
5482          * zone_shutdown, zone_destroy, etc.
5483          */
5484         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5485                 mutex_exit(&zonehash_lock);
5486                 return (set_errno(EINVAL));
5487         }
5488 
5489         mutex_enter(&zone_status_lock);
5490         if (zone_status_get(zone) != ZONE_IS_READY) {
5491                 mutex_exit(&zone_status_lock);
5492                 mutex_exit(&zonehash_lock);
5493                 return (set_errno(EINVAL));
5494         }
5495         zone_status_set(zone, ZONE_IS_BOOTING);
5496         mutex_exit(&zone_status_lock);
5497 
5498         zone_hold(zone);        /* so we can use the zone_t later */
5499         mutex_exit(&zonehash_lock);
5500 
5501         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
5502                 zone_rele(zone);
5503                 return (set_errno(EINTR));
5504         }
5505 
5506         /*
5507          * Boot (starting init) might have failed, in which case the zone
5508          * will go to the SHUTTING_DOWN state; an appropriate errno will
5509          * be placed in zone->zone_boot_err, and so we return that.
5510          */
5511         err = zone->zone_boot_err;
5512         zone_rele(zone);
5513         return (err ? set_errno(err) : 0);
5514 }
5515 
5516 /*
5517  * Kills all user processes in the zone, waiting for them all to exit
5518  * before returning.
5519  */
5520 static int
5521 zone_empty(zone_t *zone)
5522 {
5523         int cnt = 0;
5524         int waitstatus;
5525 
5526         /*
5527          * We need to drop zonehash_lock before killing all
5528          * processes, otherwise we'll deadlock with zone_find_*
5529          * which can be called from the exit path.
5530          */
5531         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5532         while ((waitstatus = zone_status_timedwait_sig(zone,
5533             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5534                 boolean_t force = B_FALSE;
5535 
5536                 /* Every 30 seconds, try harder */
5537                 if (cnt++ >= 30) {
5538                         cmn_err(CE_WARN, "attempt to force kill zone %d\n",
5539                             zone->zone_id);
5540                         force = B_TRUE;
5541                         cnt = 0;
5542                 }
5543                 killall(zone->zone_id, force);
5544         }
5545         /*
5546          * return EINTR if we were signaled
5547          */
5548         if (waitstatus == 0)
5549                 return (EINTR);
5550         return (0);
5551 }
5552 
5553 /*
5554  * This function implements the policy for zone visibility.
5555  *
5556  * In standard Solaris, a non-global zone can only see itself.
5557  *
5558  * In Trusted Extensions, a labeled zone can lookup any zone whose label
5559  * it dominates. For this test, the label of the global zone is treated as
5560  * admin_high so it is special-cased instead of being checked for dominance.
5561  *
5562  * Returns true if zone attributes are viewable, false otherwise.
5563  */
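/*
 * For example (hypothetical labels), a zone labeled "CONFIDENTIAL" can look
 * up a zone labeled "PUBLIC" that it dominates, but the "PUBLIC" zone cannot
 * look up the "CONFIDENTIAL" zone.
 */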
5564 static boolean_t
5565 zone_list_access(zone_t *zone)
5566 {
5567 
5568         if (curproc->p_zone == global_zone ||
5569             curproc->p_zone == zone) {
5570                 return (B_TRUE);
5571         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5572                 bslabel_t *curproc_label;
5573                 bslabel_t *zone_label;
5574 
5575                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
5576                 zone_label = label2bslabel(zone->zone_slabel);
5577 
5578                 if (zone->zone_id != GLOBAL_ZONEID &&
5579                     bldominates(curproc_label, zone_label)) {
5580                         return (B_TRUE);
5581                 } else {
5582                         return (B_FALSE);
5583                 }
5584         } else {
5585                 return (B_FALSE);
5586         }
5587 }
5588 
5589 /*
 * System call to start the zone's halt sequence.  By the time this
5591  * function successfully returns, all user processes and kernel threads
5592  * executing in it will have exited, ZSD shutdown callbacks executed,
5593  * and the zone status set to ZONE_IS_DOWN.
5594  *
5595  * It is possible that the call will interrupt itself if the caller is the
5596  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
5597  */
5598 static int
5599 zone_shutdown(zoneid_t zoneid)
5600 {
5601         int error;
5602         zone_t *zone;
5603         zone_status_t status;
5604 
5605         if (secpolicy_zone_config(CRED()) != 0)
5606                 return (set_errno(EPERM));
5607         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5608                 return (set_errno(EINVAL));
5609 
5610         mutex_enter(&zonehash_lock);
5611         /*
5612          * Look for zone under hash lock to prevent races with other
5613          * calls to zone_shutdown and zone_destroy.
5614          */
5615         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5616                 mutex_exit(&zonehash_lock);
5617                 return (set_errno(EINVAL));
5618         }
5619 
5620         /*
5621          * We have to drop zonehash_lock before calling block_mounts.
5622          * Hold the zone so we can continue to use the zone_t.
5623          */
5624         zone_hold(zone);
5625         mutex_exit(&zonehash_lock);
5626 
        /*
         * Block mounts so that VFS_MOUNT() can get an accurate view of
         * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
         *
         * e.g. NFS can fail the mount if it determines that the zone
         * has already begun the shutdown sequence.
         */
5635         if (block_mounts(zone) == 0) {
5636                 zone_rele(zone);
5637                 return (set_errno(EINTR));
5638         }
5639 
5640         mutex_enter(&zonehash_lock);
5641         mutex_enter(&zone_status_lock);
5642         status = zone_status_get(zone);
5643         /*
5644          * Fail if the zone isn't fully initialized yet.
5645          */
5646         if (status < ZONE_IS_READY) {
5647                 mutex_exit(&zone_status_lock);
5648                 mutex_exit(&zonehash_lock);
5649                 resume_mounts(zone);
5650                 zone_rele(zone);
5651                 return (set_errno(EINVAL));
5652         }
5653         /*
5654          * If conditions required for zone_shutdown() to return have been met,
5655          * return success.
5656          */
5657         if (status >= ZONE_IS_DOWN) {
5658                 mutex_exit(&zone_status_lock);
5659                 mutex_exit(&zonehash_lock);
5660                 resume_mounts(zone);
5661                 zone_rele(zone);
5662                 return (0);
5663         }
5664         /*
5665          * If zone_shutdown() hasn't been called before, go through the motions.
5666          * If it has, there's nothing to do but wait for the kernel threads to
5667          * drain.
5668          */
5669         if (status < ZONE_IS_EMPTY) {
5670                 uint_t ntasks;
5671 
5672                 mutex_enter(&zone->zone_lock);
5673                 if ((ntasks = zone->zone_ntasks) != 1) {
5674                         /*
5675                          * There's still stuff running.
5676                          */
5677                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5678                 }
5679                 mutex_exit(&zone->zone_lock);
5680                 if (ntasks == 1) {
5681                         /*
5682                          * The only way to create another task is through
5683                          * zone_enter(), which will block until we drop
5684                          * zonehash_lock.  The zone is empty.
5685                          */
5686                         if (zone->zone_kthreads == NULL) {
5687                                 /*
5688                                  * Skip ahead to ZONE_IS_DOWN
5689                                  */
5690                                 zone_status_set(zone, ZONE_IS_DOWN);
5691                         } else {
5692                                 zone_status_set(zone, ZONE_IS_EMPTY);
5693                         }
5694                 }
5695         }
5696         mutex_exit(&zone_status_lock);
5697         mutex_exit(&zonehash_lock);
5698         resume_mounts(zone);
5699 
        if ((error = zone_empty(zone)) != 0) {
5701                 zone_rele(zone);
5702                 return (set_errno(error));
5703         }
5704         /*
5705          * After the zone status goes to ZONE_IS_DOWN this zone will no
5706          * longer be notified of changes to the pools configuration, so
5707          * in order to not end up with a stale pool pointer, we point
5708          * ourselves at the default pool and remove all resource
5709          * visibility.  This is especially important as the zone_t may
5710          * languish on the deathrow for a very long time waiting for
         * creds to drain out.
5712          *
5713          * This rebinding of the zone can happen multiple times
         * (presumably due to interrupted or parallel system calls)
5715          * without any adverse effects.
5716          */
5717         if (pool_lock_intr() != 0) {
5718                 zone_rele(zone);
5719                 return (set_errno(EINTR));
5720         }
5721         if (pool_state == POOL_ENABLED) {
5722                 mutex_enter(&cpu_lock);
5723                 zone_pool_set(zone, pool_default);
5724                 /*
5725                  * The zone no longer needs to be able to see any cpus.
5726                  */
5727                 zone_pset_set(zone, ZONE_PS_INVAL);
5728                 mutex_exit(&cpu_lock);
5729         }
5730         pool_unlock();
5731 
5732         /*
5733          * ZSD shutdown callbacks can be executed multiple times, hence
5734          * it is safe to not be holding any locks across this call.
5735          */
5736         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5737 
5738         mutex_enter(&zone_status_lock);
5739         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5740                 zone_status_set(zone, ZONE_IS_DOWN);
5741         mutex_exit(&zone_status_lock);
5742 
5743         /*
5744          * Wait for kernel threads to drain.
5745          */
5746         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5747                 zone_rele(zone);
5748                 return (set_errno(EINTR));
5749         }
5750 
5751         /*
         * The zone can become down/destroyable even if the above wait
5753          * returns EINTR, so any code added here may never execute.
5754          * (i.e. don't add code here)
5755          */
5756 
5757         zone_rele(zone);
5758         return (0);
5759 }
5760 
5761 /*
5762  * Log the specified zone's reference counts.  The caller should not be
5763  * holding the zone's zone_lock.
5764  */
5765 static void
5766 zone_log_refcounts(zone_t *zone)
5767 {
5768         char *buffer;
5769         char *buffer_position;
5770         uint32_t buffer_size;
5771         uint32_t index;
5772         uint_t ref;
5773         uint_t cred_ref;
5774 
5775         /*
5776          * Construct a string representing the subsystem-specific reference
5777          * counts.  The counts are printed in ascending order by index into the
5778          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5779          * square brackets [] and will only contain nonzero reference counts.
5780          *
5781          * The buffer will hold two square bracket characters plus ten digits,
5782          * one colon, one space, one comma, and some characters for a
5783          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5784          * bit integers have at most ten decimal digits.)  The last
5785          * reference count's comma is replaced by the closing square
5786          * bracket and a NULL character to terminate the string.
5787          *
5788          * NOTE: We have to grab the zone's zone_lock to create a consistent
5789          * snapshot of the zone's reference counters.
5790          *
5791          * First, figure out how much space the string buffer will need.
5792          * The buffer's size is stored in buffer_size.
5793          */
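        /*
         * Worked example: a single nonzero "NFS" count reserves
         * strlen("NFS") + 13 = 16 bytes plus the 2 bracket bytes, or
         * 18 bytes in all; the 13 covers up to ten digits, the colon,
         * the space, and the trailing comma.
         */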
5794         buffer_size = 2;                        /* for the square brackets */
5795         mutex_enter(&zone->zone_lock);
5796         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5797         ref = zone->zone_ref;
5798         cred_ref = zone->zone_cred_ref;
5799         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5800                 if (zone->zone_subsys_ref[index] != 0)
5801                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5802                             13;
5803         if (buffer_size == 2) {
5804                 /*
5805                  * No subsystems had nonzero reference counts.  Don't bother
5806                  * with allocating a buffer; just log the general-purpose and
5807                  * credential reference counts.
5808                  */
5809                 mutex_exit(&zone->zone_lock);
5810                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5811                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5812                     "references and %u credential references are still extant",
5813                     zone->zone_name, zone->zone_id, ref, cred_ref);
5814                 return;
5815         }
5816 
5817         /*
5818          * buffer_size contains the exact number of characters that the
5819          * buffer will need.  Allocate the buffer and fill it with nonzero
5820          * subsystem-specific reference counts.  Surround the results with
5821          * square brackets afterwards.
5822          */
5823         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5824         buffer_position = &buffer[1];
5825         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5826                 /*
5827                  * NOTE: The DDI's version of sprintf() returns a pointer to
5828                  * the modified buffer rather than the number of bytes written
5829                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5830                  * Therefore, we'll use snprintf() with INT_MAX to get the
5831                  * number of bytes written.  Using INT_MAX is safe because
5832                  * the buffer is perfectly sized for the data: we'll never
5833                  * overrun the buffer.
5834                  */
5835                 if (zone->zone_subsys_ref[index] != 0)
5836                         buffer_position += snprintf(buffer_position, INT_MAX,
5837                             "%s: %u,", zone_ref_subsys_names[index],
5838                             zone->zone_subsys_ref[index]);
5839         }
5840         mutex_exit(&zone->zone_lock);
5841         buffer[0] = '[';
5842         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5843         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5844         buffer_position[-1] = ']';
5845 
5846         /*
5847          * Log the reference counts and free the message buffer.
5848          */
5849         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5850             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5851             "%u credential references are still extant %s", zone->zone_name,
5852             zone->zone_id, ref, cred_ref, buffer);
5853         kmem_free(buffer, buffer_size);
5854 }
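
/*
 * For illustration (hypothetical zone name, ID, and counts): if the NFS
 * and IPC subsystems each held one reference while a zone named "web01"
 * with zone ID 3 lingered on the deathrow, zone_log_refcounts() would
 * emit a console message of the form:
 *
 *      Zone 'web01' (ID: 3) is shutting down, but 2 zone references
 *      and 1 credential references are still extant [NFS: 1,IPC: 1]
 */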
5855 
5856 /*
5857  * Systemcall entry point to finalize the zone halt process.  The caller
5858  * must have already successfully called zone_shutdown().
5859  *
5860  * Upon successful completion, the zone will have been fully destroyed:
5861  * zsched will have exited, destructor callbacks executed, and the zone
5862  * removed from the list of active zones.
5863  */
5864 static int
5865 zone_destroy(zoneid_t zoneid)
5866 {
5867         uint64_t uniqid;
5868         zone_t *zone;
5869         zone_status_t status;
5870         clock_t wait_time;
5871         boolean_t log_refcounts;
5872         zone_persist_t *zp;
5873 
5874         if (secpolicy_zone_config(CRED()) != 0)
5875                 return (set_errno(EPERM));
5876         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5877                 return (set_errno(EINVAL));
5878 
5879         mutex_enter(&zonehash_lock);
5880         /*
5881          * Look for zone under hash lock to prevent races with other
5882          * calls to zone_destroy.
5883          */
5884         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5885                 mutex_exit(&zonehash_lock);
5886                 return (set_errno(EINVAL));
5887         }
5888 
5889         if (zone_mount_count(zone->zone_rootpath) != 0) {
5890                 mutex_exit(&zonehash_lock);
5891                 return (set_errno(EBUSY));
5892         }
5893         mutex_enter(&zone_status_lock);
5894         status = zone_status_get(zone);
5895         if (status < ZONE_IS_DOWN) {
5896                 mutex_exit(&zone_status_lock);
5897                 mutex_exit(&zonehash_lock);
5898                 return (set_errno(EBUSY));
5899         } else if (status == ZONE_IS_DOWN) {
5900                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5901         }
5902         mutex_exit(&zone_status_lock);
5903         zone_hold(zone);
5904         mutex_exit(&zonehash_lock);
5905 
5906         zp = &zone_pdata[zoneid];
5907         mutex_enter(&zp->zpers_zfs_lock);
5908         kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
5909         zp->zpers_zfsp = NULL;
5910         mutex_exit(&zp->zpers_zfs_lock);
5911 
5912         /*
5913          * wait for zsched to exit
5914          */
5915         zone_status_wait(zone, ZONE_IS_DEAD);
5916         zone_zsd_callbacks(zone, ZSD_DESTROY);
5917         zone->zone_netstack = NULL;
5918         uniqid = zone->zone_uniqid;
5919         zone_rele(zone);
        zone = NULL;    /* potentially freed */
5921 
5922         log_refcounts = B_FALSE;
5923         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5924         mutex_enter(&zonehash_lock);
5925         for (; /* ever */; ) {
5926                 boolean_t unref;
5927                 boolean_t refs_have_been_logged;
5928 
5929                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5930                     zone->zone_uniqid != uniqid) {
5931                         /*
5932                          * The zone has gone away.  Necessary conditions
5933                          * are met, so we return success.
5934                          */
5935                         mutex_exit(&zonehash_lock);
5936                         return (0);
5937                 }
5938                 mutex_enter(&zone->zone_lock);
5939                 unref = ZONE_IS_UNREF(zone);
5940                 refs_have_been_logged = (zone->zone_flags &
5941                     ZF_REFCOUNTS_LOGGED);
5942                 mutex_exit(&zone->zone_lock);
5943                 if (unref) {
5944                         /*
5945                          * There is only one reference to the zone -- that
5946                          * added when the zone was added to the hashtables --
5947                          * and things will remain this way until we drop
5948                          * zonehash_lock... we can go ahead and cleanup the
5949                          * zone.
5950                          */
5951                         break;
5952                 }
5953 
5954                 /*
5955                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5956                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5957                  * some zone's general-purpose reference count reaches one.
5958                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5959                  * on zone_destroy_cv, then log the zone's reference counts and
5960                  * continue to wait for zone_rele() and zone_cred_rele().
5961                  */
5962                 if (!refs_have_been_logged) {
5963                         if (!log_refcounts) {
5964                                 /*
5965                                  * This thread hasn't timed out waiting on
5966                                  * zone_destroy_cv yet.  Wait wait_time clock
5967                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5968                                  * seconds) for the zone's references to clear.
5969                                  */
5970                                 ASSERT(wait_time > 0);
5971                                 wait_time = cv_reltimedwait_sig(
5972                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5973                                     TR_SEC);
5974                                 if (wait_time > 0) {
5975                                         /*
5976                                          * A thread in zone_rele() or
5977                                          * zone_cred_rele() signaled
5978                                          * zone_destroy_cv before this thread's
5979                                          * wait timed out.  The zone might have
5980                                          * only one reference left; find out!
5981                                          */
5982                                         continue;
5983                                 } else if (wait_time == 0) {
5984                                         /* The thread's process was signaled. */
5985                                         mutex_exit(&zonehash_lock);
5986                                         return (set_errno(EINTR));
5987                                 }
5988 
5989                                 /*
5990                                  * The thread timed out while waiting on
5991                                  * zone_destroy_cv.  Even though the thread
5992                                  * timed out, it has to check whether another
5993                                  * thread woke up from zone_destroy_cv and
5994                                  * destroyed the zone.
5995                                  *
5996                                  * If the zone still exists and has more than
5997                                  * one unreleased general-purpose reference,
5998                                  * then log the zone's reference counts.
5999                                  */
6000                                 log_refcounts = B_TRUE;
6001                                 continue;
6002                         }
6003 
6004                         /*
6005                          * The thread already timed out on zone_destroy_cv while
6006                          * waiting for subsystems to release the zone's last
6007                          * general-purpose references.  Log the zone's reference
6008                          * counts and wait indefinitely on zone_destroy_cv.
6009                          */
6010                         zone_log_refcounts(zone);
6011                 }
6012                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
6013                         /* The thread's process was signaled. */
6014                         mutex_exit(&zonehash_lock);
6015                         return (set_errno(EINTR));
6016                 }
6017         }
6018 
6019         /*
6020          * Remove CPU cap for this zone now since we're not going to
6021          * fail below this point.
6022          */
6023         cpucaps_zone_remove(zone);
6024 
6025         /* Get rid of the zone's kstats */
6026         zone_kstat_delete(zone);
6027 
6028         /* remove the pfexecd doors */
6029         if (zone->zone_pfexecd != NULL) {
6030                 klpd_freelist(&zone->zone_pfexecd);
6031                 zone->zone_pfexecd = NULL;
6032         }
6033 
6034         /* free brand specific data */
6035         if (ZONE_IS_BRANDED(zone))
6036                 ZBROP(zone)->b_free_brand_data(zone);
6037 
6038         /* Say goodbye to brand framework. */
6039         brand_unregister_zone(zone->zone_brand);
6040 
6041         /*
6042          * It is now safe to let the zone be recreated; remove it from the
6043          * lists.  The memory will not be freed until the last cred
6044          * reference goes away.
6045          */
6046         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
6047         zonecount--;
6048         /* remove from active list and hash tables */
6049         list_remove(&zone_active, zone);
6050         (void) mod_hash_destroy(zonehashbyname,
6051             (mod_hash_key_t)zone->zone_name);
6052         (void) mod_hash_destroy(zonehashbyid,
6053             (mod_hash_key_t)(uintptr_t)zone->zone_id);
6054         if (zone->zone_flags & ZF_HASHED_LABEL)
6055                 (void) mod_hash_destroy(zonehashbylabel,
6056                     (mod_hash_key_t)zone->zone_slabel);
6057         mutex_exit(&zonehash_lock);
6058 
6059         /*
         * Release the root vnode; we're not using it anymore, and no
         * other thread that might access it should exist by now.
6062          */
6063         if (zone->zone_rootvp != NULL) {
6064                 vnode_t *vp = zone->zone_rootvp;
6065 
6066                 mutex_enter(&vp->v_lock);
6067                 vp->v_flag &= ~VZONEROOT;
6068                 mutex_exit(&vp->v_lock);
6069                 VN_RELE(vp);
6070                 zone->zone_rootvp = NULL;
6071         }
6072 
6073         /* add to deathrow list */
6074         mutex_enter(&zone_deathrow_lock);
6075         list_insert_tail(&zone_deathrow, zone);
6076         mutex_exit(&zone_deathrow_lock);
6077 
6078         /*
6079          * Drop last reference (which was added by zsched()), this will
6080          * free the zone unless there are outstanding cred references.
6081          */
6082         zone_rele(zone);
6083         return (0);
6084 }
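
/*
 * For illustration, a minimal userland sketch of the shutdown/destroy
 * sequence described above, assuming the zone_shutdown() and
 * zone_destroy() wrappers declared in <zone.h> (zoneadmd performs a
 * more elaborate version of the same sequence):
 *
 *      #include <zone.h>
 *
 *      int
 *      halt_zone(zoneid_t zoneid)
 *      {
 *              if (zone_shutdown(zoneid) != 0)
 *                      return (-1);
 *              return (zone_destroy(zoneid));
 *      }
 */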
6085 
6086 /*
6087  * Systemcall entry point for zone_getattr(2).
6088  */
6089 static ssize_t
6090 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6091 {
6092         size_t size;
6093         int error = 0, err;
6094         zone_t *zone;
6095         char *zonepath;
6096         char *outstr;
6097         zone_status_t zone_status;
6098         pid_t initpid;
6099         boolean_t global = (curzone == global_zone);
6100         boolean_t inzone = (curzone->zone_id == zoneid);
6101         ushort_t flags;
6102         zone_net_data_t *zbuf;
6103 
6104         mutex_enter(&zonehash_lock);
6105         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6106                 mutex_exit(&zonehash_lock);
6107                 return (set_errno(EINVAL));
6108         }
6109         zone_status = zone_status_get(zone);
6110         if (zone_status < ZONE_IS_INITIALIZED) {
6111                 mutex_exit(&zonehash_lock);
6112                 return (set_errno(EINVAL));
6113         }
6114         zone_hold(zone);
6115         mutex_exit(&zonehash_lock);
6116 
6117         /*
6118          * If not in the global zone, don't show information about other zones,
6119          * unless the system is labeled and the local zone's label dominates
6120          * the other zone.
6121          */
6122         if (!zone_list_access(zone)) {
6123                 zone_rele(zone);
6124                 return (set_errno(EINVAL));
6125         }
6126 
6127         switch (attr) {
6128         case ZONE_ATTR_ROOT:
6129                 if (global) {
6130                         /*
6131                          * Copy the path to trim the trailing "/" (except for
6132                          * the global zone).
6133                          */
6134                         if (zone != global_zone)
6135                                 size = zone->zone_rootpathlen - 1;
6136                         else
6137                                 size = zone->zone_rootpathlen;
6138                         zonepath = kmem_alloc(size, KM_SLEEP);
6139                         bcopy(zone->zone_rootpath, zonepath, size);
6140                         zonepath[size - 1] = '\0';
6141                 } else {
6142                         if (inzone || !is_system_labeled()) {
6143                                 /*
6144                                  * Caller is not in the global zone.
                                 * If the query is on the current zone
                                 * or the system is not labeled, just
                                 * return a faked-up path for the
                                 * current zone.
6148                                  */
6149                                 zonepath = "/";
6150                                 size = 2;
6151                         } else {
6152                                 /*
                                 * Return the prefixed path (zone_prefix
                                 * plus zone name) for the current zone.
6154                                  */
6155                                 int prefix_len = strlen(zone_prefix);
6156                                 int zname_len = strlen(zone->zone_name);
6157 
6158                                 size = prefix_len + zname_len + 1;
6159                                 zonepath = kmem_alloc(size, KM_SLEEP);
6160                                 bcopy(zone_prefix, zonepath, prefix_len);
6161                                 bcopy(zone->zone_name, zonepath +
6162                                     prefix_len, zname_len);
6163                                 zonepath[size - 1] = '\0';
6164                         }
6165                 }
6166                 if (bufsize > size)
6167                         bufsize = size;
6168                 if (buf != NULL) {
6169                         err = copyoutstr(zonepath, buf, bufsize, NULL);
6170                         if (err != 0 && err != ENAMETOOLONG)
6171                                 error = EFAULT;
6172                 }
6173                 if (global || (is_system_labeled() && !inzone))
6174                         kmem_free(zonepath, size);
6175                 break;
6176 
6177         case ZONE_ATTR_NAME:
6178                 size = strlen(zone->zone_name) + 1;
6179                 if (bufsize > size)
6180                         bufsize = size;
6181                 if (buf != NULL) {
6182                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
6183                         if (err != 0 && err != ENAMETOOLONG)
6184                                 error = EFAULT;
6185                 }
6186                 break;
6187 
6188         case ZONE_ATTR_STATUS:
6189                 /*
6190                  * Since we're not holding zonehash_lock, the zone status
6191                  * may be anything; leave it up to userland to sort it out.
6192                  */
6193                 size = sizeof (zone_status);
6194                 if (bufsize > size)
6195                         bufsize = size;
6196                 zone_status = zone_status_get(zone);
6197                 if (buf != NULL &&
6198                     copyout(&zone_status, buf, bufsize) != 0)
6199                         error = EFAULT;
6200                 break;
6201         case ZONE_ATTR_FLAGS:
6202                 size = sizeof (zone->zone_flags);
6203                 if (bufsize > size)
6204                         bufsize = size;
6205                 flags = zone->zone_flags;
6206                 if (buf != NULL &&
6207                     copyout(&flags, buf, bufsize) != 0)
6208                         error = EFAULT;
6209                 break;
6210         case ZONE_ATTR_PRIVSET:
6211                 size = sizeof (priv_set_t);
6212                 if (bufsize > size)
6213                         bufsize = size;
6214                 if (buf != NULL &&
6215                     copyout(zone->zone_privset, buf, bufsize) != 0)
6216                         error = EFAULT;
6217                 break;
6218         case ZONE_ATTR_UNIQID:
6219                 size = sizeof (zone->zone_uniqid);
6220                 if (bufsize > size)
6221                         bufsize = size;
6222                 if (buf != NULL &&
6223                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
6224                         error = EFAULT;
6225                 break;
6226         case ZONE_ATTR_POOLID:
6227                 {
6228                         pool_t *pool;
6229                         poolid_t poolid;
6230 
6231                         if (pool_lock_intr() != 0) {
6232                                 error = EINTR;
6233                                 break;
6234                         }
6235                         pool = zone_pool_get(zone);
6236                         poolid = pool->pool_id;
6237                         pool_unlock();
6238                         size = sizeof (poolid);
6239                         if (bufsize > size)
6240                                 bufsize = size;
                        if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
6242                                 error = EFAULT;
6243                 }
6244                 break;
6245         case ZONE_ATTR_SLBL:
6246                 size = sizeof (bslabel_t);
6247                 if (bufsize > size)
6248                         bufsize = size;
6249                 if (zone->zone_slabel == NULL)
6250                         error = EINVAL;
6251                 else if (buf != NULL &&
6252                     copyout(label2bslabel(zone->zone_slabel), buf,
6253                     bufsize) != 0)
6254                         error = EFAULT;
6255                 break;
6256         case ZONE_ATTR_INITPID:
6257                 size = sizeof (initpid);
6258                 if (bufsize > size)
6259                         bufsize = size;
6260                 initpid = zone->zone_proc_initpid;
6261                 if (initpid == -1) {
6262                         error = ESRCH;
6263                         break;
6264                 }
6265                 if (buf != NULL &&
6266                     copyout(&initpid, buf, bufsize) != 0)
6267                         error = EFAULT;
6268                 break;
6269         case ZONE_ATTR_BRAND:
6270                 size = strlen(zone->zone_brand->b_name) + 1;
6271 
6272                 if (bufsize > size)
6273                         bufsize = size;
6274                 if (buf != NULL) {
6275                         err = copyoutstr(zone->zone_brand->b_name, buf,
6276                             bufsize, NULL);
6277                         if (err != 0 && err != ENAMETOOLONG)
6278                                 error = EFAULT;
6279                 }
6280                 break;
6281         case ZONE_ATTR_INITNAME:
6282                 size = strlen(zone->zone_initname) + 1;
6283                 if (bufsize > size)
6284                         bufsize = size;
6285                 if (buf != NULL) {
6286                         err = copyoutstr(zone->zone_initname, buf, bufsize,
6287                             NULL);
6288                         if (err != 0 && err != ENAMETOOLONG)
6289                                 error = EFAULT;
6290                 }
6291                 break;
6292         case ZONE_ATTR_BOOTARGS:
6293                 if (zone->zone_bootargs == NULL)
6294                         outstr = "";
6295                 else
6296                         outstr = zone->zone_bootargs;
6297                 size = strlen(outstr) + 1;
6298                 if (bufsize > size)
6299                         bufsize = size;
6300                 if (buf != NULL) {
6301                         err = copyoutstr(outstr, buf, bufsize, NULL);
6302                         if (err != 0 && err != ENAMETOOLONG)
6303                                 error = EFAULT;
6304                 }
6305                 break;
6306         case ZONE_ATTR_SCHED_CLASS:
6307                 mutex_enter(&class_lock);
6308 
6309                 if (zone->zone_defaultcid >= loaded_classes)
6310                         outstr = "";
6311                 else
6312                         outstr = sclass[zone->zone_defaultcid].cl_name;
6313                 size = strlen(outstr) + 1;
6314                 if (bufsize > size)
6315                         bufsize = size;
6316                 if (buf != NULL) {
6317                         err = copyoutstr(outstr, buf, bufsize, NULL);
6318                         if (err != 0 && err != ENAMETOOLONG)
6319                                 error = EFAULT;
6320                 }
6321 
6322                 mutex_exit(&class_lock);
6323                 break;
6324         case ZONE_ATTR_HOSTID:
6325                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
6326                     bufsize == sizeof (zone->zone_hostid)) {
6327                         size = sizeof (zone->zone_hostid);
6328                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
6329                             bufsize) != 0)
6330                                 error = EFAULT;
6331                 } else {
6332                         error = EINVAL;
6333                 }
6334                 break;
6335         case ZONE_ATTR_FS_ALLOWED:
6336                 if (zone->zone_fs_allowed == NULL)
6337                         outstr = "";
6338                 else
6339                         outstr = zone->zone_fs_allowed;
6340                 size = strlen(outstr) + 1;
6341                 if (bufsize > size)
6342                         bufsize = size;
6343                 if (buf != NULL) {
6344                         err = copyoutstr(outstr, buf, bufsize, NULL);
6345                         if (err != 0 && err != ENAMETOOLONG)
6346                                 error = EFAULT;
6347                 }
6348                 break;
6349         case ZONE_ATTR_SECFLAGS:
6350                 size = sizeof (zone->zone_secflags);
6351                 if (bufsize > size)
6352                         bufsize = size;
                if (buf != NULL &&
                    copyout(&zone->zone_secflags, buf, bufsize) != 0)
6354                         error = EFAULT;
6355                 break;
6356         case ZONE_ATTR_NETWORK:
6357                 bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
6358                 size = bufsize;
6359                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6360                 if (copyin(buf, zbuf, bufsize) != 0) {
6361                         error = EFAULT;
6362                 } else {
6363                         error = zone_get_network(zoneid, zbuf);
6364                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
6365                                 error = EFAULT;
6366                 }
6367                 kmem_free(zbuf, bufsize);
6368                 break;
6369         case ZONE_ATTR_DID:
6370                 size = sizeof (zoneid_t);
6371                 if (bufsize > size)
6372                         bufsize = size;
6373 
6374                 if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
6375                         error = EFAULT;
6376                 break;
6377         case ZONE_ATTR_SCHED_FIXEDHI:
6378                 size = sizeof (boolean_t);
6379                 if (bufsize > size)
6380                         bufsize = size;
6381 
6382                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6383                     bufsize) != 0)
6384                         error = EFAULT;
6385                 break;
6386         default:
6387                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6388                         size = bufsize;
6389                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6390                 } else {
6391                         error = EINVAL;
6392                 }
6393         }
6394         zone_rele(zone);
6395 
6396         if (error)
6397                 return (set_errno(error));
6398         return ((ssize_t)size);
6399 }
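
/*
 * For illustration, a minimal userland sketch of the usual two-call
 * pattern for string attributes, assuming the zone_getattr() wrapper
 * declared in <zone.h>: a first call with a NULL buffer returns the
 * required size, and a second call fetches the attribute.
 *
 *      #include <zone.h>
 *      #include <stdlib.h>
 *
 *      char *
 *      get_zone_name(zoneid_t zoneid)
 *      {
 *              ssize_t size;
 *              char *name;
 *
 *              if ((size = zone_getattr(zoneid, ZONE_ATTR_NAME,
 *                  NULL, 0)) < 0)
 *                      return (NULL);
 *              if ((name = malloc(size)) == NULL)
 *                      return (NULL);
 *              if (zone_getattr(zoneid, ZONE_ATTR_NAME, name,
 *                  (size_t)size) < 0) {
 *                      free(name);
 *                      return (NULL);
 *              }
 *              return (name);
 *      }
 */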
6400 
6401 /*
6402  * Systemcall entry point for zone_setattr(2).
6403  */
6404 /*ARGSUSED*/
6405 static int
6406 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6407 {
6408         zone_t *zone;
6409         zone_status_t zone_status;
6410         int err = -1;
6411         zone_net_data_t *zbuf;
6412 
6413         if (secpolicy_zone_config(CRED()) != 0)
6414                 return (set_errno(EPERM));
6415 
6416         /*
6417          * No attributes can be set on the global zone.
6418          */
6419         if (zoneid == GLOBAL_ZONEID) {
6420                 return (set_errno(EINVAL));
6421         }
6422 
6423         mutex_enter(&zonehash_lock);
6424         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6425                 mutex_exit(&zonehash_lock);
6426                 return (set_errno(EINVAL));
6427         }
6428         zone_hold(zone);
6429         mutex_exit(&zonehash_lock);
6430 
6431         /*
6432          * At present attributes can only be set on non-running,
6433          * non-global zones.
6434          */
6435         zone_status = zone_status_get(zone);
6436         if (zone_status > ZONE_IS_READY) {
6437                 err = EINVAL;
6438                 goto done;
6439         }
6440 
6441         switch (attr) {
6442         case ZONE_ATTR_INITNAME:
6443                 err = zone_set_initname(zone, (const char *)buf);
6444                 break;
6445         case ZONE_ATTR_INITNORESTART:
6446                 zone->zone_restart_init = B_FALSE;
6447                 err = 0;
6448                 break;
6449         case ZONE_ATTR_INITRESTART0:
6450                 zone->zone_restart_init_0 = B_TRUE;
6451                 err = 0;
6452                 break;
6453         case ZONE_ATTR_INITREBOOT:
6454                 zone->zone_reboot_on_init_exit = B_TRUE;
6455                 err = 0;
6456                 break;
6457         case ZONE_ATTR_BOOTARGS:
6458                 err = zone_set_bootargs(zone, (const char *)buf);
6459                 break;
6460         case ZONE_ATTR_BRAND:
6461                 err = zone_set_brand(zone, (const char *)buf);
6462                 break;
6463         case ZONE_ATTR_FS_ALLOWED:
6464                 err = zone_set_fs_allowed(zone, (const char *)buf);
6465                 break;
6466         case ZONE_ATTR_SECFLAGS:
6467                 err = zone_set_secflags(zone, (psecflags_t *)buf);
6468                 break;
6469         case ZONE_ATTR_SCHED_CLASS:
6470                 err = zone_set_sched_class(zone, (const char *)buf);
6471                 break;
6472         case ZONE_ATTR_HOSTID:
6473                 if (bufsize == sizeof (zone->zone_hostid)) {
6474                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
6475                                 err = 0;
6476                         else
6477                                 err = EFAULT;
6478                 } else {
6479                         err = EINVAL;
6480                 }
6481                 break;
6482         case ZONE_ATTR_NETWORK:
6483                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
6484                         err = EINVAL;
6485                         break;
6486                 }
6487                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6488                 if (copyin(buf, zbuf, bufsize) != 0) {
6489                         kmem_free(zbuf, bufsize);
6490                         err = EFAULT;
6491                         break;
6492                 }
6493                 err = zone_set_network(zoneid, zbuf);
6494                 kmem_free(zbuf, bufsize);
6495                 break;
6496         case ZONE_ATTR_APP_SVC_CT:
6497                 if (bufsize != sizeof (boolean_t)) {
6498                         err = EINVAL;
6499                 } else {
6500                         zone->zone_setup_app_contract = (boolean_t)buf;
6501                         err = 0;
6502                 }
6503                 break;
6504         case ZONE_ATTR_SCHED_FIXEDHI:
6505                 if (bufsize != sizeof (boolean_t)) {
6506                         err = EINVAL;
6507                 } else {
6508                         zone->zone_fixed_hipri = (boolean_t)buf;
6509                         err = 0;
6510                 }
6511                 break;
6512         default:
6513                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
6514                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
6515                 else
6516                         err = EINVAL;
6517         }
6518 
6519 done:
6520         zone_rele(zone);
6521         ASSERT(err != -1);
6522         return (err != 0 ? set_errno(err) : 0);
6523 }
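
/*
 * For illustration, a minimal userland sketch, assuming the
 * zone_setattr() wrapper declared in <zone.h>.  Per the state check
 * above, this only succeeds while the zone is no further along than
 * ZONE_IS_READY:
 *
 *      #include <zone.h>
 *      #include <string.h>
 *
 *      int
 *      set_bootargs(zoneid_t zoneid, char *args)
 *      {
 *              return (zone_setattr(zoneid, ZONE_ATTR_BOOTARGS, args,
 *                  strlen(args) + 1));
 *      }
 */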
6524 
6525 /*
 * Return zero if the process has at least one vnode mapped into its
 * address space which shouldn't be allowed to change zones.
6528  *
6529  * Also return zero if the process has any shared mappings which reserve
6530  * swap.  This is because the counting for zone.max-swap does not allow swap
 * reservation to be shared between zones.  Zone swap reservation is
 * counted in zone->zone_max_swap.
6533  */
6534 static int
6535 as_can_change_zones(void)
6536 {
6537         proc_t *pp = curproc;
6538         struct seg *seg;
6539         struct as *as = pp->p_as;
6540         vnode_t *vp;
6541         int allow = 1;
6542 
6543         ASSERT(pp->p_as != &kas);
6544         AS_LOCK_ENTER(as, RW_READER);
6545         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
6546 
6547                 /*
6548                  * Cannot enter zone with shared anon memory which
6549                  * reserves swap.  See comment above.
6550                  */
6551                 if (seg_can_change_zones(seg) == B_FALSE) {
6552                         allow = 0;
6553                         break;
6554                 }
6555                 /*
                 * If we can't get a backing vnode for this segment,
                 * then skip it.
6558                  */
6559                 vp = NULL;
6560                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
6561                         continue;
6562                 if (!vn_can_change_zones(vp)) { /* bail on first match */
6563                         allow = 0;
6564                         break;
6565                 }
6566         }
6567         AS_LOCK_EXIT(as);
6568         return (allow);
6569 }
6570 
6571 /*
6572  * Count swap reserved by curproc's address space
6573  */
6574 static size_t
6575 as_swresv(void)
6576 {
6577         proc_t *pp = curproc;
6578         struct seg *seg;
6579         struct as *as = pp->p_as;
6580         size_t swap = 0;
6581 
6582         ASSERT(pp->p_as != &kas);
6583         ASSERT(AS_WRITE_HELD(as));
6584         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
6585                 swap += seg_swresv(seg);
6586 
6587         return (swap);
6588 }
6589 
6590 /*
6591  * Systemcall entry point for zone_enter().
6592  *
6593  * The current process is injected into said zone.  In the process
6594  * it will change its project membership, privileges, rootdir/cwd,
6595  * zone-wide rctls, and pool association to match those of the zone.
6596  *
6597  * The first zone_enter() called while the zone is in the ZONE_IS_READY
6598  * state will transition it to ZONE_IS_RUNNING.  Processes may only
6599  * enter a zone that is "ready" or "running".
6600  */
6601 static int
6602 zone_enter(zoneid_t zoneid)
6603 {
6604         zone_t *zone;
6605         vnode_t *vp;
6606         proc_t *pp = curproc;
6607         contract_t *ct;
6608         cont_process_t *ctp;
6609         task_t *tk, *oldtk;
6610         kproject_t *zone_proj0;
6611         cred_t *cr, *newcr;
6612         pool_t *oldpool, *newpool;
6613         sess_t *sp;
6614         uid_t uid;
6615         zone_status_t status;
6616         int err = 0;
6617         rctl_entity_p_t e;
6618         size_t swap;
6619         kthread_id_t t;
6620 
6621         if (secpolicy_zone_config(CRED()) != 0)
6622                 return (set_errno(EPERM));
6623         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
6624                 return (set_errno(EINVAL));
6625 
6626         /*
6627          * Stop all lwps so we don't need to hold a lock to look at
6628          * curproc->p_zone.  This needs to happen before we grab any
6629          * locks to avoid deadlock (another lwp in the process could
6630          * be waiting for the held lock).
6631          */
6632         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
6633                 return (set_errno(EINTR));
6634 
6635         /*
         * Make sure we're not changing zones with files open or mapped
         * into our address space which shouldn't be changing zones.
6638          */
6639         if (!files_can_change_zones()) {
6640                 err = EBADF;
6641                 goto out;
6642         }
6643         if (!as_can_change_zones()) {
6644                 err = EFAULT;
6645                 goto out;
6646         }
6647 
6648         mutex_enter(&zonehash_lock);
6649         if (pp->p_zone != global_zone) {
6650                 mutex_exit(&zonehash_lock);
6651                 err = EINVAL;
6652                 goto out;
6653         }
6654 
6655         zone = zone_find_all_by_id(zoneid);
6656         if (zone == NULL) {
6657                 mutex_exit(&zonehash_lock);
6658                 err = EINVAL;
6659                 goto out;
6660         }
6661 
6662         /*
6663          * To prevent processes in a zone from holding contracts on
6664          * extrazonal resources, and to avoid process contract
6665          * memberships which span zones, contract holders and processes
6666          * which aren't the sole members of their encapsulating process
6667          * contracts are not allowed to zone_enter.
6668          */
6669         ctp = pp->p_ct_process;
6670         ct = &ctp->conp_contract;
6671         mutex_enter(&ct->ct_lock);
6672         mutex_enter(&pp->p_lock);
6673         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
6674                 mutex_exit(&pp->p_lock);
6675                 mutex_exit(&ct->ct_lock);
6676                 mutex_exit(&zonehash_lock);
6677                 err = EINVAL;
6678                 goto out;
6679         }
6680 
6681         /*
6682          * Moreover, we don't allow processes whose encapsulating
6683          * process contracts have inherited extrazonal contracts.
6684          * While it would be easier to eliminate all process contracts
6685          * with inherited contracts, we need to be able to give a
6686          * restarted init (or other zone-penetrating process) its
6687          * predecessor's contracts.
6688          */
6689         if (ctp->conp_ninherited != 0) {
6690                 contract_t *next;
6691                 for (next = list_head(&ctp->conp_inherited); next;
6692                     next = list_next(&ctp->conp_inherited, next)) {
6693                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
6694                                 mutex_exit(&pp->p_lock);
6695                                 mutex_exit(&ct->ct_lock);
6696                                 mutex_exit(&zonehash_lock);
6697                                 err = EINVAL;
6698                                 goto out;
6699                         }
6700                 }
6701         }
6702 
6703         mutex_exit(&pp->p_lock);
6704         mutex_exit(&ct->ct_lock);
6705 
6706         status = zone_status_get(zone);
6707         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6708                 /*
6709                  * Can't join
6710                  */
6711                 mutex_exit(&zonehash_lock);
6712                 err = EINVAL;
6713                 goto out;
6714         }
6715 
6716         /*
6717          * Make sure new priv set is within the permitted set for caller
6718          */
6719         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6720                 mutex_exit(&zonehash_lock);
6721                 err = EPERM;
6722                 goto out;
6723         }
6724         /*
6725          * We want to momentarily drop zonehash_lock while we optimistically
6726          * bind curproc to the pool it should be running in.  This is safe
6727          * since the zone can't disappear (we have a hold on it).
6728          */
6729         zone_hold(zone);
6730         mutex_exit(&zonehash_lock);
6731 
6732         /*
6733          * Grab pool_lock to keep the pools configuration from changing
6734          * and to stop ourselves from getting rebound to another pool
6735          * until we join the zone.
6736          */
6737         if (pool_lock_intr() != 0) {
6738                 zone_rele(zone);
6739                 err = EINTR;
6740                 goto out;
6741         }
6742         ASSERT(secpolicy_pool(CRED()) == 0);
6743         /*
6744          * Bind ourselves to the pool currently associated with the zone.
6745          */
6746         oldpool = curproc->p_pool;
6747         newpool = zone_pool_get(zone);
6748         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6749             (err = pool_do_bind(newpool, P_PID, P_MYID,
6750             POOL_BIND_ALL)) != 0) {
6751                 pool_unlock();
6752                 zone_rele(zone);
6753                 goto out;
6754         }
6755 
6756         /*
6757          * Grab cpu_lock now; we'll need it later when we call
6758          * task_join().
6759          */
6760         mutex_enter(&cpu_lock);
6761         mutex_enter(&zonehash_lock);
6762         /*
6763          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6764          */
6765         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6766                 /*
6767                  * Can't join anymore.
6768                  */
6769                 mutex_exit(&zonehash_lock);
6770                 mutex_exit(&cpu_lock);
6771                 if (pool_state == POOL_ENABLED &&
6772                     newpool != oldpool)
6773                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6774                             POOL_BIND_ALL);
6775                 pool_unlock();
6776                 zone_rele(zone);
6777                 err = EINVAL;
6778                 goto out;
6779         }
6780 
6781         /*
         * a_lock must be held while transferring locked memory and swap
         * reservation from the global zone to the non-global zone because
         * asynchronous faults on the process's address space can lock
         * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
         * segments respectively.
6787          */
6788         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6789         swap = as_swresv();
6790         mutex_enter(&pp->p_lock);
6791         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
        /* verify that we do not exceed any task or lwp limits */
6793         mutex_enter(&zone->zone_nlwps_lock);
6794         /* add new lwps to zone and zone's proj0 */
6795         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6796         zone->zone_nlwps += pp->p_lwpcnt;
6797         /* add 1 task to zone's proj0 */
6798         zone_proj0->kpj_ntasks += 1;
6799 
6800         zone_proj0->kpj_nprocs++;
6801         zone->zone_nprocs++;
6802         mutex_exit(&zone->zone_nlwps_lock);
6803 
6804         mutex_enter(&zone->zone_mem_lock);
6805         zone->zone_locked_mem += pp->p_locked_mem;
6806         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6807         zone->zone_max_swap += swap;
6808         mutex_exit(&zone->zone_mem_lock);
6809 
6810         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6811         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6812         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6813 
6814         /* remove lwps and process from proc's old zone and old project */
6815         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6816         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6817         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6818         pp->p_task->tk_proj->kpj_nprocs--;
6819         pp->p_zone->zone_nprocs--;
6820         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6821 
6822         mutex_enter(&pp->p_zone->zone_mem_lock);
6823         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6824         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6825         pp->p_zone->zone_max_swap -= swap;
6826         mutex_exit(&pp->p_zone->zone_mem_lock);
6827 
6828         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6829         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6830         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6831 
6832         pp->p_flag |= SZONETOP;
6833         pp->p_zone = zone;
6834         mutex_exit(&pp->p_lock);
6835         AS_LOCK_EXIT(pp->p_as);
6836 
6837         /*
6838          * Joining the zone cannot fail from now on.
6839          *
6840          * This means that a lot of the following code can be commonized and
6841          * shared with zsched().
6842          */
6843 
6844         /*
6845          * If the process contract fmri was inherited, we need to
6846          * flag this so that any contract status will not leak
6847          * extra zone information, svc_fmri in this case
6848          */
6849         if (ctp->conp_svc_ctid != ct->ct_id) {
6850                 mutex_enter(&ct->ct_lock);
6851                 ctp->conp_svc_zone_enter = ct->ct_id;
6852                 mutex_exit(&ct->ct_lock);
6853         }
6854 
6855         /*
6856          * Reset the encapsulating process contract's zone.
6857          */
6858         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6859         contract_setzuniqid(ct, zone->zone_uniqid);
6860 
6861         /*
6862          * Create a new task and associate the process with the project keyed
6863          * by (projid,zoneid).
6864          *
6865          * We might as well be in project 0; the global zone's projid doesn't
6866          * make much sense in a zone anyhow.
6867          *
6868          * This also increments zone_ntasks, and returns with p_lock held.
6869          */
6870         tk = task_create(0, zone);
6871         oldtk = task_join(tk, 0);
6872         mutex_exit(&cpu_lock);
6873 
6874         /*
6875          * call RCTLOP_SET functions on this proc
6876          */
6877         e.rcep_p.zone = zone;
6878         e.rcep_t = RCENTITY_ZONE;
6879         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6880             RCD_CALLBACK);
6881         mutex_exit(&pp->p_lock);
6882 
6883         /*
6884          * We don't need to hold any of zsched's locks here; not only do we know
6885          * the process and zone aren't going away, we know its session isn't
6886          * changing either.
6887          *
6888          * By joining zsched's session here, we mimic the behavior in the
6889          * global zone of init's sid being the pid of sched.  We extend this
6890          * to all zlogin-like zone_enter()'ing processes as well.
6891          */
6892         mutex_enter(&pidlock);
6893         sp = zone->zone_zsched->p_sessp;
6894         sess_hold(zone->zone_zsched);
6895         mutex_enter(&pp->p_lock);
6896         pgexit(pp);
6897         sess_rele(pp->p_sessp, B_TRUE);
6898         pp->p_sessp = sp;
6899         pgjoin(pp, zone->zone_zsched->p_pidp);
6900 
6901         /*
6902          * If any threads are scheduled to be placed on zone wait queue they
6903          * should abandon the idea since the wait queue is changing.
6904          * We need to be holding pidlock & p_lock to do this.
6905          */
6906         if ((t = pp->p_tlist) != NULL) {
6907                 do {
6908                         thread_lock(t);
6909                         /*
6910                          * Kick this thread so that it doesn't sit
6911                          * on a wrong wait queue.
6912                          */
6913                         if (ISWAITING(t))
6914                                 setrun_locked(t);
6915 
6916                         if (t->t_schedflag & TS_ANYWAITQ)
                                t->t_schedflag &= ~TS_ANYWAITQ;
6918 
6919                         thread_unlock(t);
6920                 } while ((t = t->t_forw) != pp->p_tlist);
6921         }
6922 
6923         /*
6924          * If there is a default scheduling class for the zone and it is not
6925          * the class we are currently in, change all of the threads in the
6926          * process to the new class.  We need to be holding pidlock & p_lock
6927          * when we call parmsset so this is a good place to do it.
6928          */
6929         if (zone->zone_defaultcid > 0 &&
6930             zone->zone_defaultcid != curthread->t_cid) {
6931                 pcparms_t pcparms;
6932 
6933                 pcparms.pc_cid = zone->zone_defaultcid;
6934                 pcparms.pc_clparms[0] = 0;
6935 
6936                 /*
6937                  * If setting the class fails, we still want to enter the zone.
6938                  */
6939                 if ((t = pp->p_tlist) != NULL) {
6940                         do {
6941                                 (void) parmsset(&pcparms, t);
6942                         } while ((t = t->t_forw) != pp->p_tlist);
6943                 }
6944         }
6945 
6946         mutex_exit(&pp->p_lock);
6947         mutex_exit(&pidlock);
6948 
6949         mutex_exit(&zonehash_lock);
6950         /*
6951          * We're firmly in the zone; let pools progress.
6952          */
6953         pool_unlock();
6954         task_rele(oldtk);
6955         /*
6956          * We don't need to retain a hold on the zone since we already
6957          * incremented zone_ntasks, so the zone isn't going anywhere.
6958          */
6959         zone_rele(zone);
6960 
6961         /*
6962          * Chroot
6963          */
6964         vp = zone->zone_rootvp;
6965         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6966         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6967 
6968         /*
6969          * Change process security flags.  Note that the _effective_ flags
         * cannot change.
6971          */
6972         secflags_copy(&pp->p_secflags.psf_lower,
6973             &zone->zone_secflags.psf_lower);
6974         secflags_copy(&pp->p_secflags.psf_upper,
6975             &zone->zone_secflags.psf_upper);
6976         secflags_copy(&pp->p_secflags.psf_inherit,
6977             &zone->zone_secflags.psf_inherit);
6978 
6979         /*
6980          * Change process credentials
6981          */
6982         newcr = cralloc();
6983         mutex_enter(&pp->p_crlock);
6984         cr = pp->p_cred;
6985         crcopy_to(cr, newcr);
6986         crsetzone(newcr, zone);
6987         pp->p_cred = newcr;
6988 
6989         /*
6990          * Restrict all process privilege sets to zone limit
6991          */
6992         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6993         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6994         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6995         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6996         mutex_exit(&pp->p_crlock);
6997         crset(pp, newcr);
6998 
6999         /*
7000          * Adjust upcount to reflect zone entry.
7001          */
7002         uid = crgetruid(newcr);
7003         mutex_enter(&pidlock);
7004         upcount_dec(uid, GLOBAL_ZONEID);
7005         upcount_inc(uid, zoneid);
7006         mutex_exit(&pidlock);
7007 
7008         /*
7009          * Set up core file path and content.
7010          */
7011         set_core_defaults();
7012 
7013 out:
7014         /*
7015          * Let the other lwps continue.
7016          */
7017         mutex_enter(&pp->p_lock);
7018         if (curthread != pp->p_agenttp)
7019                 continuelwps(pp);
7020         mutex_exit(&pp->p_lock);
7021 
7022         return (err != 0 ? set_errno(err) : 0);
7023 }
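
/*
 * For illustration, a minimal userland sketch of the zlogin-style entry
 * sequence, assuming the zone_enter() wrapper declared in <zone.h>.
 * Since zone_enter() moves the whole process, it is normally called in
 * a freshly forked child (from the global zone, with the zone
 * configuration privilege), which then execs a command inside the zone:
 *
 *      #include <zone.h>
 *      #include <unistd.h>
 *
 *      pid_t
 *      run_in_zone(zoneid_t zoneid, char *path, char *argv[])
 *      {
 *              pid_t pid;
 *
 *              if ((pid = fork()) == 0) {
 *                      if (zone_enter(zoneid) != 0)
 *                              _exit(1);
 *                      (void) execv(path, argv);
 *                      _exit(1);
 *              }
 *              return (pid);
 *      }
 */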
7024 
7025 /*
7026  * Systemcall entry point for zone_list(2).
7027  *
7028  * Processes running in a (non-global) zone only see themselves.
7029  * On labeled systems, they see all zones whose label they dominate.
7030  */
7031 static int
7032 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
7033 {
7034         zoneid_t *zoneids;
7035         zone_t *zone, *myzone;
7036         uint_t user_nzones, real_nzones;
7037         uint_t domi_nzones;
7038         int error;
7039 
7040         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
7041                 return (set_errno(EFAULT));
7042 
7043         myzone = curproc->p_zone;
7044         if (myzone != global_zone) {
7045                 bslabel_t *mybslab;
7046 
7047                 if (!is_system_labeled()) {
7048                         /* just return current zone */
7049                         real_nzones = domi_nzones = 1;
7050                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
7051                         zoneids[0] = myzone->zone_id;
7052                 } else {
7053                         /* return all zones that are dominated */
7054                         mutex_enter(&zonehash_lock);
7055                         real_nzones = zonecount;
7056                         domi_nzones = 0;
7057                         if (real_nzones > 0) {
7058                                 zoneids = kmem_alloc(real_nzones *
7059                                     sizeof (zoneid_t), KM_SLEEP);
7060                                 mybslab = label2bslabel(myzone->zone_slabel);
7061                                 for (zone = list_head(&zone_active);
7062                                     zone != NULL;
7063                                     zone = list_next(&zone_active, zone)) {
7064                                         if (zone->zone_id == GLOBAL_ZONEID)
7065                                                 continue;
7066                                         if (zone != myzone &&
7067                                             (zone->zone_flags & ZF_IS_SCRATCH))
7068                                                 continue;
7069                                         /*
7070                                          * Note that a label always dominates
7071                                          * itself, so myzone is always included
7072                                          * in the list.
7073                                          */
7074                                         if (bldominates(mybslab,
7075                                             label2bslabel(zone->zone_slabel))) {
7076                                                 zoneids[domi_nzones++] =
7077                                                     zone->zone_id;
7078                                         }
7079                                 }
7080                         }
7081                         mutex_exit(&zonehash_lock);
7082                 }
7083         } else {
7084                 mutex_enter(&zonehash_lock);
7085                 real_nzones = zonecount;
7086                 domi_nzones = 0;
7087                 if (real_nzones > 0) {
7088                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
7089                             KM_SLEEP);
7090                         for (zone = list_head(&zone_active); zone != NULL;
7091                             zone = list_next(&zone_active, zone))
7092                                 zoneids[domi_nzones++] = zone->zone_id;
7093                         ASSERT(domi_nzones == real_nzones);
7094                 }
7095                 mutex_exit(&zonehash_lock);
7096         }
7097 
7098         /*
         * If the user has allocated space for fewer entries than we found,
7100          * return only up to their limit.  Either way, tell them exactly how
7101          * many we found.
7102          */
7103         if (domi_nzones < user_nzones)
7104                 user_nzones = domi_nzones;
7105         error = 0;
7106         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
7107                 error = EFAULT;
7108         } else if (zoneidlist != NULL && user_nzones != 0) {
7109                 if (copyout(zoneids, zoneidlist,
7110                     user_nzones * sizeof (zoneid_t)) != 0)
7111                         error = EFAULT;
7112         }
7113 
7114         if (real_nzones > 0)
7115                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
7116 
7117         if (error != 0)
7118                 return (set_errno(error));
7119         else
7120                 return (0);
7121 }
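
/*
 * Illustrative sketch (not part of this file): the copyout protocol above
 * supports the usual two-call sizing pattern from userland, assuming the
 * zone_list() wrapper declared in <zone.h>:
 *
 *	uint_t nzones = 0;
 *	zoneid_t *ids;
 *
 *	if (zone_list(NULL, &nzones) != 0)
 *		err(1, "zone_list");
 *	if ((ids = malloc(nzones * sizeof (zoneid_t))) == NULL)
 *		err(1, "malloc");
 *	if (zone_list(ids, &nzones) != 0)
 *		err(1, "zone_list");
 *
 * Since zones can come and go between the two calls, careful callers retry
 * until the returned count fits the buffer they passed in.
 */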
7122 
7123 /*
7124  * System call entry point for zone_lookup(2).
7125  *
7126  * Non-global zones are only able to see themselves and (on labeled systems)
7127  * the zones they dominate.
7128  */
7129 static zoneid_t
7130 zone_lookup(const char *zone_name)
7131 {
7132         char *kname;
7133         zone_t *zone;
7134         zoneid_t zoneid;
7135         int err;
7136 
7137         if (zone_name == NULL) {
7138                 /* return caller's zone id */
7139                 return (getzoneid());
7140         }
7141 
7142         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
7143         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
7144                 kmem_free(kname, ZONENAME_MAX);
7145                 return (set_errno(err));
7146         }
7147 
7148         mutex_enter(&zonehash_lock);
7149         zone = zone_find_all_by_name(kname);
7150         kmem_free(kname, ZONENAME_MAX);
7151         /*
7152          * In a non-global zone, a process can only look up its own name;
7153          * under Trusted Extensions, zone label dominance rules apply.
7154          */
7155         if (zone == NULL ||
7156             zone_status_get(zone) < ZONE_IS_READY ||
7157             !zone_list_access(zone)) {
7158                 mutex_exit(&zonehash_lock);
7159                 return (set_errno(EINVAL));
7160         } else {
7161                 zoneid = zone->zone_id;
7162                 mutex_exit(&zonehash_lock);
7163                 return (zoneid);
7164         }
7165 }
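
/*
 * Illustrative sketch (not part of this file): userland reaches this lookup
 * through the libc wrapper getzoneidbyname(3C).  For a hypothetical zone
 * named "web01":
 *
 *	zoneid_t zid;
 *
 *	if ((zid = getzoneidbyname("web01")) == -1)
 *		err(1, "getzoneidbyname");
 *
 * Passing NULL returns the caller's own zone ID, matching the
 * zone_name == NULL case above.
 */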
7166 
7167 static int
7168 zone_version(int *version_arg)
7169 {
7170         int version = ZONE_SYSCALL_API_VERSION;
7171 
7172         if (copyout(&version, version_arg, sizeof (int)) != 0)
7173                 return (set_errno(EFAULT));
7174         return (0);
7175 }
7176 
7177 /* ARGSUSED */
7178 long
7179 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
7180 {
7181         zone_def zs;
7182         int err;
7183 
7184         switch (cmd) {
7185         case ZONE_CREATE:
7186                 if (get_udatamodel() == DATAMODEL_NATIVE) {
7187                         if (copyin(arg1, &zs, sizeof (zone_def))) {
7188                                 return (set_errno(EFAULT));
7189                         }
7190                 } else {
7191 #ifdef _SYSCALL32_IMPL
7192                         zone_def32 zs32;
7193 
7194                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
7195                                 return (set_errno(EFAULT));
7196                         }
7197                         zs.zone_name =
7198                             (const char *)(unsigned long)zs32.zone_name;
7199                         zs.zone_root =
7200                             (const char *)(unsigned long)zs32.zone_root;
7201                         zs.zone_privs =
7202                             (const struct priv_set *)
7203                             (unsigned long)zs32.zone_privs;
7204                         zs.zone_privssz = zs32.zone_privssz;
7205                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
7206                         zs.rctlbufsz = zs32.rctlbufsz;
7207                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
7208                         zs.zfsbufsz = zs32.zfsbufsz;
7209                         zs.extended_error =
7210                             (int *)(unsigned long)zs32.extended_error;
7211                         zs.match = zs32.match;
7212                         zs.doi = zs32.doi;
7213                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
7214                         zs.flags = zs32.flags;
7215                         zs.zoneid = zs32.zoneid;
7216 #else
7217                         panic("get_udatamodel() returned bogus result\n");
7218 #endif
7219                 }
7220 
7221                 return (zone_create(zs.zone_name, zs.zone_root,
7222                     zs.zone_privs, zs.zone_privssz,
7223                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
7224                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
7225                     zs.extended_error, zs.match, zs.doi,
7226                     zs.label, zs.flags, zs.zoneid));
7227         case ZONE_BOOT:
7228                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
7229         case ZONE_DESTROY:
7230                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
7231         case ZONE_GETATTR:
7232                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
7233                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7234         case ZONE_SETATTR:
7235                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
7236                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7237         case ZONE_ENTER:
7238                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
7239         case ZONE_LIST:
7240                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
7241         case ZONE_SHUTDOWN:
7242                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
7243         case ZONE_LOOKUP:
7244                 return (zone_lookup((const char *)arg1));
7245         case ZONE_VERSION:
7246                 return (zone_version((int *)arg1));
7247         case ZONE_ADD_DATALINK:
7248                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
7249                     (datalink_id_t)(uintptr_t)arg2));
7250         case ZONE_DEL_DATALINK:
7251                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
7252                     (datalink_id_t)(uintptr_t)arg2));
7253         case ZONE_CHECK_DATALINK: {
7254                 zoneid_t        zoneid;
7255                 boolean_t       need_copyout;
7256 
7257                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
7258                         return (set_errno(EFAULT));
7259                 need_copyout = (zoneid == ALL_ZONES);
7260                 err = zone_check_datalink(&zoneid,
7261                     (datalink_id_t)(uintptr_t)arg2);
7262                 if (err == 0 && need_copyout) {
7263                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
7264                                 err = EFAULT;
7265                 }
7266                 return (err == 0 ? 0 : set_errno(err));
7267         }
7268         case ZONE_LIST_DATALINK:
7269                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
7270                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
7271         default:
7272                 return (set_errno(EINVAL));
7273         }
7274 }
7275 
7276 struct zarg {
7277         zone_t *zone;
7278         zone_cmd_arg_t arg;
7279 };
7280 
7281 static int
7282 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
7283 {
7284         char *buf;
7285         size_t buflen;
7286         int error;
7287 
7288         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
7289         buf = kmem_alloc(buflen, KM_SLEEP);
7290         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
7291         error = door_ki_open(buf, doorp);
7292         kmem_free(buf, buflen);
7293         return (error);
7294 }
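
/*
 * For example, assuming the customary ZONE_DOOR_PATH definition of
 * "/var/run/zones/%s.zoneadmd_door", a zone named "web01" yields
 * "/var/run/zones/web01.zoneadmd_door" as the door to its zoneadmd.
 */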
7295 
7296 static void
7297 zone_release_door(door_handle_t *doorp)
7298 {
7299         door_ki_rele(*doorp);
7300         *doorp = NULL;
7301 }
7302 
7303 static void
7304 zone_ki_call_zoneadmd(struct zarg *zargp)
7305 {
7306         door_handle_t door = NULL;
7307         door_arg_t darg, save_arg;
7308         char *zone_name;
7309         size_t zone_namelen;
7310         zoneid_t zoneid;
7311         zone_t *zone;
7312         zone_cmd_arg_t arg;
7313         uint64_t uniqid;
7314         size_t size;
7315         int error;
7316         int retry;
7317 
7318         zone = zargp->zone;
7319         arg = zargp->arg;
7320         kmem_free(zargp, sizeof (*zargp));
7321 
7322         zone_namelen = strlen(zone->zone_name) + 1;
7323         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
7324         bcopy(zone->zone_name, zone_name, zone_namelen);
7325         zoneid = zone->zone_id;
7326         uniqid = zone->zone_uniqid;
7327         arg.status = zone->zone_init_status;
7328         /*
7329          * zoneadmd may be down, but at least we can empty out the zone.
7330          * We can ignore the return value of zone_empty() since we're called
7331          * from a kernel thread and know we won't be delivered any signals.
7332          */
7333         ASSERT(curproc == &p0);
7334         (void) zone_empty(zone);
7335         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
7336         zone_rele(zone);
7337 
7338         size = sizeof (arg);
7339         darg.rbuf = (char *)&arg;
7340         darg.data_ptr = (char *)&arg;
7341         darg.rsize = size;
7342         darg.data_size = size;
7343         darg.desc_ptr = NULL;
7344         darg.desc_num = 0;
7345 
7346         save_arg = darg;
7347         /*
7348          * Since we're not holding a reference to the zone, any number of
7349          * things can go wrong, including the zone disappearing before we get a
7350          * chance to talk to zoneadmd.
7351          */
7352         for (retry = 0; /* forever */; retry++) {
7353                 if (door == NULL &&
7354                     (error = zone_lookup_door(zone_name, &door)) != 0) {
7355                         goto next;
7356                 }
7357                 ASSERT(door != NULL);
7358 
7359                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
7360                     SIZE_MAX, 0)) == 0) {
7361                         break;
7362                 }
7363                 switch (error) {
7364                 case EINTR:
7365                         /* FALLTHROUGH */
7366                 case EAGAIN:    /* process may be forking */
7367                         /*
7368                          * Back off for a bit
7369                          */
7370                         break;
7371                 case EBADF:
7372                         zone_release_door(&door);
7373                         if (zone_lookup_door(zone_name, &door) != 0) {
7374                                 /*
7375                                  * zoneadmd may be dead, but it may come back to
7376                                  * life later.
7377                                  */
7378                                 break;
7379                         }
7380                         break;
7381                 default:
7382                         cmn_err(CE_WARN,
7383                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
7384                             error);
7385                         goto out;
7386                 }
7387 next:
7388                 /*
7389                  * If this isn't the same zone_t that we originally had in mind,
7390                  * then this is the same as if two kadmin requests come in at
7391                  * the same time: the first one wins.  This means we lose, so we
7392                  * bail.
7393                  */
7394                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
7395                         /*
7396                          * Problem is solved.
7397                          */
7398                         break;
7399                 }
7400                 if (zone->zone_uniqid != uniqid) {
7401                         /*
7402                          * zoneid recycled
7403                          */
7404                         zone_rele(zone);
7405                         break;
7406                 }
7407                 /*
7408                  * We could zone_status_timedwait(), but there doesn't seem to
7409                  * be much point in doing that (plus, it would mean that
7410                  * zone_free() isn't called until this thread exits).
7411                  */
7412                 zone_rele(zone);
7413                 delay(hz);
7414                 darg = save_arg;
7415         }
7416 out:
7417         if (door != NULL) {
7418                 zone_release_door(&door);
7419         }
7420         kmem_free(zone_name, zone_namelen);
7421         thread_exit();
7422 }
7423 
7424 /*
7425  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
7426  * kadmin().  The caller is a process in the zone.
7427  *
7428  * In order to shutdown the zone, we will hand off control to zoneadmd
7429  * (running in the global zone) via a door.  We do a half-hearted job of
7430  * killing all processes in the zone, create a kernel thread to contact
7431  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
7432  * a form of generation number used to let zoneadmd (as well as
7433  * zone_destroy()) know exactly which zone they're talking about.
7434  */
7435 int
7436 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
7437 {
7438         struct zarg *zargp;
7439         zone_cmd_t zcmd;
7440         zone_t *zone;
7441 
7442         zone = curproc->p_zone;
7443         ASSERT(getzoneid() != GLOBAL_ZONEID);
7444 
7445         switch (cmd) {
7446         case A_SHUTDOWN:
7447                 switch (fcn) {
7448                 case AD_HALT:
7449                 case AD_POWEROFF:
7450                         zcmd = Z_HALT;
7451                         break;
7452                 case AD_BOOT:
7453                         zcmd = Z_REBOOT;
7454                         break;
7455                 case AD_IBOOT:
7456                 case AD_SBOOT:
7457                 case AD_SIBOOT:
7458                 case AD_NOSYNC:
7459                         return (ENOTSUP);
7460                 default:
7461                         return (EINVAL);
7462                 }
7463                 break;
7464         case A_REBOOT:
7465                 zcmd = Z_REBOOT;
7466                 break;
7467         case A_FTRACE:
7468         case A_REMOUNT:
7469         case A_FREEZE:
7470         case A_DUMP:
7471         case A_CONFIG:
7472                 return (ENOTSUP);
7473         default:
7474                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
7475                 return (EINVAL);
7476         }
7477 
7478         if (secpolicy_zone_admin(credp, B_FALSE))
7479                 return (EPERM);
7480         mutex_enter(&zone_status_lock);
7481 
7482         /*
7483          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
7484          * is in the zone.
7485          */
7486         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7487         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7488                 /*
7489                  * This zone is already on its way down.
7490                  */
7491                 mutex_exit(&zone_status_lock);
7492                 return (0);
7493         }
7494         /*
7495          * Prevent future zone_enter()s
7496          */
7497         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7498         mutex_exit(&zone_status_lock);
7499 
7500         /*
7501          * Kill everyone now and call zoneadmd later;
7502          * zone_ki_call_zoneadmd() will do a more thorough job of this.
7504          */
7505         killall(zone->zone_id, B_FALSE);
7506         /*
7507          * Now, create the thread to contact zoneadmd and do the rest of the
7508          * work.  This thread can't be created in our zone, otherwise
7509          * zone_destroy() would deadlock.
7510          */
7511         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7512         zargp->arg.cmd = zcmd;
7513         zargp->arg.uniqid = zone->zone_uniqid;
7514         zargp->zone = zone;
7515         (void) strcpy(zargp->arg.locale, "C");
7516         /* mdep was already copied in for us by uadmin */
7517         if (mdep != NULL)
7518                 (void) strlcpy(zargp->arg.bootbuf, mdep,
7519                     sizeof (zargp->arg.bootbuf));
7520         zone_hold(zone);
7521 
7522         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7523             TS_RUN, minclsyspri);
7524         exit(CLD_EXITED, 0);
7525 
7526         return (EINVAL);
7527 }
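
/*
 * Illustrative sketch (not part of this file): a process inside a zone that
 * requests a reboot via uadmin(2), e.g.:
 *
 *	#include <sys/uadmin.h>
 *
 *	if (uadmin(A_SHUTDOWN, AD_BOOT, 0) == -1)
 *		err(1, "uadmin");
 *
 * lands in zone_kadmin() above, where it maps to Z_REBOOT and is handed to
 * zoneadmd rather than rebooting the machine.
 */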
7528 
7529 /*
7530  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
7531  * status to ZONE_IS_SHUTTING_DOWN.
7532  *
7533  * This function also marks all running zones as shutting down to ensure that
7534  * they won't fork new processes.
7535  */
7536 void
7537 zone_shutdown_global(void)
7538 {
7539         zone_t *current_zonep;
7540 
7541         ASSERT(INGLOBALZONE(curproc));
7542         mutex_enter(&zonehash_lock);
7543         mutex_enter(&zone_status_lock);
7544 
7545         /* Modify the global zone's status first. */
7546         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
7547         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
7548 
7549         /*
7550          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
7551          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
7552          * could cause assertions to fail (e.g., assertions about a zone's
7553          * state during initialization, readying, or booting) or produce races.
7554          * We'll let threads continue to initialize and ready new zones: they'll
7555          * fail to boot the new zones when they see that the global zone is
7556          * shutting down.
7557          */
7558         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
7559             current_zonep = list_next(&zone_active, current_zonep)) {
7560                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
7561                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
7562         }
7563         mutex_exit(&zone_status_lock);
7564         mutex_exit(&zonehash_lock);
7565 }
7566 
7567 /*
7568  * Returns true if the named dataset is visible in the specified zone.
7569  * The 'write' parameter is set to 1 if the dataset is also writable.
7570  */
7571 int
7572 zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
7573 {
7574         static int zfstype = -1;
7575         zone_dataset_t *zd;
7576         size_t len;
7577         const char *name = NULL;
7578         vfs_t *vfsp = NULL;
7579 
7580         if (dataset[0] == '\0')
7581                 return (0);
7582 
7583         /*
7584          * Walk the list once, looking for datasets which match exactly, or
7585          * specify a dataset underneath an exported dataset.  If found, return
7586          * true and note that it is writable.
7587          */
7588         for (zd = list_head(&zone->zone_datasets); zd != NULL;
7589             zd = list_next(&zone->zone_datasets, zd)) {
7590 
7591                 len = strlen(zd->zd_dataset);
7592                 if (strlen(dataset) >= len &&
7593                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
7594                     (dataset[len] == '\0' || dataset[len] == '/' ||
7595                     dataset[len] == '@')) {
7596                         if (write)
7597                                 *write = 1;
7598                         return (1);
7599                 }
7600         }
7601 
7602         /*
7603          * Walk the list a second time, searching for datasets which are parents
7604          * of exported datasets.  These should be visible, but read-only.
7605          *
7606          * Note that we also have to support forms such as 'pool/dataset/', with
7607          * a trailing slash.
7608          */
7609         for (zd = list_head(&zone->zone_datasets); zd != NULL;
7610             zd = list_next(&zone->zone_datasets, zd)) {
7611 
7612                 len = strlen(dataset);
7613                 if (dataset[len - 1] == '/')
7614                         len--;  /* Ignore trailing slash */
7615                 if (len < strlen(zd->zd_dataset) &&
7616                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
7617                     zd->zd_dataset[len] == '/') {
7618                         if (write)
7619                                 *write = 0;
7620                         return (1);
7621                 }
7622         }
7623 
7624         /*
7625          * We reach here if the given dataset is not found in the zone_dataset
7626          * list. Check if this dataset was added as a filesystem (i.e., "add fs")
7627          * rather than delegated. For this we search for the dataset in the
7628          * zone_vfslist of this zone. If found, return true and note that it is
7629          * not writable.
7630          */
7631 
7632         /*
7633          * Initialize zfstype if it is not initialized yet.
7634          */
7635         if (zfstype == -1) {
7636                 struct vfssw *vswp = vfs_getvfssw("zfs");
7637                 zfstype = vswp - vfssw;
7638                 vfs_unrefvfssw(vswp);
7639         }
7640 
7641         vfs_list_read_lock();
7642         vfsp = zone->zone_vfslist;
7643         do {
7644                 if (vfsp == NULL)
7645                         break;
7646                 if (vfsp->vfs_fstype == zfstype) {
7647                         name = refstr_value(vfsp->vfs_resource);
7648 
7649                         /*
7650                          * Check if we have an exact match.
7651                          */
7652                         if (strcmp(dataset, name) == 0) {
7653                                 vfs_list_unlock();
7654                                 if (write)
7655                                         *write = 0;
7656                                 return (1);
7657                         }
7658                         /*
7659                          * We need to check if we are looking for parents of
7660                          * a dataset. These should be visible, but read-only.
7661                          */
7662                         len = strlen(dataset);
7663                         if (dataset[len - 1] == '/')
7664                                 len--;
7665 
7666                         if (len < strlen(name) &&
7667                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
7668                                 vfs_list_unlock();
7669                                 if (write)
7670                                         *write = 0;
7671                                 return (1);
7672                         }
7673                 }
7674                 vfsp = vfsp->vfs_zone_next;
7675         } while (vfsp != zone->zone_vfslist);
7676 
7677         vfs_list_unlock();
7678         return (0);
7679 }
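
/*
 * Worked example (illustrative): if "tank/zones/web" is delegated to the
 * zone, the rules above yield:
 *
 *	tank/zones/web		visible, writable (exact match)
 *	tank/zones/web/data	visible, writable (below the delegation)
 *	tank/zones/web@snap	visible, writable (snapshot of the delegation)
 *	tank/zones		visible, read-only (parent of the delegation)
 *	tank			visible, read-only (ancestor of the delegation)
 *	tank/other		not visible
 */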
7680 
7681 /*
7682  * Returns true if the named dataset is visible in the current zone.
7683  * The 'write' parameter is set to 1 if the dataset is also writable.
7684  */
7685 int
7686 zone_dataset_visible(const char *dataset, int *write)
7687 {
7688         zone_t *zone = curproc->p_zone;
7689 
7690         return (zone_dataset_visible_inzone(zone, dataset, write));
7691 }
7692 
7693 /*
7694  * zone_find_by_any_path() -
7695  *
7696  * kernel-private routine similar to zone_find_by_path(), but which
7697  * effectively compares against zone paths rather than zonerootpath
7698  * (i.e., the last component of zonerootpaths, which should be "root/",
7699  * is not compared).  This is done in order to accurately identify all
7700  * paths, whether zone-visible or not, including those which are parallel
7701  * to /root/, such as /dev/, /home/, etc...
7702  *
7703  * If the specified path does not fall under any zone path then the
7704  * global zone is returned.
7705  *
7706  * The treat_abs parameter indicates whether the path should be treated as
7707  * an absolute path even though it does not begin with "/".  (This supports
7708  * NFS mount syntax such as host:any/path.)
7709  *
7710  * The caller is responsible for zone_rele of the returned zone.
7711  */
7712 zone_t *
7713 zone_find_by_any_path(const char *path, boolean_t treat_abs)
7714 {
7715         zone_t *zone;
7716         int path_offset = 0;
7717 
7718         if (path == NULL) {
7719                 zone_hold(global_zone);
7720                 return (global_zone);
7721         }
7722 
7723         if (*path != '/') {
7724                 ASSERT(treat_abs);
7725                 path_offset = 1;
7726         }
7727 
7728         mutex_enter(&zonehash_lock);
7729         for (zone = list_head(&zone_active); zone != NULL;
7730             zone = list_next(&zone_active, zone)) {
7731                 char    *c;
7732                 size_t  pathlen;
7733                 char *rootpath_start;
7734 
7735                 if (zone == global_zone)        /* skip global zone */
7736                         continue;
7737 
7738                 /* scan backwards to find start of last component */
7739                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7740                 do {
7741                         c--;
7742                 } while (*c != '/');
7743 
7744                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
7745                 rootpath_start = (zone->zone_rootpath + path_offset);
7746                 if (strncmp(path, rootpath_start, pathlen) == 0)
7747                         break;
7748         }
7749         if (zone == NULL)
7750                 zone = global_zone;
7751         zone_hold(zone);
7752         mutex_exit(&zonehash_lock);
7753         return (zone);
7754 }
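
/*
 * Worked example (illustrative): for a zone whose zone_rootpath is
 * "/export/web01/root/", the code above strips the trailing "root/"
 * component and compares against "/export/web01/", so paths such as
 * "/export/web01/dev/" or "/export/web01/root/etc/passwd" match this zone,
 * while "/export/other/" falls through to the global zone.
 */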
7755 
7756 /*
7757  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7758  * zone_dl_t pointer if found, and NULL otherwise.
7759  */
7760 static zone_dl_t *
7761 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7762 {
7763         zone_dl_t *zdl;
7764 
7765         ASSERT(mutex_owned(&zone->zone_lock));
7766         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7767             zdl = list_next(&zone->zone_dl_list, zdl)) {
7768                 if (zdl->zdl_id == linkid)
7769                         break;
7770         }
7771         return (zdl);
7772 }
7773 
7774 static boolean_t
7775 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7776 {
7777         boolean_t exists;
7778 
7779         mutex_enter(&zone->zone_lock);
7780         exists = (zone_find_dl(zone, linkid) != NULL);
7781         mutex_exit(&zone->zone_lock);
7782         return (exists);
7783 }
7784 
7785 /*
7786  * Add a datalink ID to the zone's list of datalinks.
7787  */
7788 static int
7789 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7790 {
7791         zone_dl_t *zdl;
7792         zone_t *zone;
7793         zone_t *thiszone;
7794 
7795         /*
7796          * Only the GZ may add a datalink to a zone's list.
7797          */
7798         if (getzoneid() != GLOBAL_ZONEID)
7799                 return (set_errno(EPERM));
7800 
7801         /*
7802          * Only a process with the datalink config priv may add a
7803          * datalink to a zone's list.
7804          */
7805         if (secpolicy_dl_config(CRED()) != 0)
7806                 return (set_errno(EPERM));
7807 
7808         /*
7809          * When links exist in the GZ, they aren't added to the GZ's
7810          * zone_dl_list. We must enforce this because link_activate()
7811          * depends on zone_check_datalink() returning only NGZs.
7812          */
7813         if (zoneid == GLOBAL_ZONEID)
7814                 return (set_errno(EINVAL));
7815 
7816         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7817                 return (set_errno(ENXIO));
7818 
7819         /* Verify that the datalink ID doesn't already belong to a zone. */
7820         mutex_enter(&zonehash_lock);
7821         for (zone = list_head(&zone_active); zone != NULL;
7822             zone = list_next(&zone_active, zone)) {
7823                 if (zone_dl_exists(zone, linkid)) {
7824                         mutex_exit(&zonehash_lock);
7825                         zone_rele(thiszone);
7826                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7827                 }
7828         }
7829 
7830         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7831         zdl->zdl_id = linkid;
7832         zdl->zdl_net = NULL;
7833         mutex_enter(&thiszone->zone_lock);
7834         list_insert_head(&thiszone->zone_dl_list, zdl);
7835         mutex_exit(&thiszone->zone_lock);
7836         mutex_exit(&zonehash_lock);
7837         zone_rele(thiszone);
7838         return (0);
7839 }
7840 
7841 static int
7842 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7843 {
7844         zone_dl_t *zdl;
7845         zone_t *zone;
7846         int err = 0;
7847 
7848         /*
7849          * Only the GZ may remove a datalink from a zone's list.
7850          */
7851         if (getzoneid() != GLOBAL_ZONEID)
7852                 return (set_errno(EPERM));
7853 
7854         /*
7855          * Only a process with the datalink config priv may remove a
7856          * datalink from a zone's list.
7857          */
7858         if (secpolicy_dl_config(CRED()) != 0)
7859                 return (set_errno(EPERM));
7860 
7861         /*
7862          * If we can't add a datalink to the GZ's zone_dl_list then we
7863          * certainly can't remove one either.
7864          */
7865         if (zoneid == GLOBAL_ZONEID)
7866                 return (set_errno(EINVAL));
7867 
7868         if ((zone = zone_find_by_id(zoneid)) == NULL)
7869                 return (set_errno(EINVAL));
7870 
7871         mutex_enter(&zone->zone_lock);
7872         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7873                 err = ENXIO;
7874         } else {
7875                 list_remove(&zone->zone_dl_list, zdl);
7876                 nvlist_free(zdl->zdl_net);
7877                 kmem_free(zdl, sizeof (zone_dl_t));
7878         }
7879         mutex_exit(&zone->zone_lock);
7880         zone_rele(zone);
7881         return (err == 0 ? 0 : set_errno(err));
7882 }
7883 
7884 /*
7886  * This function may be used in two ways:
7887  *
7888  * 1. to get the zoneid of the zone this link is under, or
7889  *
7890  * 2. to verify that the link is under a specific zone.
7891  *
7892  * The first use is achieved by passing a zoneid of ALL_ZONES. The
7893  * function then iterates the datalink list of every zone on the
7894  * system until it finds the linkid. If the linkid is found then the
7895  * function returns 0 and zoneidp is updated. Otherwise, ENXIO is
7896  * returned and zoneidp is not modified. The use of ALL_ZONES is
7897  * limited to callers in the GZ to prevent leaking information to
7898  * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed
7899  * to the second type in the list above.
7900  *
7901  * The second use is achieved by passing a specific zoneid. The GZ can
7902  * use this to verify a link is under a particular zone. An NGZ can
7903  * use this to verify a link is under itself. But an NGZ cannot use
7904  * this to determine if a link is under some other zone as that would
7905  * result in information leakage. If the link exists under the zone
7906  * then 0 is returned. Otherwise, ENXIO is returned.
7907  */
7908 int
7909 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7910 {
7911         zone_t *zone;
7912         zoneid_t zoneid = *zoneidp;
7913         zoneid_t caller = getzoneid();
7914         int err = ENXIO;
7915 
7916         /*
7917          * Only the GZ may enquire about all zones; an NGZ may only
7918          * enquire about itself.
7919          */
7920         if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
7921                 zoneid = caller;
7922 
7923         if (zoneid != caller && caller != GLOBAL_ZONEID)
7924                 return (err);
7925 
7926         if (zoneid != ALL_ZONES) {
7927                 if ((zone = zone_find_by_id(zoneid)) != NULL) {
7928                         if (zone_dl_exists(zone, linkid)) {
7929                                 /*
7930                                  * We need to set this in case an NGZ
7931                                  * passes ALL_ZONES.
7932                                  */
7933                                 *zoneidp = zoneid;
7934                                 err = 0;
7935                         }
7936                         zone_rele(zone);
7937                 }
7938                 return (err);
7939         }
7940 
7941         ASSERT(caller == GLOBAL_ZONEID);
7942         mutex_enter(&zonehash_lock);
7943         for (zone = list_head(&zone_active); zone != NULL;
7944             zone = list_next(&zone_active, zone)) {
7945                 if (zone_dl_exists(zone, linkid)) {
7946                         *zoneidp = zone->zone_id;
7947                         err = 0;
7948                         break;
7949                 }
7950         }
7951         mutex_exit(&zonehash_lock);
7952 
7953         return (err);
7954 }
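
/*
 * Illustrative sketch (not part of this file): the two query styles
 * described above, from a GZ caller:
 *
 *	zoneid_t zid = ALL_ZONES;
 *	int err = zone_check_datalink(&zid, linkid);
 *
 * On success (err == 0), zid now identifies the NGZ that owns linkid.  To
 * instead verify that a specific zone owns the link, initialize zid to
 * that zone's ID before the call; zero is returned only if the link is on
 * that zone's datalink list.
 */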
7955 
7956 /*
7957  * Get the list of datalink IDs assigned to a zone.
7958  *
7959  * On input, *nump is the number of datalink IDs that can fit in the supplied
7960  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7961  * that were placed in the array if the array was large enough, or to the
7962  * number of datalink IDs that the function needs to place in the array if the
7963  * array is too small.
7964  */
7965 static int
7966 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7967 {
7968         uint_t num, dlcount;
7969         zone_t *zone;
7970         zone_dl_t *zdl;
7971         datalink_id_t *idptr = idarray;
7972 
7973         /*
7974          * Only the GZ or the owning zone may look at the datalink list.
7975          */
7976         if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
7977                 return (set_errno(EPERM));
7978 
7979         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7980                 return (set_errno(EFAULT));
7981         if ((zone = zone_find_by_id(zoneid)) == NULL)
7982                 return (set_errno(ENXIO));
7983 
7984         num = 0;
7985         mutex_enter(&zone->zone_lock);
7986         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7987             zdl = list_next(&zone->zone_dl_list, zdl)) {
7988                 /*
7989                  * If the list is bigger than what the caller supplied, just
7990                  * count, don't do copyout.
7991                  */
7992                 if (++num > dlcount)
7993                         continue;
7994                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7995                         mutex_exit(&zone->zone_lock);
7996                         zone_rele(zone);
7997                         return (set_errno(EFAULT));
7998                 }
7999                 idptr++;
8000         }
8001         mutex_exit(&zone->zone_lock);
8002         zone_rele(zone);
8003 
8004         /*
8005          * Prevent returning negative nump values -- we should never
8006          * have this many links anyway.
8007          */
8008         if (num > INT_MAX)
8009                 return (set_errno(EOVERFLOW));
8010 
8011         /* Whether it increased or decreased, the caller should be notified. */
8012         if (num != dlcount) {
8013                 if (copyout(&num, nump, sizeof (num)) != 0)
8014                         return (set_errno(EFAULT));
8015         }
8016         return (0);
8017 }
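
/*
 * Illustrative sketch (not part of this file): callers typically size the
 * ID array with the same two-pass pattern as zone_list(2).  Going through
 * the zone() multiplexor directly (real consumers use higher-level
 * libraries such as libdladm):
 *
 *	int n = 0;
 *	datalink_id_t *ids;
 *
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zoneid, &n, NULL);
 *	if ((ids = malloc(n * sizeof (datalink_id_t))) == NULL)
 *		err(1, "malloc");
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zoneid, &n, ids);
 */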
8018 
8019 /*
8020  * Public interface for looking up a zone by zoneid. It's a customized version
8021  * for netstack_zone_create(). It may only be called from the ZSD create
8022  * callbacks, since it doesn't take a reference on the zone structure; if
8023  * it were called elsewhere, the zone could disappear after the
8024  * zonehash_lock is dropped.
8025  *
8026  * Furthermore it
8027  * 1. Doesn't check the status of the zone.
8028  * 2. May be called even before zone_init(); in that case the address of
8029  *    zone0 is returned directly, and netstack_zone_create() will only
8030  *    assign a value to zone0.zone_netstack, which won't break anything.
8031  * 3. Returns without the zone being held.
8032  */
8033 zone_t *
8034 zone_find_by_id_nolock(zoneid_t zoneid)
8035 {
8036         zone_t *zone;
8037 
8038         mutex_enter(&zonehash_lock);
8039         if (zonehashbyid == NULL)
8040                 zone = &zone0;
8041         else
8042                 zone = zone_find_all_by_id(zoneid);
8043         mutex_exit(&zonehash_lock);
8044         return (zone);
8045 }
8046 
8047 /*
8048  * Walk the datalinks for a given zone
8049  */
8050 int
8051 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
8052     void *data)
8053 {
8054         zone_t          *zone;
8055         zone_dl_t       *zdl;
8056         datalink_id_t   *idarray;
8057         uint_t          idcount = 0;
8058         int             i, ret = 0;
8059 
8060         if ((zone = zone_find_by_id(zoneid)) == NULL)
8061                 return (ENOENT);
8062 
8063         /*
8064          * We first build an array of linkids so that we can walk these and
8065          * execute the callback with the zone_lock dropped.
8066          */
8067         mutex_enter(&zone->zone_lock);
8068         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
8069             zdl = list_next(&zone->zone_dl_list, zdl)) {
8070                 idcount++;
8071         }
8072 
8073         if (idcount == 0) {
8074                 mutex_exit(&zone->zone_lock);
8075                 zone_rele(zone);
8076                 return (0);
8077         }
8078 
8079         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
8080         if (idarray == NULL) {
8081                 mutex_exit(&zone->zone_lock);
8082                 zone_rele(zone);
8083                 return (ENOMEM);
8084         }
8085 
8086         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
8087             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
8088                 idarray[i] = zdl->zdl_id;
8089         }
8090 
8091         mutex_exit(&zone->zone_lock);
8092 
8093         for (i = 0; i < idcount && ret == 0; i++) {
8094                 if ((ret = (*cb)(idarray[i], data)) != 0)
8095                         break;
8096         }
8097 
8098         zone_rele(zone);
8099         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
8100         return (ret);
8101 }
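
/*
 * Illustrative sketch (not part of this file): a minimal walker callback,
 * e.g. counting the links assigned to a zone:
 *
 *	static int
 *	count_cb(datalink_id_t linkid, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t cnt = 0;
 *	int err = zone_datalink_walk(zoneid, count_cb, &cnt);
 *
 * A non-zero return from the callback stops the walk early and is passed
 * back to the caller of zone_datalink_walk().
 */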
8102 
8103 static char *
8104 zone_net_type2name(int type)
8105 {
8106         switch (type) {
8107         case ZONE_NETWORK_ADDRESS:
8108                 return (ZONE_NET_ADDRNAME);
8109         case ZONE_NETWORK_DEFROUTER:
8110                 return (ZONE_NET_RTRNAME);
8111         default:
8112                 return (NULL);
8113         }
8114 }
8115 
8116 static int
8117 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
8118 {
8119         zone_t *zone;
8120         zone_dl_t *zdl;
8121         nvlist_t *nvl;
8122         int err = 0;
8123         uint8_t *new = NULL;
8124         char *nvname;
8125         int bufsize;
8126         datalink_id_t linkid = znbuf->zn_linkid;
8127 
8128         if (secpolicy_zone_config(CRED()) != 0)
8129                 return (set_errno(EPERM));
8130 
8131         if (zoneid == GLOBAL_ZONEID)
8132                 return (set_errno(EINVAL));
8133 
8134         nvname = zone_net_type2name(znbuf->zn_type);
8135         bufsize = znbuf->zn_len;
8136         new = znbuf->zn_val;
8137         if (nvname == NULL)
8138                 return (set_errno(EINVAL));
8139 
8140         if ((zone = zone_find_by_id(zoneid)) == NULL) {
8141                 return (set_errno(EINVAL));
8142         }
8143 
8144         mutex_enter(&zone->zone_lock);
8145         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
8146                 err = ENXIO;
8147                 goto done;
8148         }
8149         if ((nvl = zdl->zdl_net) == NULL) {
8150                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
8151                         err = ENOMEM;
8152                         goto done;
8153                 } else {
8154                         zdl->zdl_net = nvl;
8155                 }
8156         }
8157         if (nvlist_exists(nvl, nvname)) {
8158                 err = EINVAL;
8159                 goto done;
8160         }
8161         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
8162         ASSERT(err == 0);
8163 done:
8164         mutex_exit(&zone->zone_lock);
8165         zone_rele(zone);
8166         if (err != 0)
8167                 return (set_errno(err));
8168         else
8169                 return (0);
8170 }
8171 
8172 static int
8173 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
8174 {
8175         zone_t *zone;
8176         zone_dl_t *zdl;
8177         nvlist_t *nvl;
8178         uint8_t *ptr;
8179         uint_t psize;
8180         int err = 0;
8181         char *nvname;
8182         int bufsize;
8183         void *buf;
8184         datalink_id_t linkid = znbuf->zn_linkid;
8185 
8186         if (zoneid == GLOBAL_ZONEID)
8187                 return (set_errno(EINVAL));
8188 
8189         nvname = zone_net_type2name(znbuf->zn_type);
8190         bufsize = znbuf->zn_len;
8191         buf = znbuf->zn_val;
8192 
8193         if (nvname == NULL)
8194                 return (set_errno(EINVAL));
8195         if ((zone = zone_find_by_id(zoneid)) == NULL)
8196                 return (set_errno(EINVAL));
8197 
8198         mutex_enter(&zone->zone_lock);
8199         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
8200                 err = ENXIO;
8201                 goto done;
8202         }
8203         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
8204                 err = ENOENT;
8205                 goto done;
8206         }
8207         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
8208         ASSERT(err == 0);
8209 
8210         if (psize > bufsize) {
8211                 err = ENOBUFS;
8212                 goto done;
8213         }
8214         znbuf->zn_len = psize;
8215         bcopy(ptr, buf, psize);
8216 done:
8217         mutex_exit(&zone->zone_lock);
8218         zone_rele(zone);
8219         if (err != 0)
8220                 return (set_errno(err));
8221         else
8222                 return (0);
8223 }
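
/*
 * Illustrative sketch (not part of this file): the getter mirrors the
 * setter above.  Assuming a zone_net_data_t populated with the fields used
 * here (zn_type, zn_linkid, zn_len, zn_val), fetching a previously
 * configured address looks like:
 *
 *	znbuf->zn_type = ZONE_NETWORK_ADDRESS;
 *	znbuf->zn_linkid = linkid;
 *	znbuf->zn_len = bufsize;
 *	err = zone_get_network(zoneid, znbuf);
 *
 * On success, zn_len is updated to the stored length and zn_val holds the
 * data; ENOBUFS means the supplied buffer was too small.
 */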
8224 
8225 static void
8226 zone_incr_capped(zoneid_t zid)
8227 {
8228         zone_persist_t *zp = &zone_pdata[zid];
8229 
8230         /* See if over (unlimited is UINT32_MAX), or already marked that way. */
8231         if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
8232                 return;
8233         }
8234 
8235         mutex_enter(&zone_physcap_lock);
8236         /* Recheck setting under mutex */
8237         if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
8238                 zp->zpers_over = 1;
8239                 zp->zpers_nover++;
8240                 zone_num_over_cap++;
8241                 DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
8242         }
8243         mutex_exit(&zone_physcap_lock);
8244 }
8245 
8246 /*
8247  * We want some hysteresis when the zone is going under its cap so that we're
8248  * not continuously toggling page scanning back and forth by a single page
8249  * around the cap. Using ~1% of the zone's page limit seems to be a good
8250  * quantity. This table shows various zone memory caps and the number of
8251  * pages (assuming a 4k page size). Given this, we choose to shift the page
8252  * limit right by 7 bits to get a hysteresis that is slightly less than 1%.
8253  *
8254  *   cap    pages     pages     1% shift7  shift7
8255  *  128M    32768 0x0008000    327    256 0x00100
8256  *  512M   131072 0x0020000   1310   1024 0x00400
8257  *    1G   262144 0x0040000   2621   2048 0x00800
8258  *    4G  1048576 0x0100000  10485   8192 0x02000
8259  *    8G  2097152 0x0200000  20971  16384 0x04000
8260  *   16G  4194304 0x0400000  41943  32768 0x08000
8261  *   32G  8388608 0x0800000  83886  65536 0x10000
8262  *   64G 16777216 0x1000000 167772 131072 0x20000
8263  */
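/*
 * Worked example (illustrative): with a 1G cap, zpers_pg_limit is 262144
 * pages; 262144 >> 7 == 2048 pages (8MB at 4k/page), so a zone that has
 * gone over its cap is not marked under again until its usage drops below
 * 262144 - 2048 == 260096 pages.
 */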
8264 static void
8265 zone_decr_capped(zoneid_t zid)
8266 {
8267         zone_persist_t *zp = &zone_pdata[zid];
8268         uint32_t adjusted_limit;
8269 
8270         /*
8271          * See if under, or already marked that way. There is no need to
8272          * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
8273          * since we'll never set zpers_over in zone_incr_capped().
8274          */
8275         if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
8276                 return;
8277         }
8278 
8279         adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
8280 
8281         /* Recheck, accounting for our hysteresis. */
8282         if (zp->zpers_pg_cnt >= adjusted_limit) {
8283                 return;
8284         }
8285 
8286         mutex_enter(&zone_physcap_lock);
8287         /* Recheck under mutex. */
8288         if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
8289                 zp->zpers_over = 0;
8290                 ASSERT(zone_num_over_cap > 0);
8291                 zone_num_over_cap--;
8292                 DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
8293         }
8294         mutex_exit(&zone_physcap_lock);
8295 }
8296 
8297 /*
8298  * For zone_add_page() and zone_rm_page(), access to the page we're touching is
8299  * controlled by our caller's locking.
8300  * On x86 our callers already did: ASSERT(x86_hm_held(pp))
8301  * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
8302  */
8303 void
8304 zone_add_page(page_t *pp)
8305 {
8306         uint_t pcnt;
8307         zone_persist_t *zp;
8308         zoneid_t zid;
8309 
8310         /* Skip pages in segkmem, etc. (KV_KVP, ...) */
8311         if (PP_ISKAS(pp))
8312                 return;
8313 
8314         ASSERT(!PP_ISFREE(pp));
8315 
8316         zid = curzone->zone_id;
8317         if (pp->p_zoneid == zid) {
8318                 /* Another mapping to this page for this zone, do nothing */
8319                 return;
8320         }
8321 
8322         if (pp->p_szc == 0) {
8323                 pcnt = 1;
8324         } else {
8325                 /* large page */
8326                 pcnt = page_get_pagecnt(pp->p_szc);
8327         }
8328 
8329         if (pp->p_share == 0) {
8330                 /* First mapping to this page. */
8331                 pp->p_zoneid = zid;
8332                 zp = &zone_pdata[zid];
8333                 ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
8334                 atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
8335                 zone_incr_capped(zid);
8336                 return;
8337         }
8338 
8339         if (pp->p_zoneid != ALL_ZONES) {
8340                 /*
8341                  * The page is now being shared across a different zone.
8342                  * Decrement the original zone's usage.
8343                  */
8344                 zid = pp->p_zoneid;
8345                 pp->p_zoneid = ALL_ZONES;
8346                 ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8347                 zp = &zone_pdata[zid];
8348 
8349                 if (zp->zpers_pg_cnt > 0) {
8350                         atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
8351                 }
8352                 zone_decr_capped(zid);
8353         }
8354 }
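
/*
 * Worked example (illustrative): the first mapping of a page by zone A
 * charges the page to A (p_zoneid == A).  If zone B later maps the same
 * page while it is still shared (p_share != 0), the page is recategorized
 * as ALL_ZONES and A's count is decremented; shared pages are charged to
 * no zone.  zone_rm_page() below drops the charge once the last mapping
 * for a zone goes away.
 */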
8355 
8356 void
8357 zone_rm_page(page_t *pp)
8358 {
8359         uint_t pcnt;
8360         zone_persist_t *zp;
8361         zoneid_t zid;
8362 
8363         /* Skip pages in segkmem, etc. (KV_KVP, ...) */
8364         if (PP_ISKAS(pp))
8365                 return;
8366 
8367         zid = pp->p_zoneid;
8368         if (zid == ALL_ZONES || pp->p_share != 0)
8369                 return;
8370 
8371         /* This is the last mapping to the page for a zone. */
8372         if (pp->p_szc == 0) {
8373                 pcnt = 1;
8374         } else {
8375                 /* large page */
8376                 pcnt = page_get_pagecnt(pp->p_szc);
8377         }
8378 
8379         ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8380         zp = &zone_pdata[zid];
8381         if (zp->zpers_pg_cnt > 0) {
8382                 atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
8383         }
8384         zone_decr_capped(zid);
8385         pp->p_zoneid = ALL_ZONES;
8386 }
8387 
8388 void
8389 zone_pageout_stat(int zid, zone_pageout_op_t op)
8390 {
8391         zone_persist_t *zp;
8392 
8393         if (zid == ALL_ZONES)
8394                 return;
8395 
8396         ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8397         zp = &zone_pdata[zid];
8398 
8399 #ifndef DEBUG
8400         atomic_add_64(&zp->zpers_pg_out, 1);
8401 #else
8402         switch (op) {
8403         case ZPO_DIRTY:
8404                 atomic_add_64(&zp->zpers_pg_fsdirty, 1);
8405                 break;
8406         case ZPO_FS:
8407                 atomic_add_64(&zp->zpers_pg_fs, 1);
8408                 break;
8409         case ZPO_ANON:
8410                 atomic_add_64(&zp->zpers_pg_anon, 1);
8411                 break;
8412         case ZPO_ANONDIRTY:
8413                 atomic_add_64(&zp->zpers_pg_anondirty, 1);
8414                 break;
8415         default:
8416                 cmn_err(CE_PANIC, "Invalid pageout operation %d", op);
8417                 break;
8418         }
8419 #endif
8420 }
8421 
8422 /*
8423  * Return the zone's physical memory cap and current free memory (in pages).
8424  */
8425 void
8426 zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
8427 {
8428         zone_persist_t *zp;
8429 
8430         ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8431         zp = &zone_pdata[zid];
8432 
8433         /*
8434          * If a physical memory cap is set on the zone, use it; otherwise
8435          * use the system values. physmem and freemem are also in pages.
8436          */
8437         if (zp->zpers_pg_limit == UINT32_MAX) {
8438                 *memcap = physmem;
8439                 *free = freemem;
8440         } else {
8441                 int64_t zfree;  /* avoid shadowing the global freemem */
8442 
8443                 *memcap = (pgcnt_t)zp->zpers_pg_limit;
8444                 zfree = zp->zpers_pg_limit - zp->zpers_pg_cnt;
8445                 if (zfree > 0) {
8446                         *free = (pgcnt_t)zfree;
8447                 } else {
8448                         *free = (pgcnt_t)0;
8449                 }
8450         }
8451 }