/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015, Joyent Inc. All rights reserved.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 *   callbacks have not yet run. It is not possible to enter the zone, but
 *   attributes can be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.   A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called and the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VFS_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
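 *   As an illustrative sketch (not a code path in this file), a caller
 *   holding a reference on the zone can block until the zone is at
 *   least running:
 *
 *      zone_hold(zone);
 *      zone_status_wait(zone, ZONE_IS_RUNNING);
 *      zone_rele(zone);
 *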
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
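 *   For example (an illustrative sketch, not a code path in this file),
 *   a by-id lookup returns the zone held, and the caller must drop the
 *   hold when done:
 *
 *      zone_t *zone;
 *
 *      if ((zone = zone_find_by_id(zoneid)) != NULL) {
 *              cmn_err(CE_NOTE, "found zone %s", zone->zone_name);
 *              zone_rele(zone);
 *      }
 *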
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just zone.max-lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
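 *
 *   For example (an illustrative sketch, not a code path in this file),
 *   code that needs both the zone lists and a zone's own lock must honor
 *   the ordering above:
 *
 *      mutex_enter(&zonehash_lock);
 *      mutex_enter(&zone->zone_lock);
 *      ...
 *      mutex_exit(&zone->zone_lock);
 *      mutex_exit(&zonehash_lock);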
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
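 *   A user-level sketch of the libc wrappers over two of these subcodes
 *   ("myzone" is a hypothetical zone name):
 *
 *      #include <stdio.h>
 *      #include <zone.h>
 *
 *      char name[ZONENAME_MAX];
 *      zoneid_t id = getzoneidbyname("myzone");
 *
 *      if (id != -1 && getzonenamebyid(id, name, sizeof (name)) != -1)
 *              (void) printf("zone %d is %s\n", (int)id, name);
 *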
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created/destroyed such
 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong zone's
 * mount list. Since a zone can't reside on an NFS file system, we don't
 * have to worry about the zonepath itself.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone. This synchronization is on a per-zone basis, so
 * activity for one zone will not interfere with activity for another zone.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone state transitions, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone halt may starve if
 * there is a rapid succession of new mounts coming in to the zone.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(zone_t *zp)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress > 0) {
                if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers
         * (remotely possible if two threads enter zone_shutdown at the same
         * time).
         */
        zp->zone_mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&zp->zone_mount_lock);
        return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (++zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * The VFS layer is busy with a mount; this zone should wait until all
 * of its mounts are completed to progress.
 */
void
mount_in_progress(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress < 0)
                cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
        zp->zone_mounts_in_progress++;
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (--zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}
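
/*
 * An illustrative sketch (not an actual code path in this file) of how
 * the VFS layer is expected to bracket a mount with the primitives above
 * so that the mount cannot race with a zone state transition:
 *
 *      mount_in_progress(zp);
 *      error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *      mount_completed(zp);
 */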

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shut down, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys. Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock; zsd_flags are used to ensure that the operations have
 * completed, so that by the time zone_key_create (and zone_create) or
 * zone_key_delete (and zone_destroy) returns, all the necessary callbacks
 * have run.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
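
/*
 * An illustrative ZSD consumer sketch (hypothetical "foo" subsystem and
 * foo_zone_data_t type; not part of this file):
 *
 *      static zone_key_t foo_zone_key;
 *
 *      static void *
 *      foo_zone_create(zoneid_t zoneid)
 *      {
 *              return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP));
 *      }
 *
 *      static void
 *      foo_zone_destroy(zoneid_t zoneid, void *data)
 *      {
 *              kmem_free(data, sizeof (foo_zone_data_t));
 *      }
 *
 * Registration and lookup (no shutdown callback in this sketch):
 *
 *      zone_key_create(&foo_zone_key, foo_zone_create, NULL,
 *          foo_zone_destroy);
 *      ...
 *      foo_zone_data_t *fzd = zone_getspecific(foo_zone_key, zone);
 */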

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t  key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * A zsd_configure already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now; make it
         * globally visible.  Specifically zone_getspecific() will
         * always successfully return the zone-specific data associated
         * with the key.
         */
        *keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
        struct zsd_entry *zsdp = NULL;
        zone_t *zone;

        mutex_enter(&zsd_key_lock);
        zsdp = zsd_find_mru(&zsd_registered_keys, key);
        if (zsdp == NULL) {
                mutex_exit(&zsd_key_lock);
                return (-1);
        }
        list_remove(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find_mru(&zone->zone_zsd, key);
                if (del == NULL) {
                        /*
                         * Somebody else got here first, e.g. the zone going
                         * away.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
                ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                if (del->zsd_shutdown != NULL &&
                    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                        del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(zsd__shutdown__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                if (del->zsd_destroy != NULL &&
                    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                        del->zsd_flags |= ZSD_DESTROY_NEEDED;
                        DTRACE_PROBE2(zsd__destroy__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);
        kmem_free(zsdp, sizeof (*zsdp));

        /* Now call the shutdown and destroy callbacks for this key */
        zsd_apply_all_zones(zsd_apply_shutdown, key);
        zsd_apply_all_zones(zsd_apply_destroy, key);

        /* Now we can free up the zsdp structures in each zone */
        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find(&zone->zone_zsd, key);
                if (del != NULL) {
                        list_remove(&zone->zone_zsd, del);
                        ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
                        kmem_free(del, sizeof (*del));
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        return (0);
}

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        if (t != NULL) {
                /*
                 * Replace old value with new
                 */
                t->zsd_data = (void *)data;
                mutex_exit(&zone->zone_lock);
                return (0);
        }
        mutex_exit(&zone->zone_lock);
        return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
        struct zsd_entry *t;
        void *data;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        data = (t == NULL ? NULL : t->zsd_data);
        mutex_exit(&zone->zone_lock);
        return (data);
}

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys). The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;

        ASSERT(MUTEX_HELD(&zonehash_lock));
        ASSERT(list_head(&zone->zone_zsd) == NULL);
        mutex_enter(&zone->zone_lock);
        mutex_enter(&zsd_key_lock);
        for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
            zsdp = list_next(&zsd_registered_keys, zsdp)) {
                /*
                 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
                 * should not have added anything to it.
                 */
                ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = zsdp->zsd_key;
                t->zsd_create = zsdp->zsd_create;
                t->zsd_shutdown = zsdp->zsd_shutdown;
                t->zsd_destroy = zsdp->zsd_destroy;
                if (zsdp->zsd_create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, zsdp->zsd_key);
                }
                list_insert_tail(&zone->zone_zsd, t);
        }
        mutex_exit(&zsd_key_lock);
        mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
        struct zsd_entry *t;

        ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
        ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
        ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

        /*
         * Run the callback solely based on what is registered for the zone
         * in zone_zsd. The global list can change independently of this
         * as keys are registered and unregistered and we don't register new
         * callbacks for a zone that is in the process of going away.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL;
            t = list_next(&zone->zone_zsd, t)) {
                zone_key_t key = t->zsd_key;

                /* Skip if no callbacks registered */

                if (ct == ZSD_SHUTDOWN) {
                        if (t->zsd_shutdown != NULL &&
                            (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                                t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                                DTRACE_PROBE2(zsd__shutdown__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                } else {
                        if (t->zsd_destroy != NULL &&
                            (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                                t->zsd_flags |= ZSD_DESTROY_NEEDED;
                                DTRACE_PROBE2(zsd__destroy__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                }
        }
        mutex_exit(&zone->zone_lock);

        /* Now call the shutdown and destroy callbacks for this zone */
        zsd_apply_all_keys(zsd_apply_shutdown, zone);
        zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
        struct zsd_entry *t, *next;

        /*
         * Free all the zsd_entry's we had on this zone.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
                next = list_next(&zone->zone_zsd, t);
                list_remove(&zone->zone_zsd, t);
                ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_zsd);
        mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        zone = list_head(&zone_active);
        while (zone != NULL) {
                if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
                        /* Lock dropped - restart at head */
                        zone = list_head(&zone_active);
                } else {
                        zone = list_next(&zone_active, zone);
                }
        }
        mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = list_head(&zone->zone_zsd);
        while (t != NULL) {
                if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
                        /* Lock dropped - restart at head */
                        t = list_head(&zone->zone_zsd);
                } else {
                        t = list_next(&zone->zone_zsd, t);
                }
        }
        mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        void *result;
        struct zsd_entry *t;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                t->zsd_flags &= ~ZSD_CREATE_NEEDED;
                t->zsd_flags |= ZSD_CREATE_INPROGRESS;
                DTRACE_PROBE2(zsd__create__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);

                dropped = B_TRUE;
                ASSERT(t->zsd_create != NULL);
                DTRACE_PROBE2(zsd__create__start,
                    zone_t *, zone, zone_key_t, key);

                result = (*t->zsd_create)(zone->zone_id);

                DTRACE_PROBE2(zsd__create__end,
                    zone_t *, zone, void *, result);

                ASSERT(result != NULL);
                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = result;
                t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
                t->zsd_flags |= ZSD_CREATE_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__create__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
                t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(zsd__shutdown__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_shutdown != NULL);
                data = t->zsd_data;

                DTRACE_PROBE2(zsd__shutdown__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_shutdown)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__shutdown__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
                t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__shutdown__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
                t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
                DTRACE_PROBE2(zsd__destroy__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_destroy != NULL);
                data = t->zsd_data;
                DTRACE_PROBE2(zsd__destroy__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_destroy)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__destroy__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = NULL;
                t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
                t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__destroy__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_CREATE_NEEDED) {
                DTRACE_PROBE2(zsd__wait__for__creator,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
                DTRACE_PROBE2(zsd__wait__for__inprogress,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
        zone_dataset_t *t, *next;

        for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
                next = list_next(&zone->zone_datasets, t);
                list_remove(&zone->zone_datasets, t);
                kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);

        e->rcep_p.zone->zone_shares = nv;
        return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
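
/*
 * A hedged sketch (flag values illustrative; the actual registration is
 * done later, in zone_init()) of how an rctl_ops_t vector like the one
 * above is associated with its named resource control:
 *
 *      rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
 *          RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER |
 *          RCTL_GLOBAL_DENY_NEVER | RCTL_GLOBAL_NOBASIC |
 *          RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
 *          FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
 */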
1341 
1342 /*
1343  * zone.cpu-cap resource control support.
1344  */
1345 /*ARGSUSED*/
1346 static rctl_qty_t
1347 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1348 {
1349         ASSERT(MUTEX_HELD(&p->p_lock));
1350         return (cpucaps_zone_get(p->p_zone));
1351 }
1352 
1353 /*ARGSUSED*/
1354 static int
1355 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1356     rctl_qty_t nv)
1357 {
1358         zone_t *zone = e->rcep_p.zone;
1359 
1360         ASSERT(MUTEX_HELD(&p->p_lock));
1361         ASSERT(e->rcep_t == RCENTITY_ZONE);
1362 
1363         if (zone == NULL)
1364                 return (0);
1365 
1366         /*
1367          * set cap to the new value.
1368          */
1369         return (cpucaps_zone_set(zone, nv));
1370 }
1371 
1372 static rctl_ops_t zone_cpu_cap_ops = {
1373         rcop_no_action,
1374         zone_cpu_cap_get,
1375         zone_cpu_cap_set,
1376         rcop_no_test
1377 };
1378 
1379 /*
1380  * zone.zfs-io-pri resource control support (IO priority).
1381  */
1382 /*ARGSUSED*/
1383 static rctl_qty_t
1384 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1385 {
1386         ASSERT(MUTEX_HELD(&p->p_lock));
1387         return (p->p_zone->zone_zfs_io_pri);
1388 }
1389 
1390 /*ARGSUSED*/
1391 static int
1392 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1393     rctl_qty_t nv)
1394 {
1395         zone_t *zone = e->rcep_p.zone;
1396 
1397         ASSERT(MUTEX_HELD(&p->p_lock));
1398         ASSERT(e->rcep_t == RCENTITY_ZONE);
1399 
1400         if (zone == NULL)
1401                 return (0);
1402 
1403         /*
1404          * set priority to the new value.
1405          */
1406         zone->zone_zfs_io_pri = nv;
1407         return (0);
1408 }
1409 
1410 static rctl_ops_t zone_zfs_io_pri_ops = {
1411         rcop_no_action,
1412         zone_zfs_io_pri_get,
1413         zone_zfs_io_pri_set,
1414         rcop_no_test
1415 };
1416 
1417 /*ARGSUSED*/
1418 static rctl_qty_t
1419 zone_lwps_usage(rctl_t *r, proc_t *p)
1420 {
1421         rctl_qty_t nlwps;
1422         zone_t *zone = p->p_zone;
1423 
1424         ASSERT(MUTEX_HELD(&p->p_lock));
1425 
1426         mutex_enter(&zone->zone_nlwps_lock);
1427         nlwps = zone->zone_nlwps;
1428         mutex_exit(&zone->zone_nlwps_lock);
1429 
1430         return (nlwps);
1431 }
1432 
1433 /*ARGSUSED*/
1434 static int
1435 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1436     rctl_qty_t incr, uint_t flags)
1437 {
1438         rctl_qty_t nlwps;
1439 
1440         ASSERT(MUTEX_HELD(&p->p_lock));
1441         ASSERT(e->rcep_t == RCENTITY_ZONE);
1442         if (e->rcep_p.zone == NULL)
1443                 return (0);
1444         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1445         nlwps = e->rcep_p.zone->zone_nlwps;
1446 
1447         if (nlwps + incr > rcntl->rcv_value)
1448                 return (1);
1449 
1450         return (0);
1451 }
1452 
1453 /*ARGSUSED*/
1454 static int
1455 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1456 {
1457         ASSERT(MUTEX_HELD(&p->p_lock));
1458         ASSERT(e->rcep_t == RCENTITY_ZONE);
1459         if (e->rcep_p.zone == NULL)
1460                 return (0);
1461         e->rcep_p.zone->zone_nlwps_ctl = nv;
1462         return (0);
1463 }
1464 
1465 static rctl_ops_t zone_lwps_ops = {
1466         rcop_no_action,
1467         zone_lwps_usage,
1468         zone_lwps_set,
1469         zone_lwps_test,
1470 };
1471 
1472 /*ARGSUSED*/
1473 static rctl_qty_t
1474 zone_procs_usage(rctl_t *r, proc_t *p)
1475 {
1476         rctl_qty_t nprocs;
1477         zone_t *zone = p->p_zone;
1478 
1479         ASSERT(MUTEX_HELD(&p->p_lock));
1480 
1481         mutex_enter(&zone->zone_nlwps_lock);
1482         nprocs = zone->zone_nprocs;
1483         mutex_exit(&zone->zone_nlwps_lock);
1484 
1485         return (nprocs);
1486 }
1487 
1488 /*ARGSUSED*/
1489 static int
1490 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1491     rctl_qty_t incr, uint_t flags)
1492 {
1493         rctl_qty_t nprocs;
1494 
1495         ASSERT(MUTEX_HELD(&p->p_lock));
1496         ASSERT(e->rcep_t == RCENTITY_ZONE);
1497         if (e->rcep_p.zone == NULL)
1498                 return (0);
1499         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1500         nprocs = e->rcep_p.zone->zone_nprocs;
1501 
1502         if (nprocs + incr > rcntl->rcv_value)
1503                 return (1);
1504 
1505         return (0);
1506 }
1507 
1508 /*ARGSUSED*/
1509 static int
1510 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1511 {
1512         ASSERT(MUTEX_HELD(&p->p_lock));
1513         ASSERT(e->rcep_t == RCENTITY_ZONE);
1514         if (e->rcep_p.zone == NULL)
1515                 return (0);
1516         e->rcep_p.zone->zone_nprocs_ctl = nv;
1517         return (0);
1518 }
1519 
1520 static rctl_ops_t zone_procs_ops = {
1521         rcop_no_action,
1522         zone_procs_usage,
1523         zone_procs_set,
1524         zone_procs_test,
1525 };
1526 
1527 /*ARGSUSED*/
1528 static rctl_qty_t
1529 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1530 {
1531         ASSERT(MUTEX_HELD(&p->p_lock));
1532         return (p->p_zone->zone_shmmax);
1533 }
1534 
1535 /*ARGSUSED*/
1536 static int
1537 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1538     rctl_qty_t incr, uint_t flags)
1539 {
1540         rctl_qty_t v;
1541         ASSERT(MUTEX_HELD(&p->p_lock));
1542         ASSERT(e->rcep_t == RCENTITY_ZONE);
1543         v = e->rcep_p.zone->zone_shmmax + incr;
1544         if (v > rval->rcv_value)
1545                 return (1);
1546         return (0);
1547 }
1548 
1549 static rctl_ops_t zone_shmmax_ops = {
1550         rcop_no_action,
1551         zone_shmmax_usage,
1552         rcop_no_set,
1553         zone_shmmax_test
1554 };
1555 
1556 /*ARGSUSED*/
1557 static rctl_qty_t
1558 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1559 {
1560         ASSERT(MUTEX_HELD(&p->p_lock));
1561         return (p->p_zone->zone_ipc.ipcq_shmmni);
1562 }
1563 
1564 /*ARGSUSED*/
1565 static int
1566 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1567     rctl_qty_t incr, uint_t flags)
1568 {
1569         rctl_qty_t v;
1570         ASSERT(MUTEX_HELD(&p->p_lock));
1571         ASSERT(e->rcep_t == RCENTITY_ZONE);
1572         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1573         if (v > rval->rcv_value)
1574                 return (1);
1575         return (0);
1576 }
1577 
1578 static rctl_ops_t zone_shmmni_ops = {
1579         rcop_no_action,
1580         zone_shmmni_usage,
1581         rcop_no_set,
1582         zone_shmmni_test
1583 };
1584 
1585 /*ARGSUSED*/
1586 static rctl_qty_t
1587 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1588 {
1589         ASSERT(MUTEX_HELD(&p->p_lock));
1590         return (p->p_zone->zone_ipc.ipcq_semmni);
1591 }
1592 
1593 /*ARGSUSED*/
1594 static int
1595 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1596     rctl_qty_t incr, uint_t flags)
1597 {
1598         rctl_qty_t v;
1599         ASSERT(MUTEX_HELD(&p->p_lock));
1600         ASSERT(e->rcep_t == RCENTITY_ZONE);
1601         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1602         if (v > rval->rcv_value)
1603                 return (1);
1604         return (0);
1605 }
1606 
1607 static rctl_ops_t zone_semmni_ops = {
1608         rcop_no_action,
1609         zone_semmni_usage,
1610         rcop_no_set,
1611         zone_semmni_test
1612 };
1613 
1614 /*ARGSUSED*/
1615 static rctl_qty_t
1616 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1617 {
1618         ASSERT(MUTEX_HELD(&p->p_lock));
1619         return (p->p_zone->zone_ipc.ipcq_msgmni);
1620 }
1621 
1622 /*ARGSUSED*/
1623 static int
1624 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1625     rctl_qty_t incr, uint_t flags)
1626 {
1627         rctl_qty_t v;
1628         ASSERT(MUTEX_HELD(&p->p_lock));
1629         ASSERT(e->rcep_t == RCENTITY_ZONE);
1630         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1631         if (v > rval->rcv_value)
1632                 return (1);
1633         return (0);
1634 }
1635 
1636 static rctl_ops_t zone_msgmni_ops = {
1637         rcop_no_action,
1638         zone_msgmni_usage,
1639         rcop_no_set,
1640         zone_msgmni_test
1641 };
1642 
1643 /*ARGSUSED*/
1644 static rctl_qty_t
1645 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1646 {
1647         rctl_qty_t q;
1648         ASSERT(MUTEX_HELD(&p->p_lock));
1649         mutex_enter(&p->p_zone->zone_mem_lock);
1650         q = p->p_zone->zone_locked_mem;
1651         mutex_exit(&p->p_zone->zone_mem_lock);
1652         return (q);
1653 }
1654 
1655 /*ARGSUSED*/
1656 static int
1657 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1658     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1659 {
1660         rctl_qty_t q;
1661         zone_t *z;
1662 
1663         z = e->rcep_p.zone;
1664         ASSERT(MUTEX_HELD(&p->p_lock));
1665         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1666         q = z->zone_locked_mem;
1667         if (q + incr > rcntl->rcv_value)
1668                 return (1);
1669         return (0);
1670 }
1671 
1672 /*ARGSUSED*/
1673 static int
1674 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1675     rctl_qty_t nv)
1676 {
1677         ASSERT(MUTEX_HELD(&p->p_lock));
1678         ASSERT(e->rcep_t == RCENTITY_ZONE);
1679         if (e->rcep_p.zone == NULL)
1680                 return (0);
1681         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1682         return (0);
1683 }
1684 
1685 static rctl_ops_t zone_locked_mem_ops = {
1686         rcop_no_action,
1687         zone_locked_mem_usage,
1688         zone_locked_mem_set,
1689         zone_locked_mem_test
1690 };
1691 
1692 /*ARGSUSED*/
1693 static rctl_qty_t
1694 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1695 {
1696         rctl_qty_t q;
1697         zone_t *z = p->p_zone;
1698 
1699         ASSERT(MUTEX_HELD(&p->p_lock));
1700         mutex_enter(&z->zone_mem_lock);
1701         q = z->zone_max_swap;
1702         mutex_exit(&z->zone_mem_lock);
1703         return (q);
1704 }
1705 
1706 /*ARGSUSED*/
1707 static int
1708 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1709     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1710 {
1711         rctl_qty_t q;
1712         zone_t *z;
1713 
1714         z = e->rcep_p.zone;
1715         ASSERT(MUTEX_HELD(&p->p_lock));
1716         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1717         q = z->zone_max_swap;
1718         if (q + incr > rcntl->rcv_value)
1719                 return (1);
1720         return (0);
1721 }
1722 
1723 /*ARGSUSED*/
1724 static int
1725 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1726     rctl_qty_t nv)
1727 {
1728         ASSERT(MUTEX_HELD(&p->p_lock));
1729         ASSERT(e->rcep_t == RCENTITY_ZONE);
1730         if (e->rcep_p.zone == NULL)
1731                 return (0);
1732         e->rcep_p.zone->zone_max_swap_ctl = nv;
1733         return (0);
1734 }
1735 
1736 static rctl_ops_t zone_max_swap_ops = {
1737         rcop_no_action,
1738         zone_max_swap_usage,
1739         zone_max_swap_set,
1740         zone_max_swap_test
1741 };
1742 
1743 /*ARGSUSED*/
1744 static rctl_qty_t
1745 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1746 {
1747         rctl_qty_t q;
1748         zone_t *z = p->p_zone;
1749 
1750         ASSERT(MUTEX_HELD(&p->p_lock));
1751         mutex_enter(&z->zone_rctl_lock);
1752         q = z->zone_max_lofi;
1753         mutex_exit(&z->zone_rctl_lock);
1754         return (q);
1755 }
1756 
1757 /*ARGSUSED*/
1758 static int
1759 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1760     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1761 {
1762         rctl_qty_t q;
1763         zone_t *z;
1764 
1765         z = e->rcep_p.zone;
1766         ASSERT(MUTEX_HELD(&p->p_lock));
1767         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1768         q = z->zone_max_lofi;
1769         if (q + incr > rcntl->rcv_value)
1770                 return (1);
1771         return (0);
1772 }
1773 
1774 /*ARGSUSED*/
1775 static int
1776 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1777     rctl_qty_t nv)
1778 {
1779         ASSERT(MUTEX_HELD(&p->p_lock));
1780         ASSERT(e->rcep_t == RCENTITY_ZONE);
1781         if (e->rcep_p.zone == NULL)
1782                 return (0);
1783         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1784         return (0);
1785 }
1786 
1787 static rctl_ops_t zone_max_lofi_ops = {
1788         rcop_no_action,
1789         zone_max_lofi_usage,
1790         zone_max_lofi_set,
1791         zone_max_lofi_test
1792 };
1793 
1794 /*
1795  * Helper function to brand the zone with a unique ID.
1796  */
1797 static void
1798 zone_uniqid(zone_t *zone)
1799 {
1800         static uint64_t uniqid = 0;
1801 
1802         ASSERT(MUTEX_HELD(&zonehash_lock));
1803         zone->zone_uniqid = uniqid++;
1804 }
1805 
1806 /*
1807  * Returns a held pointer to the "kcred" for the specified zone.
1808  */
1809 struct cred *
1810 zone_get_kcred(zoneid_t zoneid)
1811 {
1812         zone_t *zone;
1813         cred_t *cr;
1814 
1815         if ((zone = zone_find_by_id(zoneid)) == NULL)
1816                 return (NULL);
1817         cr = zone->zone_kcred;
1818         crhold(cr);
1819         zone_rele(zone);
1820         return (cr);
1821 }
1822 
1823 static int
1824 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1825 {
1826         zone_t *zone = ksp->ks_private;
1827         zone_kstat_t *zk = ksp->ks_data;
1828 
1829         if (rw == KSTAT_WRITE)
1830                 return (EACCES);
1831 
1832         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1833         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1834         return (0);
1835 }
1836 
1837 static int
1838 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1839 {
1840         zone_t *zone = ksp->ks_private;
1841         zone_kstat_t *zk = ksp->ks_data;
1842 
1843         if (rw == KSTAT_WRITE)
1844                 return (EACCES);
1845 
1846         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1847         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1848         return (0);
1849 }
1850 
1851 static int
1852 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1853 {
1854         zone_t *zone = ksp->ks_private;
1855         zone_kstat_t *zk = ksp->ks_data;
1856 
1857         if (rw == KSTAT_WRITE)
1858                 return (EACCES);
1859 
1860         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1861         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1862         return (0);
1863 }
1864 
1865 static kstat_t *
1866 zone_kstat_create_common(zone_t *zone, char *name,
1867     int (*updatefunc) (kstat_t *, int))
1868 {
1869         kstat_t *ksp;
1870         zone_kstat_t *zk;
1871 
1872         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1873             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1874             KSTAT_FLAG_VIRTUAL);
1875 
1876         if (ksp == NULL)
1877                 return (NULL);
1878 
1879         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1880         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1881         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1882         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1883         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1884         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1885         ksp->ks_update = updatefunc;
1886         ksp->ks_private = zone;
1887         kstat_install(ksp);
1888         return (ksp);
1889 }
1890 
1891 
1892 static int
1893 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1894 {
1895         zone_t *zone = ksp->ks_private;
1896         zone_mcap_kstat_t *zmp = ksp->ks_data;
1897 
1898         if (rw == KSTAT_WRITE)
1899                 return (EACCES);
1900 
1901         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1902         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1903         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1904         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1905         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1906 
1907         return (0);
1908 }
1909 
1910 static kstat_t *
1911 zone_mcap_kstat_create(zone_t *zone)
1912 {
1913         kstat_t *ksp;
1914         zone_mcap_kstat_t *zmp;
1915 
1916         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1917             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1918             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1919             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1920                 return (NULL);
1921 
1922         if (zone->zone_id != GLOBAL_ZONEID)
1923                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1924 
1925         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1926         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1927         ksp->ks_lock = &zone->zone_mcap_lock;
1928         zone->zone_mcap_stats = zmp;
1929 
1930         /* The kstat "name" field is not large enough for a full zonename */
1931         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1932         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1933         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1934         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1935         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1936         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1937         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1938             KSTAT_DATA_UINT64);
1939 
1940         ksp->ks_update = zone_mcap_kstat_update;
1941         ksp->ks_private = zone;
1942 
1943         kstat_install(ksp);
1944         return (ksp);
1945 }
1946 
1947 static int
1948 zone_misc_kstat_update(kstat_t *ksp, int rw)
1949 {
1950         zone_t *zone = ksp->ks_private;
1951         zone_misc_kstat_t *zmp = ksp->ks_data;
1952         hrtime_t tmp;
1953 
1954         if (rw == KSTAT_WRITE)
1955                 return (EACCES);
1956 
1957         tmp = zone->zone_utime;
1958         scalehrtime(&tmp);
1959         zmp->zm_utime.value.ui64 = tmp;
1960         tmp = zone->zone_stime;
1961         scalehrtime(&tmp);
1962         zmp->zm_stime.value.ui64 = tmp;
1963         tmp = zone->zone_wtime;
1964         scalehrtime(&tmp);
1965         zmp->zm_wtime.value.ui64 = tmp;
1966 
1967         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1968         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1969         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1970 
1971         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1972         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1973         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1974         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1975 
1976         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1977 
1978         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1979         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1980 
1981         return (0);
1982 }
1983 
1984 static kstat_t *
1985 zone_misc_kstat_create(zone_t *zone)
1986 {
1987         kstat_t *ksp;
1988         zone_misc_kstat_t *zmp;
1989 
1990         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1991             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1992             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1993             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1994                 return (NULL);
1995 
1996         if (zone->zone_id != GLOBAL_ZONEID)
1997                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1998 
1999         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2000         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2001         ksp->ks_lock = &zone->zone_misc_lock;
2002         zone->zone_misc_stats = zmp;
2003 
2004         /* The kstat "name" field is not large enough for a full zonename */
2005         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2006         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2007         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2008         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2009         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2010         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2011         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2012         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2013             KSTAT_DATA_UINT32);
2014         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2015         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2016             KSTAT_DATA_UINT32);
2017         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2018         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2019         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2020             KSTAT_DATA_UINT32);
2021         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2022         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2023 
2024         ksp->ks_update = zone_misc_kstat_update;
2025         ksp->ks_private = zone;
2026 
2027         kstat_install(ksp);
2028         return (ksp);
2029 }
2030 
2031 static void
2032 zone_kstat_create(zone_t *zone)
2033 {
2034         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2035             "lockedmem", zone_lockedmem_kstat_update);
2036         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2037             "swapresv", zone_swapresv_kstat_update);
2038         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2039             "nprocs", zone_nprocs_kstat_update);
2040 
2041         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2042                 zone->zone_mcap_stats = kmem_zalloc(
2043                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2044         }
2045 
2046         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2047                 zone->zone_misc_stats = kmem_zalloc(
2048                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2049         }
2050 }
2051 
2052 static void
2053 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2054 {
2055         void *data;
2056 
2057         if (*pkstat != NULL) {
2058                 data = (*pkstat)->ks_data;
2059                 kstat_delete(*pkstat);
2060                 kmem_free(data, datasz);
2061                 *pkstat = NULL;
2062         }
2063 }
2064 
2065 static void
2066 zone_kstat_delete(zone_t *zone)
2067 {
2068         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2069             sizeof (zone_kstat_t));
2070         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2071             sizeof (zone_kstat_t));
2072         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2073             sizeof (zone_kstat_t));
2074         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2075             sizeof (zone_mcap_kstat_t));
2076         zone_kstat_delete_common(&zone->zone_misc_ksp,
2077             sizeof (zone_misc_kstat_t));
2078 }
2079 
2080 /*
2081  * Called very early on in boot to initialize the ZSD list so that
2082  * zone_key_create() can be called before zone_init().  It also initializes
2083  * portions of zone0 which may be used before zone_init() is called.  The
2084  * variable "global_zone" will be set when zone0 is fully initialized by
2085  * zone_init().
2086  */
2087 void
2088 zone_zsd_init(void)
2089 {
2090         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2091         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2092         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2093             offsetof(struct zsd_entry, zsd_linkage));
2094         list_create(&zone_active, sizeof (zone_t),
2095             offsetof(zone_t, zone_linkage));
2096         list_create(&zone_deathrow, sizeof (zone_t),
2097             offsetof(zone_t, zone_linkage));
2098 
2099         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2100         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2101         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2102         zone0.zone_shares = 1;
2103         zone0.zone_nlwps = 0;
2104         zone0.zone_nlwps_ctl = INT_MAX;
2105         zone0.zone_nprocs = 0;
2106         zone0.zone_nprocs_ctl = INT_MAX;
2107         zone0.zone_locked_mem = 0;
2108         zone0.zone_locked_mem_ctl = UINT64_MAX;
2109         ASSERT(zone0.zone_max_swap == 0);
2110         zone0.zone_max_swap_ctl = UINT64_MAX;
2111         zone0.zone_max_lofi = 0;
2112         zone0.zone_max_lofi_ctl = UINT64_MAX;
2113         zone0.zone_shmmax = 0;
2114         zone0.zone_ipc.ipcq_shmmni = 0;
2115         zone0.zone_ipc.ipcq_semmni = 0;
2116         zone0.zone_ipc.ipcq_msgmni = 0;
2117         zone0.zone_name = GLOBAL_ZONENAME;
2118         zone0.zone_nodename = utsname.nodename;
2119         zone0.zone_domain = srpc_domain;
2120         zone0.zone_hostid = HW_INVALID_HOSTID;
2121         zone0.zone_fs_allowed = NULL;
2122         zone0.zone_ref = 1;
2123         zone0.zone_id = GLOBAL_ZONEID;
2124         zone0.zone_status = ZONE_IS_RUNNING;
2125         zone0.zone_rootpath = "/";
2126         zone0.zone_rootpathlen = 2;
2127         zone0.zone_psetid = ZONE_PS_INVAL;
2128         zone0.zone_ncpus = 0;
2129         zone0.zone_ncpus_online = 0;
2130         zone0.zone_proc_initpid = 1;
2131         zone0.zone_initname = initname;
2132         zone0.zone_lockedmem_kstat = NULL;
2133         zone0.zone_swapresv_kstat = NULL;
2134         zone0.zone_nprocs_kstat = NULL;
2135         zone0.zone_zfs_io_pri = 1;
2136 
2137         zone0.zone_stime = 0;
2138         zone0.zone_utime = 0;
2139         zone0.zone_wtime = 0;
2140 
2141         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2142             offsetof(zone_ref_t, zref_linkage));
2143         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2144             offsetof(struct zsd_entry, zsd_linkage));
2145         list_insert_head(&zone_active, &zone0);
2146 
2147         /*
2148          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2149          * to anything meaningful.  It is assigned to be 'rootdir' in
2150          * vfs_mountroot().
2151          */
2152         zone0.zone_rootvp = NULL;
2153         zone0.zone_vfslist = NULL;
2154         zone0.zone_bootargs = initargs;
2155         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2156         /*
2157          * The global zone has all privileges
2158          */
2159         priv_fillset(zone0.zone_privset);
2160         /*
2161          * Add p0 to the global zone
2162          */
2163         zone0.zone_zsched = &p0;
2164         p0.p_zone = &zone0;
2165 }
2166 
2167 /*
2168  * Compute a hash value based on the contents of the label and the DOI.  The
2169  * hash algorithm is somewhat arbitrary, but is based on the observation that
2170  * humans will likely pick labels that differ by amounts that work out to be
2171  * multiples of the number of hash chains, and thus stirring in some primes
2172  * should help.
2173  */
2174 static uint_t
2175 hash_bylabel(void *hdata, mod_hash_key_t key)
2176 {
2177         const ts_label_t *lab = (ts_label_t *)key;
2178         const uint32_t *up, *ue;
2179         uint_t hash;
2180         int i;
2181 
2182         _NOTE(ARGUNUSED(hdata));
2183 
2184         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2185         /* we depend on alignment of label, but not representation */
2186         up = (const uint32_t *)&lab->tsl_label;
2187         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2188         i = 1;
2189         while (up < ue) {
2190                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2191                 hash += *up + (*up << ((i % 16) + 1));
2192                 up++;
2193                 i++;
2194         }
2195         return (hash);
2196 }
2197 
2198 /*
2199  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2200  * equal).  This may need to be changed if less than / greater than is ever
2201  * needed.
2202  */
2203 static int
2204 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2205 {
2206         ts_label_t *lab1 = (ts_label_t *)key1;
2207         ts_label_t *lab2 = (ts_label_t *)key2;
2208 
2209         return (label_equal(lab1, lab2) ? 0 : 1);
2210 }
2211 
2212 /*
2213  * Called by main() to initialize the zones framework.
2214  */
2215 void
2216 zone_init(void)
2217 {
2218         rctl_dict_entry_t *rde;
2219         rctl_val_t *dval;
2220         rctl_set_t *set;
2221         rctl_alloc_gp_t *gp;
2222         rctl_entity_p_t e;
2223         int res;
2224 
2225         ASSERT(curproc == &p0);
2226 
2227         /*
2228          * Create ID space for zone IDs.  ID 0 is reserved for the
2229          * global zone.
2230          */
2231         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2232 
2233         /*
2234          * Initialize generic zone resource controls, if any.
2235          */
2236         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2237             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2238             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2239             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2240 
2241         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2242             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2243             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2244             RCTL_GLOBAL_INFINITE,
2245             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2246 
2247         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2248             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2249             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2250             1024, 1024, &zone_zfs_io_pri_ops);
2251 
2252         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2253             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2254             INT_MAX, INT_MAX, &zone_lwps_ops);
2255 
2256         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2257             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2258             INT_MAX, INT_MAX, &zone_procs_ops);
2259 
2260         /*
2261          * System V IPC resource controls
2262          */
2263         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2264             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2265             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2266 
2267         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2268             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2269             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2270 
2271         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2272             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2273             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2274 
2275         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2276             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2277             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2278 
2279         /*
2280          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2281          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2282          */
2283         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2284         bzero(dval, sizeof (rctl_val_t));
2285         dval->rcv_value = 1;
2286         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2287         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2288         dval->rcv_action_recip_pid = -1;
2289 
2290         rde = rctl_dict_lookup("zone.cpu-shares");
2291         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2292 
2293         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2294             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2295             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2296             &zone_locked_mem_ops);
2297 
2298         rc_zone_max_swap = rctl_register("zone.max-swap",
2299             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2300             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2301             &zone_max_swap_ops);
2302 
2303         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2304             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2305             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2306             &zone_max_lofi_ops);
2307 
2308         /*
2309          * Initialize the ``global zone''.
2310          */
2311         set = rctl_set_create();
2312         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2313         mutex_enter(&p0.p_lock);
2314         e.rcep_p.zone = &zone0;
2315         e.rcep_t = RCENTITY_ZONE;
2316         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2317             gp);
2318 
2319         zone0.zone_nlwps = p0.p_lwpcnt;
2320         zone0.zone_nprocs = 1;
2321         zone0.zone_ntasks = 1;
2322         mutex_exit(&p0.p_lock);
2323         zone0.zone_restart_init = B_TRUE;
2324         zone0.zone_brand = &native_brand;
2325         rctl_prealloc_destroy(gp);
2326         /*
2327          * pool_default hasn't been initialized yet, so we let pool_init()
2328          * take care of making sure the global zone is in the default pool.
2329          */
2330 
2331         /*
2332          * Initialize global zone kstats
2333          */
2334         zone_kstat_create(&zone0);
2335 
2336         /*
2337          * Initialize zone label.
2338          * mlp are initialized when tnzonecfg is loaded.
2339          */
2340         zone0.zone_slabel = l_admin_low;
2341         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2342         label_hold(l_admin_low);
2343 
2344         /*
         * Initialize the lock for the database structure used by mntfs.
2346          */
2347         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2348 
2349         mutex_enter(&zonehash_lock);
2350         zone_uniqid(&zone0);
2351         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2352 
2353         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2354             mod_hash_null_valdtor);
2355         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2356             zone_hash_size, mod_hash_null_valdtor);
2357         /*
2358          * maintain zonehashbylabel only for labeled systems
2359          */
2360         if (is_system_labeled())
2361                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2362                     zone_hash_size, mod_hash_null_keydtor,
2363                     mod_hash_null_valdtor, hash_bylabel, NULL,
2364                     hash_labelkey_cmp, KM_SLEEP);
2365         zonecount = 1;
2366 
2367         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2368             (mod_hash_val_t)&zone0);
2369         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2370             (mod_hash_val_t)&zone0);
2371         if (is_system_labeled()) {
2372                 zone0.zone_flags |= ZF_HASHED_LABEL;
2373                 (void) mod_hash_insert(zonehashbylabel,
2374                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2375         }
2376         mutex_exit(&zonehash_lock);
2377 
2378         /*
2379          * We avoid setting zone_kcred until now, since kcred is initialized
2380          * sometime after zone_zsd_init() and before zone_init().
2381          */
2382         zone0.zone_kcred = kcred;
2383         /*
2384          * The global zone is fully initialized (except for zone_rootvp which
2385          * will be set when the root filesystem is mounted).
2386          */
2387         global_zone = &zone0;
2388 
2389         /*
2390          * Setup an event channel to send zone status change notifications on
2391          */
2392         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2393             EVCH_CREAT);
2394 
        if (res)
                panic("sysevent_evc_bind failed during zone setup");
2397 
2398 }
2399 
2400 static void
2401 zone_free(zone_t *zone)
2402 {
2403         ASSERT(zone != global_zone);
2404         ASSERT(zone->zone_ntasks == 0);
2405         ASSERT(zone->zone_nlwps == 0);
2406         ASSERT(zone->zone_nprocs == 0);
2407         ASSERT(zone->zone_cred_ref == 0);
2408         ASSERT(zone->zone_kcred == NULL);
2409         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2410             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2411         ASSERT(list_is_empty(&zone->zone_ref_list));
2412 
2413         /*
2414          * Remove any zone caps.
2415          */
2416         cpucaps_zone_remove(zone);
2417 
2418         ASSERT(zone->zone_cpucap == NULL);
2419 
2420         /* remove from deathrow list */
2421         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2422                 ASSERT(zone->zone_ref == 0);
2423                 mutex_enter(&zone_deathrow_lock);
2424                 list_remove(&zone_deathrow, zone);
2425                 mutex_exit(&zone_deathrow_lock);
2426         }
2427 
2428         list_destroy(&zone->zone_ref_list);
2429         zone_free_zsd(zone);
2430         zone_free_datasets(zone);
2431         list_destroy(&zone->zone_dl_list);
2432 
2433         if (zone->zone_rootvp != NULL)
2434                 VN_RELE(zone->zone_rootvp);
2435         if (zone->zone_rootpath)
2436                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2437         if (zone->zone_name != NULL)
2438                 kmem_free(zone->zone_name, ZONENAME_MAX);
2439         if (zone->zone_slabel != NULL)
2440                 label_rele(zone->zone_slabel);
2441         if (zone->zone_nodename != NULL)
2442                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2443         if (zone->zone_domain != NULL)
2444                 kmem_free(zone->zone_domain, _SYS_NMLN);
2445         if (zone->zone_privset != NULL)
2446                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2447         if (zone->zone_rctls != NULL)
2448                 rctl_set_free(zone->zone_rctls);
2449         if (zone->zone_bootargs != NULL)
2450                 strfree(zone->zone_bootargs);
2451         if (zone->zone_initname != NULL)
2452                 strfree(zone->zone_initname);
2453         if (zone->zone_fs_allowed != NULL)
2454                 strfree(zone->zone_fs_allowed);
2455         if (zone->zone_pfexecd != NULL)
2456                 klpd_freelist(&zone->zone_pfexecd);
2457         id_free(zoneid_space, zone->zone_id);
2458         mutex_destroy(&zone->zone_lock);
2459         cv_destroy(&zone->zone_cv);
2460         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2461         rw_destroy(&zone->zone_mntfs_db_lock);
2462         kmem_free(zone, sizeof (zone_t));
2463 }
2464 
2465 /*
2466  * See block comment at the top of this file for information about zone
2467  * status values.
2468  */
2469 /*
2470  * Convenience function for setting zone status.
2471  */
2472 static void
2473 zone_status_set(zone_t *zone, zone_status_t status)
2474 {
2475 
2476         nvlist_t *nvl = NULL;
2477         ASSERT(MUTEX_HELD(&zone_status_lock));
2478         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2479             status >= zone_status_get(zone));
2480 
2481         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2482             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2483             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2484             zone_status_table[status]) ||
2485             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2486             zone_status_table[zone->zone_status]) ||
2487             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2488             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2489             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2490             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2491 #ifdef DEBUG
2492                 (void) printf(
2493                     "Failed to allocate and send zone state change event.\n");
2494 #endif
2495         }
2496         nvlist_free(nvl);
2497 
2498         zone->zone_status = status;
2499 
2500         cv_broadcast(&zone->zone_cv);
2501 }
2502 
2503 /*
2504  * Public function to retrieve the zone status.  The zone status may
2505  * change after it is retrieved.
2506  */
2507 zone_status_t
2508 zone_status_get(zone_t *zone)
2509 {
2510         return (zone->zone_status);
2511 }
2512 
2513 static int
2514 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2515 {
2516         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2517         int err = 0;
2518 
2519         ASSERT(zone != global_zone);
2520         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2521                 goto done;      /* EFAULT or ENAMETOOLONG */
2522 
2523         if (zone->zone_bootargs != NULL)
2524                 strfree(zone->zone_bootargs);
2525 
2526         zone->zone_bootargs = strdup(buf);
2527 
2528 done:
2529         kmem_free(buf, BOOTARGS_MAX);
2530         return (err);
2531 }
2532 
2533 static int
2534 zone_set_brand(zone_t *zone, const char *brand)
2535 {
2536         struct brand_attr *attrp;
2537         brand_t *bp;
2538 
2539         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2540         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2541                 kmem_free(attrp, sizeof (struct brand_attr));
2542                 return (EFAULT);
2543         }
2544 
2545         bp = brand_register_zone(attrp);
2546         kmem_free(attrp, sizeof (struct brand_attr));
2547         if (bp == NULL)
2548                 return (EINVAL);
2549 
2550         /*
         * This is the only place where a zone can change its brand.
2552          * We already need to hold zone_status_lock to check the zone
2553          * status, so we'll just use that lock to serialize zone
2554          * branding requests as well.
2555          */
2556         mutex_enter(&zone_status_lock);
2557 
        /*
         * Re-branding is not allowed, nor is branding a zone that has
         * already begun booting.
         */
2559         if ((ZONE_IS_BRANDED(zone)) ||
2560             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2561                 mutex_exit(&zone_status_lock);
2562                 brand_unregister_zone(bp);
2563                 return (EINVAL);
2564         }
2565 
2566         /* set up the brand specific data */
2567         zone->zone_brand = bp;
2568         ZBROP(zone)->b_init_brand_data(zone);
2569 
2570         mutex_exit(&zone_status_lock);
2571         return (0);
2572 }
2573 
2574 static int
2575 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2576 {
2577         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2578         int err = 0;
2579 
2580         ASSERT(zone != global_zone);
2581         if ((err = copyinstr(zone_fs_allowed, buf,
2582             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2583                 goto done;
2584 
2585         if (zone->zone_fs_allowed != NULL)
2586                 strfree(zone->zone_fs_allowed);
2587 
2588         zone->zone_fs_allowed = strdup(buf);
2589 
2590 done:
2591         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2592         return (err);
2593 }
2594 
2595 static int
2596 zone_set_initname(zone_t *zone, const char *zone_initname)
2597 {
2598         char initname[INITNAME_SZ];
2599         size_t len;
2600         int err = 0;
2601 
2602         ASSERT(zone != global_zone);
2603         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2604                 return (err);   /* EFAULT or ENAMETOOLONG */
2605 
2606         if (zone->zone_initname != NULL)
2607                 strfree(zone->zone_initname);
2608 
2609         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2610         (void) strcpy(zone->zone_initname, initname);
2611         return (0);
2612 }
2613 
2614 static int
2615 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2616 {
2617         uint64_t mcap;
2618         int err = 0;
2619 
2620         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2621                 zone->zone_phys_mcap = mcap;
2622 
2623         return (err);
2624 }
2625 
2626 static int
2627 zone_set_sched_class(zone_t *zone, const char *new_class)
2628 {
2629         char sched_class[PC_CLNMSZ];
2630         id_t classid;
2631         int err;
2632 
2633         ASSERT(zone != global_zone);
2634         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2635                 return (err);   /* EFAULT or ENAMETOOLONG */
2636 
2637         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2638                 return (set_errno(EINVAL));
2639         zone->zone_defaultcid = classid;
2640         ASSERT(zone->zone_defaultcid > 0 &&
2641             zone->zone_defaultcid < loaded_classes);
2642 
2643         return (0);
2644 }
2645 
2646 /*
2647  * Block indefinitely waiting for (zone_status >= status)
2648  */
2649 void
2650 zone_status_wait(zone_t *zone, zone_status_t status)
2651 {
2652         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2653 
2654         mutex_enter(&zone_status_lock);
2655         while (zone->zone_status < status) {
2656                 cv_wait(&zone->zone_cv, &zone_status_lock);
2657         }
2658         mutex_exit(&zone_status_lock);
2659 }
2660 
2661 /*
2662  * Private CPR-safe version of zone_status_wait().
2663  */
2664 static void
2665 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2666 {
2667         callb_cpr_t cprinfo;
2668 
2669         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2670 
2671         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2672             str);
2673         mutex_enter(&zone_status_lock);
2674         while (zone->zone_status < status) {
2675                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2676                 cv_wait(&zone->zone_cv, &zone_status_lock);
2677                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2678         }
2679         /*
2680          * zone_status_lock is implicitly released by the following.
2681          */
2682         CALLB_CPR_EXIT(&cprinfo);
2683 }
2684 
2685 /*
2686  * Block until zone enters requested state or signal is received.  Return (0)
2687  * if signaled, non-zero otherwise.
2688  */
2689 int
2690 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2691 {
2692         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2693 
2694         mutex_enter(&zone_status_lock);
2695         while (zone->zone_status < status) {
2696                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2697                         mutex_exit(&zone_status_lock);
2698                         return (0);
2699                 }
2700         }
2701         mutex_exit(&zone_status_lock);
2702         return (1);
2703 }
2704 
2705 /*
2706  * Block until the zone enters the requested state or the timeout expires,
2707  * whichever happens first.  Return (-1) if operation timed out, time remaining
2708  * otherwise.
2709  */
2710 clock_t
2711 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2712 {
2713         clock_t timeleft = 0;
2714 
2715         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2716 
2717         mutex_enter(&zone_status_lock);
2718         while (zone->zone_status < status && timeleft != -1) {
2719                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2720         }
2721         mutex_exit(&zone_status_lock);
2722         return (timeleft);
2723 }
2724 
2725 /*
2726  * Block until the zone enters the requested state, the current process is
2727  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2728  * operation timed out, 0 if signaled, time remaining otherwise.
2729  */
2730 clock_t
2731 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2732 {
2733         clock_t timeleft = tim - ddi_get_lbolt();
2734 
2735         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2736 
2737         mutex_enter(&zone_status_lock);
2738         while (zone->zone_status < status) {
2739                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2740                     tim);
2741                 if (timeleft <= 0)
2742                         break;
2743         }
2744         mutex_exit(&zone_status_lock);
2745         return (timeleft);
2746 }
2747 
2748 /*
2749  * Zones have two reference counts: one for references from credential
2750  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2751  * This is so we can allow a zone to be rebooted while there are still
2752  * outstanding cred references, since certain drivers cache dblks (which
2753  * implicitly results in cached creds).  We wait for zone_ref to drop to
2754  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2755  * later freed when the zone_cred_ref drops to 0, though nothing other
2756  * than the zone id and privilege set should be accessed once the zone
2757  * is "dead".
2758  *
2759  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2760  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2761  * to 0.  This can be useful to flush out other sources of cached creds
2762  * that may be less innocuous than the driver case.
2763  *
2764  * Zones also provide a tracked reference counting mechanism in which zone
2765  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2766  * debuggers determine the sources of leaked zone references.  See
2767  * zone_hold_ref() and zone_rele_ref() below for more information.
2768  */
2769 
2770 int zone_wait_for_cred = 0;
2771 
2772 static void
2773 zone_hold_locked(zone_t *z)
2774 {
2775         ASSERT(MUTEX_HELD(&z->zone_lock));
2776         z->zone_ref++;
2777         ASSERT(z->zone_ref != 0);
2778 }
2779 
2780 /*
2781  * Increment the specified zone's reference count.  The zone's zone_t structure
2782  * will not be freed as long as the zone's reference count is nonzero.
2783  * Decrement the zone's reference count via zone_rele().
2784  *
2785  * NOTE: This function should only be used to hold zones for short periods of
2786  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2787  */
2788 void
2789 zone_hold(zone_t *z)
2790 {
2791         mutex_enter(&z->zone_lock);
2792         zone_hold_locked(z);
2793         mutex_exit(&z->zone_lock);
2794 }
2795 
2796 /*
2797  * If the non-cred ref count drops to 1 and either the cred ref count
2798  * is 0 or we aren't waiting for cred references, the zone is ready to
2799  * be destroyed.
2800  */
2801 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2802             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2803 
2804 /*
2805  * Common zone reference release function invoked by zone_rele() and
2806  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2807  * zone's subsystem-specific reference counters are not affected by the
2808  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2809  * removed from the specified zone's reference list.  ref must be non-NULL iff
2810  * subsys is not ZONE_REF_NUM_SUBSYS.
2811  */
2812 static void
2813 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2814 {
2815         boolean_t wakeup;
2816 
2817         mutex_enter(&z->zone_lock);
2818         ASSERT(z->zone_ref != 0);
2819         z->zone_ref--;
2820         if (subsys != ZONE_REF_NUM_SUBSYS) {
2821                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2822                 z->zone_subsys_ref[subsys]--;
2823                 list_remove(&z->zone_ref_list, ref);
2824         }
2825         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2826                 /* no more refs, free the structure */
2827                 mutex_exit(&z->zone_lock);
2828                 zone_free(z);
2829                 return;
2830         }
2831         /* signal zone_destroy so the zone can finish halting */
2832         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2833         mutex_exit(&z->zone_lock);
2834 
2835         if (wakeup) {
2836                 /*
2837                  * Grabbing zonehash_lock here effectively synchronizes with
2838                  * zone_destroy() to avoid missed signals.
2839                  */
2840                 mutex_enter(&zonehash_lock);
2841                 cv_broadcast(&zone_destroy_cv);
2842                 mutex_exit(&zonehash_lock);
2843         }
2844 }
2845 
2846 /*
2847  * Decrement the specified zone's reference count.  The specified zone will
2848  * cease to exist after this function returns if the reference count drops to
2849  * zero.  This function should be paired with zone_hold().
2850  */
2851 void
2852 zone_rele(zone_t *z)
2853 {
2854         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2855 }
2856 
2857 /*
2858  * Initialize a zone reference structure.  This function must be invoked for
2859  * a reference structure before the structure is passed to zone_hold_ref().
2860  */
2861 void
2862 zone_init_ref(zone_ref_t *ref)
2863 {
2864         ref->zref_zone = NULL;
2865         list_link_init(&ref->zref_linkage);
2866 }
2867 
2868 /*
2869  * Acquire a reference to zone z.  The caller must specify the
2870  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2871  * zone_ref_t structure will represent a reference to the specified zone.  Use
2872  * zone_rele_ref() to release the reference.
2873  *
2874  * The referenced zone_t structure will not be freed as long as the zone_t's
2875  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2876  * references.
2877  *
2878  * NOTE: The zone_ref_t structure must be initialized before it is used.
2879  * See zone_init_ref() above.
2880  */
2881 void
2882 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2883 {
2884         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2885 
2886         /*
2887          * Prevent consumers from reusing a reference structure before
2888          * releasing it.
2889          */
2890         VERIFY(ref->zref_zone == NULL);
2891 
2892         ref->zref_zone = z;
2893         mutex_enter(&z->zone_lock);
2894         zone_hold_locked(z);
2895         z->zone_subsys_ref[subsys]++;
2896         ASSERT(z->zone_subsys_ref[subsys] != 0);
2897         list_insert_head(&z->zone_ref_list, ref);
2898         mutex_exit(&z->zone_lock);
2899 }
2900 
2901 /*
2902  * Release the zone reference represented by the specified zone_ref_t.
2903  * The reference is invalid after it's released; however, the zone_ref_t
2904  * structure can be reused without having to invoke zone_init_ref().
2905  * subsys should be the same value that was passed to zone_hold_ref()
2906  * when the reference was acquired.
2907  */
2908 void
2909 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2910 {
2911         zone_rele_common(ref->zref_zone, ref, subsys);
2912 
2913         /*
2914          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2915          * when consumers dereference the reference.  This helps us catch
2916          * consumers who use released references.  Furthermore, this lets
2917          * consumers reuse the zone_ref_t structure without having to
2918          * invoke zone_init_ref().
2919          */
2920         ref->zref_zone = NULL;
2921 }
2922 
2923 void
2924 zone_cred_hold(zone_t *z)
2925 {
2926         mutex_enter(&z->zone_lock);
2927         z->zone_cred_ref++;
2928         ASSERT(z->zone_cred_ref != 0);
2929         mutex_exit(&z->zone_lock);
2930 }
2931 
2932 void
2933 zone_cred_rele(zone_t *z)
2934 {
2935         boolean_t wakeup;
2936 
2937         mutex_enter(&z->zone_lock);
2938         ASSERT(z->zone_cred_ref != 0);
2939         z->zone_cred_ref--;
2940         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2941                 /* no more refs, free the structure */
2942                 mutex_exit(&z->zone_lock);
2943                 zone_free(z);
2944                 return;
2945         }
2946         /*
2947          * If zone_destroy is waiting for the cred references to drain
2948          * out, and they have, signal it.
2949          */
2950         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2951             zone_status_get(z) >= ZONE_IS_DEAD);
2952         mutex_exit(&z->zone_lock);
2953 
2954         if (wakeup) {
2955                 /*
2956                  * Grabbing zonehash_lock here effectively synchronizes with
2957                  * zone_destroy() to avoid missed signals.
2958                  */
2959                 mutex_enter(&zonehash_lock);
2960                 cv_broadcast(&zone_destroy_cv);
2961                 mutex_exit(&zonehash_lock);
2962         }
2963 }
2964 
2965 void
2966 zone_task_hold(zone_t *z)
2967 {
2968         mutex_enter(&z->zone_lock);
2969         z->zone_ntasks++;
2970         ASSERT(z->zone_ntasks != 0);
2971         mutex_exit(&z->zone_lock);
2972 }
2973 
2974 void
2975 zone_task_rele(zone_t *zone)
2976 {
2977         uint_t refcnt;
2978 
2979         mutex_enter(&zone->zone_lock);
2980         ASSERT(zone->zone_ntasks != 0);
2981         refcnt = --zone->zone_ntasks;
2982         if (refcnt > 1) {       /* Common case */
2983                 mutex_exit(&zone->zone_lock);
2984                 return;
2985         }
2986         zone_hold_locked(zone); /* so we can use the zone_t later */
2987         mutex_exit(&zone->zone_lock);
2988         if (refcnt == 1) {
2989                 /*
2990                  * See if the zone is shutting down.
2991                  */
2992                 mutex_enter(&zone_status_lock);
2993                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2994                         goto out;
2995                 }
2996 
2997                 /*
2998                  * Make sure the ntasks didn't change since we
2999                  * dropped zone_lock.
3000                  */
3001                 mutex_enter(&zone->zone_lock);
3002                 if (refcnt != zone->zone_ntasks) {
3003                         mutex_exit(&zone->zone_lock);
3004                         goto out;
3005                 }
3006                 mutex_exit(&zone->zone_lock);
3007 
3008                 /*
3009                  * No more user processes in the zone.  The zone is empty.
3010                  */
3011                 zone_status_set(zone, ZONE_IS_EMPTY);
3012                 goto out;
3013         }
3014 
3015         ASSERT(refcnt == 0);
3016         /*
3017          * zsched has exited; the zone is dead.
3018          */
3019         zone->zone_zsched = NULL;            /* paranoia */
3020         mutex_enter(&zone_status_lock);
3021         zone_status_set(zone, ZONE_IS_DEAD);
3022 out:
3023         mutex_exit(&zone_status_lock);
3024         zone_rele(zone);
3025 }
3026 
3027 zoneid_t
3028 getzoneid(void)
3029 {
3030         return (curproc->p_zone->zone_id);
3031 }
3032 
3033 /*
3034  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3035  * check the validity of a zone's state.
3036  */
3037 static zone_t *
3038 zone_find_all_by_id(zoneid_t zoneid)
3039 {
3040         mod_hash_val_t hv;
3041         zone_t *zone = NULL;
3042 
3043         ASSERT(MUTEX_HELD(&zonehash_lock));
3044 
3045         if (mod_hash_find(zonehashbyid,
3046             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3047                 zone = (zone_t *)hv;
3048         return (zone);
3049 }
3050 
3051 static zone_t *
3052 zone_find_all_by_label(const ts_label_t *label)
3053 {
3054         mod_hash_val_t hv;
3055         zone_t *zone = NULL;
3056 
3057         ASSERT(MUTEX_HELD(&zonehash_lock));
3058 
3059         /*
3060          * zonehashbylabel is not maintained for unlabeled systems
3061          */
3062         if (!is_system_labeled())
3063                 return (NULL);
3064         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3065                 zone = (zone_t *)hv;
3066         return (zone);
3067 }
3068 
3069 static zone_t *
3070 zone_find_all_by_name(char *name)
3071 {
3072         mod_hash_val_t hv;
3073         zone_t *zone = NULL;
3074 
3075         ASSERT(MUTEX_HELD(&zonehash_lock));
3076 
3077         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3078                 zone = (zone_t *)hv;
3079         return (zone);
3080 }
3081 
3082 /*
3083  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3084  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3085  * Caller must call zone_rele() once it is done with the zone.
3086  *
3087  * The zone may begin the zone_destroy() sequence immediately after this
3088  * function returns, but may be safely used until zone_rele() is called.
3089  */
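     /*
      * A typical consumer therefore follows this pattern (illustrative
      * sketch; the errno is the caller's choice):
      *
      *         zone_t *zp;
      *
      *         if ((zp = zone_find_by_id(id)) == NULL)
      *                 return (ESRCH);
      *         ... use zp ...
      *         zone_rele(zp);
      */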
3090 zone_t *
3091 zone_find_by_id(zoneid_t zoneid)
3092 {
3093         zone_t *zone;
3094         zone_status_t status;
3095 
3096         mutex_enter(&zonehash_lock);
3097         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3098                 mutex_exit(&zonehash_lock);
3099                 return (NULL);
3100         }
3101         status = zone_status_get(zone);
3102         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3103                 /*
3104                  * For all practical purposes the zone doesn't exist.
3105                  */
3106                 mutex_exit(&zonehash_lock);
3107                 return (NULL);
3108         }
3109         zone_hold(zone);
3110         mutex_exit(&zonehash_lock);
3111         return (zone);
3112 }
3113 
3114 /*
3115  * Similar to zone_find_by_id, but using zone label as the key.
3116  */
3117 zone_t *
3118 zone_find_by_label(const ts_label_t *label)
3119 {
3120         zone_t *zone;
3121         zone_status_t status;
3122 
3123         mutex_enter(&zonehash_lock);
3124         if ((zone = zone_find_all_by_label(label)) == NULL) {
3125                 mutex_exit(&zonehash_lock);
3126                 return (NULL);
3127         }
3128 
3129         status = zone_status_get(zone);
3130         if (status > ZONE_IS_DOWN) {
3131                 /*
3132                  * For all practical purposes the zone doesn't exist.
3133                  */
3134                 mutex_exit(&zonehash_lock);
3135                 return (NULL);
3136         }
3137         zone_hold(zone);
3138         mutex_exit(&zonehash_lock);
3139         return (zone);
3140 }
3141 
3142 /*
3143  * Similar to zone_find_by_id, but using zone name as the key.
3144  */
3145 zone_t *
3146 zone_find_by_name(char *name)
3147 {
3148         zone_t *zone;
3149         zone_status_t status;
3150 
3151         mutex_enter(&zonehash_lock);
3152         if ((zone = zone_find_all_by_name(name)) == NULL) {
3153                 mutex_exit(&zonehash_lock);
3154                 return (NULL);
3155         }
3156         status = zone_status_get(zone);
3157         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3158                 /*
3159                  * For all practical purposes the zone doesn't exist.
3160                  */
3161                 mutex_exit(&zonehash_lock);
3162                 return (NULL);
3163         }
3164         zone_hold(zone);
3165         mutex_exit(&zonehash_lock);
3166         return (zone);
3167 }
3168 
3169 /*
3170  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3171  * if there is a zone "foo" rooted at /foo/root, and the path argument
3172  * is "/foo/root/proc", it will return the held zone_t corresponding to
3173  * zone "foo".
3174  *
3175  * zone_find_by_path() always returns a non-NULL value, since at the
3176  * very least every path will be contained in the global zone.
3177  *
3178  * As with the other zone_find_by_*() functions, the caller is
3179  * responsible for zone_rele()ing the return value of this function.
3180  */
3181 zone_t *
3182 zone_find_by_path(const char *path)
3183 {
3184         zone_t *zone;
3185         zone_t *zret = NULL;
3186         zone_status_t status;
3187 
3188         if (path == NULL) {
3189                 /*
3190                  * Called from rootconf().
3191                  */
3192                 zone_hold(global_zone);
3193                 return (global_zone);
3194         }
3195         ASSERT(*path == '/');
3196         mutex_enter(&zonehash_lock);
3197         for (zone = list_head(&zone_active); zone != NULL;
3198             zone = list_next(&zone_active, zone)) {
3199                 if (ZONE_PATH_VISIBLE(path, zone))
3200                         zret = zone;
3201         }
3202         ASSERT(zret != NULL);
3203         status = zone_status_get(zret);
3204         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3205                 /*
3206                  * Zone practically doesn't exist.
3207                  */
3208                 zret = global_zone;
3209         }
3210         zone_hold(zret);
3211         mutex_exit(&zonehash_lock);
3212         return (zret);
3213 }
3214 
3215 /*
3216  * Public interface for updating per-zone load averages.  Called once per
3217  * second.
3218  *
3219  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3220  */
3221 void
3222 zone_loadavg_update(void)
3223 {
3224         zone_t *zp;
3225         zone_status_t status;
3226         struct loadavg_s *lavg;
3227         hrtime_t zone_total;
3228         int i;
3229         hrtime_t hr_avg;
3230         int nrun;
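             /*
              * The decay constants below match those used by calcloadavg()
              * in clock.c; they implement exponential decay over 1-, 5- and
              * 15-minute windows in fixed point:
              * (1 - exp(-1/60)) << 13 = 135,
              * (1 - exp(-1/300)) << 13 = 27,
              * (1 - exp(-1/900)) << 13 = 9.
              */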
3231         static int64_t f[3] = { 135, 27, 9 };
3232         int64_t q, r;
3233 
3234         mutex_enter(&zonehash_lock);
3235         for (zp = list_head(&zone_active); zp != NULL;
3236             zp = list_next(&zone_active, zp)) {
3237                 mutex_enter(&zp->zone_lock);
3238 
3239                 /* Skip zones that are on the way down or not yet up */
3240                 status = zone_status_get(zp);
3241                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3242                         /* For all practical purposes the zone doesn't exist. */
3243                         mutex_exit(&zp->zone_lock);
3244                         continue;
3245                 }
3246 
3247                 /*
3248                  * Update the 10 second moving average data in zone_loadavg.
3249                  */
3250                 lavg = &zp->zone_loadavg;
3251 
3252                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3253                 scalehrtime(&zone_total);
3254 
3255                 /* The zone_total should always be increasing. */
3256                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3257                     zone_total - lavg->lg_total : 0;
3258                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3259                 /* lg_total holds the prev. 1 sec. total */
3260                 lavg->lg_total = zone_total;
3261 
3262                 /*
3263                  * To simplify the calculation, we don't calculate the load avg.
3264                  * until the zone has been up for at least 10 seconds and our
3265                  * moving average is thus full.
3266                  */
3267                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3268                         lavg->lg_len++;
3269                         mutex_exit(&zp->zone_lock);
3270                         continue;
3271                 }
3272 
3273                 /* Now calculate the 1min, 5min, 15 min load avg. */
3274                 hr_avg = 0;
3275                 for (i = 0; i < S_LOADAVG_SZ; i++)
3276                         hr_avg += lavg->lg_loads[i];
3277                 hr_avg = hr_avg / S_LOADAVG_SZ;
3278                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3279 
3280                 /* Compute load avg. See comment in calcloadavg() */
3281                 for (i = 0; i < 3; i++) {
3282                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3283                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3284                         zp->zone_hp_avenrun[i] +=
3285                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3286 
3287                         /* avenrun[] can only hold 31 bits of load avg. */
3288                         if (zp->zone_hp_avenrun[i] <
3289                             ((uint64_t)1<<(31+16-FSHIFT)))
3290                                 zp->zone_avenrun[i] = (int32_t)
3291                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3292                         else
3293                                 zp->zone_avenrun[i] = 0x7fffffff;
3294                 }
3295 
3296                 mutex_exit(&zp->zone_lock);
3297         }
3298         mutex_exit(&zonehash_lock);
3299 }
3300 
3301 /*
3302  * Get the number of cpus visible to this zone.  The system-wide global
3303  * 'ncpus' is returned if pools are disabled, the caller is in the
3304  * global zone, or a NULL zone argument is passed in.
3305  */
3306 int
3307 zone_ncpus_get(zone_t *zone)
3308 {
3309         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3310 
3311         return (myncpus != 0 ? myncpus : ncpus);
3312 }
3313 
3314 /*
3315  * Get the number of online cpus visible to this zone.  The system-wide
3316  * global 'ncpus_online' is returned if pools are disabled, the caller
3317  * is in the global zone, or a NULL zone argument is passed in.
3318  */
3319 int
3320 zone_ncpus_online_get(zone_t *zone)
3321 {
3322         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3323 
3324         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3325 }
3326 
3327 /*
3328  * Return the pool to which the zone is currently bound.
3329  */
3330 pool_t *
3331 zone_pool_get(zone_t *zone)
3332 {
3333         ASSERT(pool_lock_held());
3334 
3335         return (zone->zone_pool);
3336 }
3337 
3338 /*
3339  * Set the zone's pool pointer and update the zone's visibility to match
3340  * the resources in the new pool.
3341  */
3342 void
3343 zone_pool_set(zone_t *zone, pool_t *pool)
3344 {
3345         ASSERT(pool_lock_held());
3346         ASSERT(MUTEX_HELD(&cpu_lock));
3347 
3348         zone->zone_pool = pool;
3349         zone_pset_set(zone, pool->pool_pset->pset_id);
3350 }
3351 
3352 /*
3353  * Return the cached value of the id of the processor set to which the
3354  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3355  * facility is disabled.
3356  */
3357 psetid_t
3358 zone_pset_get(zone_t *zone)
3359 {
3360         ASSERT(MUTEX_HELD(&cpu_lock));
3361 
3362         return (zone->zone_psetid);
3363 }
3364 
3365 /*
3366  * Set the cached value of the id of the processor set to which the zone
3367  * is currently bound.  Also update the zone's visibility to match the
3368  * resources in the new processor set.
3369  */
3370 void
3371 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3372 {
3373         psetid_t oldpsetid;
3374 
3375         ASSERT(MUTEX_HELD(&cpu_lock));
3376         oldpsetid = zone_pset_get(zone);
3377 
3378         if (oldpsetid == newpsetid)
3379                 return;
3380         /*
3381          * Global zone sees all.
3382          */
3383         if (zone != global_zone) {
3384                 zone->zone_psetid = newpsetid;
3385                 if (newpsetid != ZONE_PS_INVAL)
3386                         pool_pset_visibility_add(newpsetid, zone);
3387                 if (oldpsetid != ZONE_PS_INVAL)
3388                         pool_pset_visibility_remove(oldpsetid, zone);
3389         }
3390         /*
3391          * If pools are being disabled, start using the global values
3392          * for ncpus and ncpus_online.
3393          */
3394         if (newpsetid == ZONE_PS_INVAL) {
3395                 zone->zone_ncpus = 0;
3396                 zone->zone_ncpus_online = 0;
3397         }
3398 }
3399 
3400 /*
3401  * Walk the list of active zones and issue the provided callback for
3402  * each of them.
3403  *
3404  * Caller must not be holding any locks that may be acquired under
3405  * zonehash_lock.  See comment at the beginning of the file for a list of
3406  * common locks and their interactions with zones.
3407  */
3408 int
3409 zone_walk(int (*cb)(zone_t *, void *), void *data)
3410 {
3411         zone_t *zone;
3412         int ret = 0;
3413         zone_status_t status;
3414 
3415         mutex_enter(&zonehash_lock);
3416         for (zone = list_head(&zone_active); zone != NULL;
3417             zone = list_next(&zone_active, zone)) {
3418                 /*
3419                  * Skip zones that shouldn't be externally visible.
3420                  */
3421                 status = zone_status_get(zone);
3422                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3423                         continue;
3424                 /*
3425                  * Bail immediately if any callback invocation returns a
3426                  * non-zero value.
3427                  */
3428                 ret = (*cb)(zone, data);
3429                 if (ret != 0)
3430                         break;
3431         }
3432         mutex_exit(&zonehash_lock);
3433         return (ret);
3434 }
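
     /*
      * For example (illustrative sketch), a consumer could count the
      * visible zones with a callback such as:
      *
      *         static int
      *         zone_count_cb(zone_t *zp, void *arg)
      *         {
      *                 (*(uint_t *)arg)++;
      *                 return (0);
      *         }
      *
      *         uint_t nzones = 0;
      *         (void) zone_walk(zone_count_cb, &nzones);
      */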
3435 
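     /*
      * Look up the root path supplied from userland, hold the resulting
      * vnode, and record the canonical root path (with a trailing '/') in
      * the zone.  The lookup is retried a bounded number of times if it
      * fails with ESTALE.
      */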
3436 static int
3437 zone_set_root(zone_t *zone, const char *upath)
3438 {
3439         vnode_t *vp;
3440         int trycount;
3441         int error = 0;
3442         char *path;
3443         struct pathname upn, pn;
3444         size_t pathlen;
3445 
3446         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3447                 return (error);
3448 
3449         pn_alloc(&pn);
3450 
3451         /* prevent infinite loop */
3452         trycount = 10;
3453         for (;;) {
3454                 if (--trycount <= 0) {
3455                         error = ESTALE;
3456                         goto out;
3457                 }
3458 
3459                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3460                         /*
3461                          * VOP_ACCESS() may cover 'vp' with a new
3462                          * filesystem, if 'vp' is an autoFS vnode.
3463                          * Get the new 'vp' if so.
3464                          */
3465                         if ((error =
3466                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3467                             (!vn_ismntpt(vp) ||
3468                             (error = traverse(&vp)) == 0)) {
3469                                 pathlen = pn.pn_pathlen + 2;
3470                                 path = kmem_alloc(pathlen, KM_SLEEP);
3471                                 (void) strncpy(path, pn.pn_path,
3472                                     pn.pn_pathlen + 1);
3473                                 path[pathlen - 2] = '/';
3474                                 path[pathlen - 1] = '\0';
3475                                 pn_free(&pn);
3476                                 pn_free(&upn);
3477 
3478                                 /* Success! */
3479                                 break;
3480                         }
3481                         VN_RELE(vp);
3482                 }
3483                 if (error != ESTALE)
3484                         goto out;
3485         }
3486 
3487         ASSERT(error == 0);
3488         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3489         zone->zone_rootpath = path;
3490         zone->zone_rootpathlen = pathlen;
3491         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3492                 zone->zone_flags |= ZF_IS_SCRATCH;
3493         return (0);
3494 
3495 out:
3496         pn_free(&pn);
3497         pn_free(&upn);
3498         return (error);
3499 }
3500 
3501 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3502                         ((c) >= 'a' && (c) <= 'z') || \
3503                         ((c) >= 'A' && (c) <= 'Z'))
3504 
3505 static int
3506 zone_set_name(zone_t *zone, const char *uname)
3507 {
3508         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3509         size_t len;
3510         int i, err;
3511 
3512         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3513                 kmem_free(kname, ZONENAME_MAX);
3514                 return (err);   /* EFAULT or ENAMETOOLONG */
3515         }
3516 
3517         /* must be less than ZONENAME_MAX */
3518         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3519                 kmem_free(kname, ZONENAME_MAX);
3520                 return (EINVAL);
3521         }
3522 
3523         /*
3524          * Name must start with an alphanumeric and must contain only
3525          * alphanumerics, '-', '_' and '.'.
3526          */
3527         if (!isalnum(kname[0])) {
3528                 kmem_free(kname, ZONENAME_MAX);
3529                 return (EINVAL);
3530         }
3531         for (i = 1; i < len - 1; i++) {
3532                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3533                     kname[i] != '.') {
3534                         kmem_free(kname, ZONENAME_MAX);
3535                         return (EINVAL);
3536                 }
3537         }
3538 
3539         zone->zone_name = kname;
3540         return (0);
3541 }
3542 
3543 /*
3544  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3545  * is NULL or it points to a zone with no hostid emulation, then the machine's
3546  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3547  * zero if neither the zone nor the host machine (global zone) has a hostid.  It
3548  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3549  * hostid and the machine's hostid is invalid.
3550  */
3551 uint32_t
3552 zone_get_hostid(zone_t *zonep)
3553 {
3554         unsigned long machine_hostid;
3555 
3556         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3557                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3558                         return (HW_INVALID_HOSTID);
3559                 return ((uint32_t)machine_hostid);
3560         }
3561         return (zonep->zone_hostid);
3562 }
3563 
3564 /*
3565  * Similar to thread_create(), but makes sure the thread is in the appropriate
3566  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3567  */
3568 /*ARGSUSED*/
3569 kthread_t *
3570 zthread_create(
3571     caddr_t stk,
3572     size_t stksize,
3573     void (*proc)(),
3574     void *arg,
3575     size_t len,
3576     pri_t pri)
3577 {
3578         kthread_t *t;
3579         zone_t *zone = curproc->p_zone;
3580         proc_t *pp = zone->zone_zsched;
3581 
3582         zone_hold(zone);        /* Reference to be dropped when thread exits */
3583 
3584         /*
3585          * No-one should be trying to create threads if the zone is shutting
3586          * down and there aren't any kernel threads around.  See comment
3587          * in zthread_exit().
3588          */
3589         ASSERT(!(zone->zone_kthreads == NULL &&
3590             zone_status_get(zone) >= ZONE_IS_EMPTY));
3591         /*
3592          * Create a thread, but don't let it run until we've finished setting
3593          * things up.
3594          */
3595         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3596         ASSERT(t->t_forw == NULL);
3597         mutex_enter(&zone_status_lock);
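             /*
              * Link the new thread into the zone's circular, doubly-linked
              * list of kernel threads (threaded through t_forw/t_back and
              * headed by zone_kthreads).
              */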
3598         if (zone->zone_kthreads == NULL) {
3599                 t->t_forw = t->t_back = t;
3600         } else {
3601                 kthread_t *tx = zone->zone_kthreads;
3602 
3603                 t->t_forw = tx;
3604                 t->t_back = tx->t_back;
3605                 tx->t_back->t_forw = t;
3606                 tx->t_back = t;
3607         }
3608         zone->zone_kthreads = t;
3609         mutex_exit(&zone_status_lock);
3610 
3611         mutex_enter(&pp->p_lock);
3612         t->t_proc_flag |= TP_ZTHREAD;
3613         project_rele(t->t_proj);
3614         t->t_proj = project_hold(pp->p_task->tk_proj);
3615 
3616         /*
3617          * Setup complete, let it run.
3618          */
3619         thread_lock(t);
3620         t->t_schedflag |= TS_ALLSTART;
3621         setrun_locked(t);
3622         thread_unlock(t);
3623 
3624         mutex_exit(&pp->p_lock);
3625 
3626         return (t);
3627 }
3628 
3629 /*
3630  * Similar to thread_exit().  Must be called by threads created via
3631  * zthread_create().
3632  */
3633 void
3634 zthread_exit(void)
3635 {
3636         kthread_t *t = curthread;
3637         proc_t *pp = curproc;
3638         zone_t *zone = pp->p_zone;
3639 
3640         mutex_enter(&zone_status_lock);
3641 
3642         /*
3643          * Reparent to p0
3644          */
3645         kpreempt_disable();
3646         mutex_enter(&pp->p_lock);
3647         t->t_proc_flag &= ~TP_ZTHREAD;
3648         t->t_procp = &p0;
3649         hat_thread_exit(t);
3650         mutex_exit(&pp->p_lock);
3651         kpreempt_enable();
3652 
3653         if (t->t_back == t) {
3654                 ASSERT(t->t_forw == t);
3655                 /*
3656                  * If the zone is empty, once the thread count
3657                  * goes to zero no further kernel threads can be
3658                  * created.  This is because if the creator is a process
3659                  * in the zone, then it must have exited before the zone
3660                  * state could be set to ZONE_IS_EMPTY.
3661                  * Otherwise, if the creator is a kernel thread in the
3662                  * zone, the thread count is non-zero.
3663                  *
3664                  * This really means that non-zone kernel threads should
3665                  * not create zone kernel threads.
3666                  */
3667                 zone->zone_kthreads = NULL;
3668                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3669                         zone_status_set(zone, ZONE_IS_DOWN);
3670                         /*
3671                          * Remove any CPU caps on this zone.
3672                          */
3673                         cpucaps_zone_remove(zone);
3674                 }
3675         } else {
3676                 t->t_forw->t_back = t->t_back;
3677                 t->t_back->t_forw = t->t_forw;
3678                 if (zone->zone_kthreads == t)
3679                         zone->zone_kthreads = t->t_forw;
3680         }
3681         mutex_exit(&zone_status_lock);
3682         zone_rele(zone);
3683         thread_exit();
3684         /* NOTREACHED */
3685 }
3686 
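     /*
      * Point *vpp (the process's current or root directory pointer) at vp,
      * taking a new hold on vp and releasing the hold on the previous vnode.
      */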
3687 static void
3688 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3689 {
3690         vnode_t *oldvp;
3691 
3692         /* we're going to hold a reference here to the directory */
3693         VN_HOLD(vp);
3694 
3695         /* update abs cwd/root path see c2/audit.c */
3696         if (AU_AUDITING())
3697                 audit_chdirec(vp, vpp);
3698 
3699         mutex_enter(&pp->p_lock);
3700         oldvp = *vpp;
3701         *vpp = vp;
3702         mutex_exit(&pp->p_lock);
3703         if (oldvp != NULL)
3704                 VN_RELE(oldvp);
3705 }
3706 
3707 /*
3708  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3709  */
3710 static int
3711 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3712 {
3713         nvpair_t *nvp = NULL;
3714         boolean_t priv_set = B_FALSE;
3715         boolean_t limit_set = B_FALSE;
3716         boolean_t action_set = B_FALSE;
3717 
3718         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3719                 const char *name;
3720                 uint64_t ui64;
3721 
3722                 name = nvpair_name(nvp);
3723                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3724                         return (EINVAL);
3725                 (void) nvpair_value_uint64(nvp, &ui64);
3726                 if (strcmp(name, "privilege") == 0) {
3727                         /*
3728                          * Currently only privileged values are allowed, but
3729                          * this may change in the future.
3730                          */
3731                         if (ui64 != RCPRIV_PRIVILEGED)
3732                                 return (EINVAL);
3733                         rv->rcv_privilege = ui64;
3734                         priv_set = B_TRUE;
3735                 } else if (strcmp(name, "limit") == 0) {
3736                         rv->rcv_value = ui64;
3737                         limit_set = B_TRUE;
3738                 } else if (strcmp(name, "action") == 0) {
3739                         if (ui64 != RCTL_LOCAL_NOACTION &&
3740                             ui64 != RCTL_LOCAL_DENY)
3741                                 return (EINVAL);
3742                         rv->rcv_flagaction = ui64;
3743                         action_set = B_TRUE;
3744                 } else {
3745                         return (EINVAL);
3746                 }
3747         }
3748 
3749         if (!(priv_set && limit_set && action_set))
3750                 return (EINVAL);
3751         rv->rcv_action_signal = 0;
3752         rv->rcv_action_recipient = NULL;
3753         rv->rcv_action_recip_pid = -1;
3754         rv->rcv_firing_time = 0;
3755 
3756         return (0);
3757 }
3758 
3759 /*
3760  * Non-global zone version of start_init.
3761  */
3762 void
3763 zone_start_init(void)
3764 {
3765         proc_t *p = ttoproc(curthread);
3766         zone_t *z = p->p_zone;
3767 
3768         ASSERT(!INGLOBALZONE(curproc));
3769 
3770         /*
3771          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3772          * storing just the pid of init is sufficient.
3773          */
3774         z->zone_proc_initpid = p->p_pid;
3775 
3776         /*
3777          * We maintain zone_boot_err so that we can return the cause of the
3778          * failure back to the caller of the zone_boot syscall.
3779          */
3780         p->p_zone->zone_boot_err = start_init_common();
3781 
3782         /*
3783          * Prevent a booting zone from becoming a running zone if the
3784          * global zone is shutting down.
3785          */
3786         mutex_enter(&zone_status_lock);
3787         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3788             ZONE_IS_SHUTTING_DOWN) {
3789                 /*
3790                  * Make sure we are still in the booting state-- we could have
3791                  * raced and already be shutting down, or even further along.
3792                  */
3793                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3794                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3795                 }
3796                 mutex_exit(&zone_status_lock);
3797                 /* It's gone bad, dispose of the process */
3798                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3799                         mutex_enter(&p->p_lock);
3800                         ASSERT(p->p_flag & SEXITLWPS);
3801                         lwp_exit();
3802                 }
3803         } else {
3804                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3805                         zone_status_set(z, ZONE_IS_RUNNING);
3806                 mutex_exit(&zone_status_lock);
3807                 /* cause the process to return to userland. */
3808                 lwp_rtt();
3809         }
3810 }
3811 
3812 struct zsched_arg {
3813         zone_t *zone;
3814         nvlist_t *nvlist;
3815 };
3816 
3817 /*
3818  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3819  * anything to do with scheduling, but rather with the fact that
3820  * per-zone kernel threads are parented to zsched, just like regular
3821  * kernel threads are parented to sched (p0).
3822  *
3823  * zsched is also responsible for launching init for the zone.
3824  */
3825 static void
3826 zsched(void *arg)
3827 {
3828         struct zsched_arg *za = arg;
3829         proc_t *pp = curproc;
3830         proc_t *initp = proc_init;
3831         zone_t *zone = za->zone;
3832         cred_t *cr, *oldcred;
3833         rctl_set_t *set;
3834         rctl_alloc_gp_t *gp;
3835         contract_t *ct = NULL;
3836         task_t *tk, *oldtk;
3837         rctl_entity_p_t e;
3838         kproject_t *pj;
3839 
3840         nvlist_t *nvl = za->nvlist;
3841         nvpair_t *nvp = NULL;
3842 
3843         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3844         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3845         PTOU(pp)->u_argc = 0;
3846         PTOU(pp)->u_argv = NULL;
3847         PTOU(pp)->u_envp = NULL;
3848         closeall(P_FINFO(pp));
3849 
3850         /*
3851          * We are this zone's "zsched" process.  As the zone isn't generally
3852          * visible yet we don't need to grab any locks before initializing its
3853          * zone_zsched pointer.
3854          */
3855         zone_hold(zone);  /* this hold is released by zone_destroy() */
3856         zone->zone_zsched = pp;
3857         mutex_enter(&pp->p_lock);
3858         pp->p_zone = zone;
3859         mutex_exit(&pp->p_lock);
3860 
3861         /*
3862          * Disassociate process from its 'parent'; parent ourselves to init
3863          * (pid 1) and change other values as needed.
3864          */
3865         sess_create();
3866 
3867         mutex_enter(&pidlock);
3868         proc_detach(pp);
3869         pp->p_ppid = 1;
3870         pp->p_flag |= SZONETOP;
3871         pp->p_ancpid = 1;
3872         pp->p_parent = initp;
3873         pp->p_psibling = NULL;
3874         if (initp->p_child)
3875                 initp->p_child->p_psibling = pp;
3876         pp->p_sibling = initp->p_child;
3877         initp->p_child = pp;
3878 
3879         /* Decrement what newproc() incremented. */
3880         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3881         /*
3882          * Our credentials are about to become kcred-like, so we don't care
3883          * about the caller's ruid.
3884          */
3885         upcount_inc(crgetruid(kcred), zone->zone_id);
3886         mutex_exit(&pidlock);
3887 
3888         /*
3889          * getting out of global zone, so decrement lwp and process counts
3890          */
3891         pj = pp->p_task->tk_proj;
3892         mutex_enter(&global_zone->zone_nlwps_lock);
3893         pj->kpj_nlwps -= pp->p_lwpcnt;
3894         global_zone->zone_nlwps -= pp->p_lwpcnt;
3895         pj->kpj_nprocs--;
3896         global_zone->zone_nprocs--;
3897         mutex_exit(&global_zone->zone_nlwps_lock);
3898 
3899         /*
3900          * Decrement locked memory counts on old zone and project.
3901          */
3902         mutex_enter(&global_zone->zone_mem_lock);
3903         global_zone->zone_locked_mem -= pp->p_locked_mem;
3904         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3905         mutex_exit(&global_zone->zone_mem_lock);
3906 
3907         /*
3908          * Create and join a new task in project '0' of this zone.
3909          *
3910          * We don't need to call holdlwps() since we know we're the only lwp in
3911          * this process.
3912          *
3913          * task_join() returns with p_lock held.
3914          */
3915         tk = task_create(0, zone);
3916         mutex_enter(&cpu_lock);
3917         oldtk = task_join(tk, 0);
3918 
3919         pj = pp->p_task->tk_proj;
3920 
3921         mutex_enter(&zone->zone_mem_lock);
3922         zone->zone_locked_mem += pp->p_locked_mem;
3923         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3924         mutex_exit(&zone->zone_mem_lock);
3925 
3926         /*
3927          * Add lwp and process counts to zsched's zone, and increment the
3928          * project's task and process counts for the task created by
3929          * task_create() above.
3930          */
3931         mutex_enter(&zone->zone_nlwps_lock);
3932         pj->kpj_nlwps += pp->p_lwpcnt;
3933         pj->kpj_ntasks += 1;
3934         zone->zone_nlwps += pp->p_lwpcnt;
3935         pj->kpj_nprocs++;
3936         zone->zone_nprocs++;
3937         mutex_exit(&zone->zone_nlwps_lock);
3938 
3939         mutex_exit(&curproc->p_lock);
3940         mutex_exit(&cpu_lock);
3941         task_rele(oldtk);
3942 
3943         /*
3944          * The process was created by a process in the global zone, hence the
3945          * credentials are wrong.  We might as well have kcred-ish credentials.
3946          */
3947         cr = zone->zone_kcred;
3948         crhold(cr);
3949         mutex_enter(&pp->p_crlock);
3950         oldcred = pp->p_cred;
3951         pp->p_cred = cr;
3952         mutex_exit(&pp->p_crlock);
3953         crfree(oldcred);
3954 
3955         /*
3956          * Hold credentials again (for thread)
3957          */
3958         crhold(cr);
3959 
3960         /*
3961          * p_lwpcnt can't change since this is a kernel process.
3962          */
3963         crset(pp, cr);
3964 
3965         /*
3966          * Chroot
3967          */
3968         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3969         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3970 
3971         /*
3972          * Initialize zone's rctl set.
3973          */
3974         set = rctl_set_create();
3975         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3976         mutex_enter(&pp->p_lock);
3977         e.rcep_p.zone = zone;
3978         e.rcep_t = RCENTITY_ZONE;
3979         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3980         mutex_exit(&pp->p_lock);
3981         rctl_prealloc_destroy(gp);
3982 
3983         /*
3984          * Apply the rctls passed in to zone_create().  This is basically a list
3985          * assignment: all of the old values are removed and the new ones
3986          * inserted.  That is, if an empty list is passed in, all values are
3987          * removed.
3988          */
3989         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3990                 rctl_dict_entry_t *rde;
3991                 rctl_hndl_t hndl;
3992                 char *name;
3993                 nvlist_t **nvlarray;
3994                 uint_t i, nelem;
3995                 int error;      /* For ASSERT()s */
3996 
3997                 name = nvpair_name(nvp);
3998                 hndl = rctl_hndl_lookup(name);
3999                 ASSERT(hndl != -1);
4000                 rde = rctl_dict_lookup_hndl(hndl);
4001                 ASSERT(rde != NULL);
4002 
4003                 for (; /* ever */; ) {
4004                         rctl_val_t oval;
4005 
4006                         mutex_enter(&pp->p_lock);
4007                         error = rctl_local_get(hndl, NULL, &oval, pp);
4008                         mutex_exit(&pp->p_lock);
4009                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4010                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4011                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4012                                 break;
4013                         mutex_enter(&pp->p_lock);
4014                         error = rctl_local_delete(hndl, &oval, pp);
4015                         mutex_exit(&pp->p_lock);
4016                         ASSERT(error == 0);
4017                 }
4018                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4019                 ASSERT(error == 0);
4020                 for (i = 0; i < nelem; i++) {
4021                         rctl_val_t *nvalp;
4022 
4023                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4024                         error = nvlist2rctlval(nvlarray[i], nvalp);
4025                         ASSERT(error == 0);
4026                         /*
4027                          * rctl_local_insert can fail if the value being
4028                          * inserted is a duplicate; this is OK.
4029                          */
4030                         mutex_enter(&pp->p_lock);
4031                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4032                                 kmem_cache_free(rctl_val_cache, nvalp);
4033                         mutex_exit(&pp->p_lock);
4034                 }
4035         }
4036         /*
4037          * Tell the world that we're done setting up.
4038          *
4039          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4040          * and atomically set the zone's processor set visibility.  Once
4041          * we drop pool_lock() this zone will automatically get updated
4042          * to reflect any future changes to the pools configuration.
4043          *
4044          * Note that after we drop the locks below (zonehash_lock in
4045          * particular) other operations such as a zone_getattr call can
4046          * now proceed and observe the zone.  That is the reason for
4047          * transitioning to the INITIALIZED state.
4048          */
4049         pool_lock();
4050         mutex_enter(&cpu_lock);
4051         mutex_enter(&zonehash_lock);
4052         zone_uniqid(zone);
4053         zone_zsd_configure(zone);
4054         if (pool_state == POOL_ENABLED)
4055                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4056         mutex_enter(&zone_status_lock);
4057         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4058         zone_status_set(zone, ZONE_IS_INITIALIZED);
4059         mutex_exit(&zone_status_lock);
4060         mutex_exit(&zonehash_lock);
4061         mutex_exit(&cpu_lock);
4062         pool_unlock();
4063 
4064         /* Now call the create callback for this key */
4065         zsd_apply_all_keys(zsd_apply_create, zone);
4066 
4067         /* The callbacks are complete. Mark ZONE_IS_READY */
4068         mutex_enter(&zone_status_lock);
4069         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4070         zone_status_set(zone, ZONE_IS_READY);
4071         mutex_exit(&zone_status_lock);
4072 
4073         /*
4074          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4075          * we launch init, and set the state to running.
4076          */
4077         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4078 
4079         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4080                 id_t cid;
4081 
4082                 /*
4083                  * Ok, this is a little complicated.  We need to grab the
4084                  * zone's pool's scheduling class ID; note that by now, we
4085                  * are already bound to a pool if we need to be (zoneadmd
4086                  * will have done that to us while we're in the READY
4087                  * state).  *But* the scheduling class for the zone's 'init'
4088                  * must be explicitly passed to newproc, which doesn't
4089                  * respect pool bindings.
4090                  *
4091                  * We hold the pool_lock across the call to newproc() to
4092                  * close the obvious race: the pool's scheduling class
4093                  * could change before we manage to create the LWP with
4094                  * classid 'cid'.
4095                  */
4096                 pool_lock();
4097                 if (zone->zone_defaultcid > 0)
4098                         cid = zone->zone_defaultcid;
4099                 else
4100                         cid = pool_get_class(zone->zone_pool);
4101                 if (cid == -1)
4102                         cid = defaultcid;
4103 
4104                 /*
4105                  * If this fails, zone_boot will ultimately fail.  The
4106                  * state of the zone will be set to SHUTTING_DOWN-- userland
4107                  * will have to tear down the zone, and fail, or try again.
4108                  */
4109                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4110                     minclsyspri - 1, &ct, 0)) != 0) {
4111                         mutex_enter(&zone_status_lock);
4112                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4113                         mutex_exit(&zone_status_lock);
4114                 } else {
4115                         zone->zone_boot_time = gethrestime_sec();
4116                 }
4117 
4118                 pool_unlock();
4119         }
4120 
4121         /*
4122          * Wait for zone_destroy() to be called.  This is what we spend
4123          * most of our life doing.
4124          */
4125         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4126 
4127         if (ct)
4128                 /*
4129                  * At this point the process contract should be empty.
4130                  * (Though if it isn't, it's not the end of the world.)
4131                  */
4132                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4133 
4134         /*
4135          * Allow kcred to be freed when all referring processes
4136          * (including this one) go away.  We can't just do this in
4137          * zone_free because we need to wait for the zone_cred_ref to
4138          * drop to 0 before calling zone_free, and the existence of
4139          * zone_kcred will prevent that.  Thus, we call crfree here to
4140          * balance the crdup in zone_create.  The crhold calls earlier
4141          * in zsched will be dropped when the thread and process exit.
4142          */
4143         crfree(zone->zone_kcred);
4144         zone->zone_kcred = NULL;
4145 
4146         exit(CLD_EXITED, 0);
4147 }
4148 
4149 /*
4150  * Helper function to determine if there are any submounts of the
4151  * provided path.  Used to make sure the zone doesn't "inherit" any
4152  * mounts from before it is created.
4153  */
4154 static uint_t
4155 zone_mount_count(const char *rootpath)
4156 {
4157         vfs_t *vfsp;
4158         uint_t count = 0;
4159         size_t rootpathlen = strlen(rootpath);
4160 
4161         /*
4162          * Holding zonehash_lock prevents race conditions with
4163          * vfs_list_add()/vfs_list_remove() since we serialize with
4164          * zone_find_by_path().
4165          */
4166         ASSERT(MUTEX_HELD(&zonehash_lock));
4167         /*
4168          * The rootpath must end with a '/'
4169          */
4170         ASSERT(rootpath[rootpathlen - 1] == '/');
4171 
4172         /*
4173          * This intentionally does not count the rootpath itself if that
4174          * happens to be a mount point.
4175          */
4176         vfs_list_read_lock();
4177         vfsp = rootvfs;
4178         do {
4179                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4180                     rootpathlen) == 0)
4181                         count++;
4182                 vfsp = vfsp->vfs_next;
4183         } while (vfsp != rootvfs);
4184         vfs_list_unlock();
4185         return (count);
4186 }
4187 
4188 /*
4189  * Helper function to make sure that a zone created on 'rootpath'
4190  * wouldn't end up containing other zones' rootpaths.
4191  */
4192 static boolean_t
4193 zone_is_nested(const char *rootpath)
4194 {
4195         zone_t *zone;
4196         size_t rootpathlen = strlen(rootpath);
4197         size_t len;
4198 
4199         ASSERT(MUTEX_HELD(&zonehash_lock));
4200 
4201         /*
4202          * zone_set_root() appended '/' and '\0' at the end of rootpath
4203          */
4204         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4205             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4206                 return (B_TRUE);
4207 
4208         for (zone = list_head(&zone_active); zone != NULL;
4209             zone = list_next(&zone_active, zone)) {
4210                 if (zone == global_zone)
4211                         continue;
4212                 len = strlen(zone->zone_rootpath);
4213                 if (strncmp(rootpath, zone->zone_rootpath,
4214                     MIN(rootpathlen, len)) == 0)
4215                         return (B_TRUE);
4216         }
4217         return (B_FALSE);
4218 }
4219 
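     /*
      * Copy in the zone-wide privilege limit set from userland and attach
      * it to the zone.
      */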
4220 static int
4221 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4222     size_t zone_privssz)
4223 {
4224         priv_set_t *privs;
4225 
4226         if (zone_privssz < sizeof (priv_set_t))
4227                 return (ENOMEM);
4228 
4229         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4230 
4231         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4232                 kmem_free(privs, sizeof (priv_set_t));
4233                 return (EFAULT);
4234         }
4235 
4236         zone->zone_privset = privs;
4237         return (0);
4238 }
4239 
4240 /*
4241  * We make creative use of nvlists to pass in rctls from userland.  The list is
4242  * a list of the following structures:
4243  *
4244  * (name = rctl_name, value = nvpair_list_array)
4245  *
4246  * Where each element of the nvpair_list_array is of the form:
4247  *
4248  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4249  *      (name = "limit", value = uint64_t),
4250  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4251  */
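     /*
      * For example (illustrative), an entry setting "zone.cpu-shares" to 10
      * would be:
      *
      * (name = "zone.cpu-shares", value = [
      *         [(name = "privilege", value = RCPRIV_PRIVILEGED),
      *          (name = "limit", value = 10),
      *          (name = "action", value = RCTL_LOCAL_NOACTION)]])
      */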
4252 static int
4253 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4254 {
4255         nvpair_t *nvp = NULL;
4256         nvlist_t *nvl = NULL;
4257         char *kbuf;
4258         int error;
4259         rctl_val_t rv;
4260 
4261         *nvlp = NULL;
4262 
4263         if (buflen == 0)
4264                 return (0);
4265 
4266         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4267                 return (ENOMEM);
4268         if (copyin(ubuf, kbuf, buflen)) {
4269                 error = EFAULT;
4270                 goto out;
4271         }
4272         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4273                 /*
4274                  * nvl may have been allocated and freed, with the pointer left
4275                  * non-NULL, so reset it here.
4276                  */
4277                 nvl = NULL;
4278                 error = EINVAL;
4279                 goto out;
4280         }
4281         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4282                 rctl_dict_entry_t *rde;
4283                 rctl_hndl_t hndl;
4284                 nvlist_t **nvlarray;
4285                 uint_t i, nelem;
4286                 char *name;
4287 
4288                 error = EINVAL;
4289                 name = nvpair_name(nvp);
4290                 if (strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 ||
4291                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4292                         goto out;
4293                 }
4294                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4295                         goto out;
4296                 }
4297                 rde = rctl_dict_lookup_hndl(hndl);
4298                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4299                 ASSERT(error == 0);
4300                 for (i = 0; i < nelem; i++) {
4301                         if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
4302                                 goto out;
4303                 }
4304                 if (rctl_invalid_value(rde, &rv)) {
4305                         error = EINVAL;
4306                         goto out;
4307                 }
4308         }
4309         error = 0;
4310         *nvlp = nvl;
4311 out:
4312         kmem_free(kbuf, buflen);
4313         if (error && nvl != NULL)
4314                 nvlist_free(nvl);
4315         return (error);
4316 }
4317 
4318 int
4319 zone_create_error(int er_error, int er_ext, int *er_out)
     {
4320         if (er_out != NULL) {
4321                 if (copyout(&er_ext, er_out, sizeof (int))) {
4322                         return (set_errno(EFAULT));
4323                 }
4324         }
4325         return (set_errno(er_error));
4326 }
4327 
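     /*
      * Copy in the user-supplied sensitivity label and associate it, tagged
      * with the given DOI, with the zone.
      */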
4328 static int
4329 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4330 {
4331         ts_label_t *tsl;
4332         bslabel_t blab;
4333 
4334         /* Get label from user */
4335         if (copyin(lab, &blab, sizeof (blab)) != 0)
4336                 return (EFAULT);
4337         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4338         if (tsl == NULL)
4339                 return (ENOMEM);
4340 
4341         zone->zone_slabel = tsl;
4342         return (0);
4343 }
4344 
4345 /*
4346  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4347  */
4348 static int
4349 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4350 {
4351         char *kbuf;
4352         char *dataset, *next;
4353         zone_dataset_t *zd;
4354         size_t len;
4355 
4356         if (ubuf == NULL || buflen == 0)
4357                 return (0);
4358 
4359         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4360                 return (ENOMEM);
4361 
4362         if (copyin(ubuf, kbuf, buflen) != 0) {
4363                 kmem_free(kbuf, buflen);
4364                 return (EFAULT);
4365         }
4366 
4367         dataset = next = kbuf;
4368         for (;;) {
4369                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4370 
4371                 next = strchr(dataset, ',');
4372 
4373                 if (next == NULL)
4374                         len = strlen(dataset);
4375                 else
4376                         len = next - dataset;
4377 
4378                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4379                 bcopy(dataset, zd->zd_dataset, len);
4380                 zd->zd_dataset[len] = '\0';
4381 
4382                 list_insert_head(&zone->zone_datasets, zd);
4383 
4384                 if (next == NULL)
4385                         break;
4386 
4387                 dataset = next + 1;
4388         }
4389 
4390         kmem_free(kbuf, buflen);
4391         return (0);
4392 }
4393 
4394 /*
4395  * System call to create/initialize a new zone named 'zone_name', rooted
4396  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4397  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4398  * with labeling set by 'match', 'doi', and 'label'.
4399  *
4400  * If extended error is non-null, we may use it to return more detailed
4401  * error information.
4402  */
4403 static zoneid_t
4404 zone_create(const char *zone_name, const char *zone_root,
4405     const priv_set_t *zone_privs, size_t zone_privssz,
4406     caddr_t rctlbuf, size_t rctlbufsz,
4407     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4408     int match, uint32_t doi, const bslabel_t *label,
4409     int flags)
4410 {
4411         struct zsched_arg zarg;
4412         nvlist_t *rctls = NULL;
4413         proc_t *pp = curproc;
4414         zone_t *zone, *ztmp;
4415         zoneid_t zoneid;
4416         int error;
4417         int error2 = 0;
4418         char *str;
4419         cred_t *zkcr;
4420         boolean_t insert_label_hash;
4421 
4422         if (secpolicy_zone_config(CRED()) != 0)
4423                 return (set_errno(EPERM));
4424 
4425         /* can't boot zone from within chroot environment */
4426         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4427                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4428                     extended_error));
4429 
4430         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4431         zoneid = zone->zone_id = id_alloc(zoneid_space);
4432         zone->zone_status = ZONE_IS_UNINITIALIZED;
4433         zone->zone_pool = pool_default;
4434         zone->zone_pool_mod = gethrtime();
4435         zone->zone_psetid = ZONE_PS_INVAL;
4436         zone->zone_ncpus = 0;
4437         zone->zone_ncpus_online = 0;
4438         zone->zone_restart_init = B_TRUE;
4439         zone->zone_brand = &native_brand;
4440         zone->zone_initname = NULL;
4441         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4442         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4443         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4444         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4445         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4446             offsetof(zone_ref_t, zref_linkage));
4447         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4448             offsetof(struct zsd_entry, zsd_linkage));
4449         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4450             offsetof(zone_dataset_t, zd_linkage));
4451         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4452             offsetof(zone_dl_t, zdl_linkage));
4453         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4454         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4455 
4456         if (flags & ZCF_NET_EXCL) {
4457                 zone->zone_flags |= ZF_NET_EXCL;
4458         }
4459 
4460         if ((error = zone_set_name(zone, zone_name)) != 0) {
4461                 zone_free(zone);
4462                 return (zone_create_error(error, 0, extended_error));
4463         }
4464 
4465         if ((error = zone_set_root(zone, zone_root)) != 0) {
4466                 zone_free(zone);
4467                 return (zone_create_error(error, 0, extended_error));
4468         }
4469         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4470                 zone_free(zone);
4471                 return (zone_create_error(error, 0, extended_error));
4472         }
4473 
4474         /* initialize node name to be the same as zone name */
4475         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4476         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4477         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4478 
4479         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4480         zone->zone_domain[0] = '\0';
4481         zone->zone_hostid = HW_INVALID_HOSTID;
4482         zone->zone_shares = 1;
4483         zone->zone_shmmax = 0;
4484         zone->zone_ipc.ipcq_shmmni = 0;
4485         zone->zone_ipc.ipcq_semmni = 0;
4486         zone->zone_ipc.ipcq_msgmni = 0;
4487         zone->zone_bootargs = NULL;
4488         zone->zone_fs_allowed = NULL;
4489         zone->zone_initname =
4490             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4491         (void) strcpy(zone->zone_initname, zone_default_initname);
4492         zone->zone_nlwps = 0;
4493         zone->zone_nlwps_ctl = INT_MAX;
4494         zone->zone_nprocs = 0;
4495         zone->zone_nprocs_ctl = INT_MAX;
4496         zone->zone_locked_mem = 0;
4497         zone->zone_locked_mem_ctl = UINT64_MAX;
4498         zone->zone_max_swap = 0;
4499         zone->zone_max_swap_ctl = UINT64_MAX;
4500         zone->zone_max_lofi = 0;
4501         zone->zone_max_lofi_ctl = UINT64_MAX;
4502         zone->zone_lockedmem_kstat = NULL;
4503         zone->zone_swapresv_kstat = NULL;
4504         zone->zone_zfs_io_pri = 1;
4505 
4506         /*
4507          * Zsched initializes the rctls.
4508          */
4509         zone->zone_rctls = NULL;
4510 
4511         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4512                 zone_free(zone);
4513                 return (zone_create_error(error, 0, extended_error));
4514         }
4515 
4516         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4517                 zone_free(zone);
4518                 return (set_errno(error));
4519         }
4520 
4521         /*
4522          * Read in the trusted system parameters:
4523          * match flag and sensitivity label.
4524          */
4525         zone->zone_match = match;
4526         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4527                 /* Fail if requested to set doi to anything but system's doi */
4528                 if (doi != 0 && doi != default_doi) {
4529                         zone_free(zone);
4530                         return (set_errno(EINVAL));
4531                 }
4532                 /* Always apply system's doi to the zone */
4533                 error = zone_set_label(zone, label, default_doi);
4534                 if (error != 0) {
4535                         zone_free(zone);
4536                         return (set_errno(error));
4537                 }
4538                 insert_label_hash = B_TRUE;
4539         } else {
4540                 /* all zones get an admin_low label if system is not labeled */
4541                 zone->zone_slabel = l_admin_low;
4542                 label_hold(l_admin_low);
4543                 insert_label_hash = B_FALSE;
4544         }
4545 
4546         /*
4547          * Stop all lwps since that's what normally happens as part of fork().
4548          * This needs to happen before we grab any locks to avoid deadlock
4549          * (another lwp in the process could be waiting for the held lock).
4550          */
	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
		zone_free(zone);
		nvlist_free(rctls);
		/* holdlwps() fails only when interrupted, as in cfork() */
		return (zone_create_error(EINTR, 0, extended_error));
	}
4556 
	if (block_mounts(zone) == 0) {
		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		zone_free(zone);
		nvlist_free(rctls);
		/* block_mounts() returns zero only when signal-interrupted */
		return (zone_create_error(EINTR, 0, extended_error));
	}
4566 
4567         /*
4568          * Set up credential for kernel access.  After this, any errors
4569          * should go through the dance in errout rather than calling
4570          * zone_free directly.
4571          */
4572         zone->zone_kcred = crdup(kcred);
4573         crsetzone(zone->zone_kcred, zone);
4574         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4575         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4576         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4577         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4578 
4579         mutex_enter(&zonehash_lock);
4580         /*
4581          * Make sure zone doesn't already exist.
4582          *
4583          * If the system and zone are labeled,
4584          * make sure no other zone exists that has the same label.
4585          */
4586         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4587             (insert_label_hash &&
4588             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4589                 zone_status_t status;
4590 
4591                 status = zone_status_get(ztmp);
4592                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4593                         error = EEXIST;
4594                 else
4595                         error = EBUSY;
4596 
4597                 if (insert_label_hash)
4598                         error2 = ZE_LABELINUSE;
4599 
4600                 goto errout;
4601         }
4602 
4603         /*
4604          * Don't allow zone creations which would cause one zone's rootpath to
4605          * be accessible from that of another (non-global) zone.
4606          */
4607         if (zone_is_nested(zone->zone_rootpath)) {
4608                 error = EBUSY;
4609                 goto errout;
4610         }
4611 
4612         ASSERT(zonecount != 0);         /* check for leaks */
4613         if (zonecount + 1 > maxzones) {
4614                 error = ENOMEM;
4615                 goto errout;
4616         }
4617 
4618         if (zone_mount_count(zone->zone_rootpath) != 0) {
4619                 error = EBUSY;
4620                 error2 = ZE_AREMOUNTS;
4621                 goto errout;
4622         }
4623 
4624         /*
4625          * Zone is still incomplete, but we need to drop all locks while
4626          * zsched() initializes this zone's kernel process.  We
4627          * optimistically add the zone to the hashtable and associated
4628          * lists so a parallel zone_create() doesn't try to create the
4629          * same zone.
4630          */
4631         zonecount++;
4632         (void) mod_hash_insert(zonehashbyid,
4633             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4634             (mod_hash_val_t)(uintptr_t)zone);
4635         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4636         (void) strcpy(str, zone->zone_name);
4637         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4638             (mod_hash_val_t)(uintptr_t)zone);
4639         if (insert_label_hash) {
4640                 (void) mod_hash_insert(zonehashbylabel,
4641                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4642                 zone->zone_flags |= ZF_HASHED_LABEL;
4643         }
4644 
4645         /*
4646          * Insert into active list.  At this point there are no 'hold's
4647          * on the zone, but everyone else knows not to use it, so we can
4648          * continue to use it.  zsched() will do a zone_hold() if the
4649          * newproc() is successful.
4650          */
4651         list_insert_tail(&zone_active, zone);
4652         mutex_exit(&zonehash_lock);
4653 
4654         zarg.zone = zone;
4655         zarg.nvlist = rctls;
4656         /*
4657          * The process, task, and project rctls are probably wrong;
4658          * we need an interface to get the default values of all rctls,
	 * and initialize zsched appropriately.  It's not clear that this
	 * makes much of a difference, though.
4661          */
4662         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4663         if (error != 0) {
4664                 /*
4665                  * We need to undo all globally visible state.
4666                  */
4667                 mutex_enter(&zonehash_lock);
4668                 list_remove(&zone_active, zone);
4669                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4670                         ASSERT(zone->zone_slabel != NULL);
4671                         (void) mod_hash_destroy(zonehashbylabel,
4672                             (mod_hash_key_t)zone->zone_slabel);
4673                 }
4674                 (void) mod_hash_destroy(zonehashbyname,
4675                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4676                 (void) mod_hash_destroy(zonehashbyid,
4677                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4678                 ASSERT(zonecount > 1);
4679                 zonecount--;
4680                 goto errout;
4681         }
4682 
4683         /*
4684          * Zone creation can't fail from now on.
4685          */
4686 
4687         /*
4688          * Create zone kstats
4689          */
4690         zone_kstat_create(zone);
4691 
4692         /*
4693          * Let the other lwps continue.
4694          */
4695         mutex_enter(&pp->p_lock);
4696         if (curthread != pp->p_agenttp)
4697                 continuelwps(pp);
4698         mutex_exit(&pp->p_lock);
4699 
4700         /*
4701          * Wait for zsched to finish initializing the zone.
4702          */
4703         zone_status_wait(zone, ZONE_IS_READY);
4704         /*
4705          * The zone is fully visible, so we can let mounts progress.
4706          */
4707         resume_mounts(zone);
4708         nvlist_free(rctls);
4709 
4710         return (zoneid);
4711 
4712 errout:
4713         mutex_exit(&zonehash_lock);
4714         /*
4715          * Let the other lwps continue.
4716          */
4717         mutex_enter(&pp->p_lock);
4718         if (curthread != pp->p_agenttp)
4719                 continuelwps(pp);
4720         mutex_exit(&pp->p_lock);
4721 
4722         resume_mounts(zone);
4723         nvlist_free(rctls);
4724         /*
4725          * There is currently one reference to the zone, a cred_ref from
4726          * zone_kcred.  To free the zone, we call crfree, which will call
4727          * zone_cred_rele, which will call zone_free.
4728          */
4729         ASSERT(zone->zone_cred_ref == 1);
4730         ASSERT(zone->zone_kcred->cr_ref == 1);
4731         ASSERT(zone->zone_ref == 0);
4732         zkcr = zone->zone_kcred;
4733         zone->zone_kcred = NULL;
4734         crfree(zkcr);                           /* triggers call to zone_free */
4735         return (zone_create_error(error, error2, extended_error));
4736 }
4737 
4738 /*
4739  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4740  * the heavy lifting.  initname is the path to the program to launch
4741  * at the "top" of the zone; if this is NULL, we use the system default,
4742  * which is stored at zone_default_initname.
4743  */
4744 static int
4745 zone_boot(zoneid_t zoneid)
4746 {
4747         int err;
4748         zone_t *zone;
4749 
4750         if (secpolicy_zone_config(CRED()) != 0)
4751                 return (set_errno(EPERM));
4752         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4753                 return (set_errno(EINVAL));
4754 
4755         mutex_enter(&zonehash_lock);
4756         /*
4757          * Look for zone under hash lock to prevent races with calls to
4758          * zone_shutdown, zone_destroy, etc.
4759          */
4760         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4761                 mutex_exit(&zonehash_lock);
4762                 return (set_errno(EINVAL));
4763         }
4764 
4765         mutex_enter(&zone_status_lock);
4766         if (zone_status_get(zone) != ZONE_IS_READY) {
4767                 mutex_exit(&zone_status_lock);
4768                 mutex_exit(&zonehash_lock);
4769                 return (set_errno(EINVAL));
4770         }
4771         zone_status_set(zone, ZONE_IS_BOOTING);
4772         mutex_exit(&zone_status_lock);
4773 
4774         zone_hold(zone);        /* so we can use the zone_t later */
4775         mutex_exit(&zonehash_lock);
4776 
4777         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4778                 zone_rele(zone);
4779                 return (set_errno(EINTR));
4780         }
4781 
4782         /*
4783          * Boot (starting init) might have failed, in which case the zone
4784          * will go to the SHUTTING_DOWN state; an appropriate errno will
4785          * be placed in zone->zone_boot_err, and so we return that.
4786          */
4787         err = zone->zone_boot_err;
4788         zone_rele(zone);
4789         return (err ? set_errno(err) : 0);
4790 }
4791 
4792 /*
4793  * Kills all user processes in the zone, waiting for them all to exit
4794  * before returning.
4795  */
4796 static int
4797 zone_empty(zone_t *zone)
4798 {
4799         int waitstatus;
4800 
4801         /*
4802          * We need to drop zonehash_lock before killing all
4803          * processes, otherwise we'll deadlock with zone_find_*
4804          * which can be called from the exit path.
4805          */
4806         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4807         while ((waitstatus = zone_status_timedwait_sig(zone,
4808             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4809                 killall(zone->zone_id);
4810         }
4811         /*
4812          * return EINTR if we were signaled
4813          */
4814         if (waitstatus == 0)
4815                 return (EINTR);
4816         return (0);
4817 }
4818 
4819 /*
4820  * This function implements the policy for zone visibility.
4821  *
4822  * In standard Solaris, a non-global zone can only see itself.
4823  *
 * In Trusted Extensions, a labeled zone can look up any zone whose label
4825  * it dominates. For this test, the label of the global zone is treated as
4826  * admin_high so it is special-cased instead of being checked for dominance.
4827  *
4828  * Returns true if zone attributes are viewable, false otherwise.
4829  */
4830 static boolean_t
4831 zone_list_access(zone_t *zone)
4832 {
4833 
4834         if (curproc->p_zone == global_zone ||
4835             curproc->p_zone == zone) {
4836                 return (B_TRUE);
4837         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4838                 bslabel_t *curproc_label;
4839                 bslabel_t *zone_label;
4840 
4841                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4842                 zone_label = label2bslabel(zone->zone_slabel);
4843 
4844                 if (zone->zone_id != GLOBAL_ZONEID &&
4845                     bldominates(curproc_label, zone_label)) {
4846                         return (B_TRUE);
4847                 } else {
4848                         return (B_FALSE);
4849                 }
4850         } else {
4851                 return (B_FALSE);
4852         }
4853 }
4854 
4855 /*
 * System call to start the zone's halt sequence.  By the time this
 * function successfully returns, all user processes and kernel threads
 * executing in the zone will have exited, ZSD shutdown callbacks will have
 * been executed, and the zone status will be set to ZONE_IS_DOWN.
 *
 * It is possible that the call will be interrupted by SIGCHLD if the caller
 * is the parent of any process running in the zone and doesn't have SIGCHLD
 * blocked.
4863  */
4864 static int
4865 zone_shutdown(zoneid_t zoneid)
4866 {
4867         int error;
4868         zone_t *zone;
4869         zone_status_t status;
4870 
4871         if (secpolicy_zone_config(CRED()) != 0)
4872                 return (set_errno(EPERM));
4873         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4874                 return (set_errno(EINVAL));
4875 
4876         mutex_enter(&zonehash_lock);
4877         /*
4878          * Look for zone under hash lock to prevent races with other
4879          * calls to zone_shutdown and zone_destroy.
4880          */
4881         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4882                 mutex_exit(&zonehash_lock);
4883                 return (set_errno(EINVAL));
4884         }
4885 
4886         /*
4887          * We have to drop zonehash_lock before calling block_mounts.
4888          * Hold the zone so we can continue to use the zone_t.
4889          */
4890         zone_hold(zone);
4891         mutex_exit(&zonehash_lock);
4892 
4893         /*
4894          * Block mounts so that VFS_MOUNT() can get an accurate view of
4895          * the zone's status with regards to ZONE_IS_SHUTTING down.
4896          *
4897          * e.g. NFS can fail the mount if it determines that the zone
4898          * has already begun the shutdown sequence.
4899          *
4900          */
4901         if (block_mounts(zone) == 0) {
4902                 zone_rele(zone);
4903                 return (set_errno(EINTR));
4904         }
4905 
4906         mutex_enter(&zonehash_lock);
4907         mutex_enter(&zone_status_lock);
4908         status = zone_status_get(zone);
4909         /*
4910          * Fail if the zone isn't fully initialized yet.
4911          */
4912         if (status < ZONE_IS_READY) {
4913                 mutex_exit(&zone_status_lock);
4914                 mutex_exit(&zonehash_lock);
4915                 resume_mounts(zone);
4916                 zone_rele(zone);
4917                 return (set_errno(EINVAL));
4918         }
4919         /*
4920          * If conditions required for zone_shutdown() to return have been met,
4921          * return success.
4922          */
4923         if (status >= ZONE_IS_DOWN) {
4924                 mutex_exit(&zone_status_lock);
4925                 mutex_exit(&zonehash_lock);
4926                 resume_mounts(zone);
4927                 zone_rele(zone);
4928                 return (0);
4929         }
4930         /*
4931          * If zone_shutdown() hasn't been called before, go through the motions.
4932          * If it has, there's nothing to do but wait for the kernel threads to
4933          * drain.
4934          */
4935         if (status < ZONE_IS_EMPTY) {
4936                 uint_t ntasks;
4937 
4938                 mutex_enter(&zone->zone_lock);
4939                 if ((ntasks = zone->zone_ntasks) != 1) {
4940                         /*
4941                          * There's still stuff running.
4942                          */
4943                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4944                 }
4945                 mutex_exit(&zone->zone_lock);
4946                 if (ntasks == 1) {
4947                         /*
4948                          * The only way to create another task is through
4949                          * zone_enter(), which will block until we drop
4950                          * zonehash_lock.  The zone is empty.
4951                          */
4952                         if (zone->zone_kthreads == NULL) {
4953                                 /*
4954                                  * Skip ahead to ZONE_IS_DOWN
4955                                  */
4956                                 zone_status_set(zone, ZONE_IS_DOWN);
4957                         } else {
4958                                 zone_status_set(zone, ZONE_IS_EMPTY);
4959                         }
4960                 }
4961         }
4962         mutex_exit(&zone_status_lock);
4963         mutex_exit(&zonehash_lock);
4964         resume_mounts(zone);
4965 
	if ((error = zone_empty(zone)) != 0) {
		zone_rele(zone);
		return (set_errno(error));
	}
4970         /*
4971          * After the zone status goes to ZONE_IS_DOWN this zone will no
4972          * longer be notified of changes to the pools configuration, so
4973          * in order to not end up with a stale pool pointer, we point
4974          * ourselves at the default pool and remove all resource
4975          * visibility.  This is especially important as the zone_t may
4976          * languish on the deathrow for a very long time waiting for
4977          * cred's to drain out.
4978          *
4979          * This rebinding of the zone can happen multiple times
4980          * (presumably due to interrupted or parallel systemcalls)
4981          * without any adverse effects.
4982          */
4983         if (pool_lock_intr() != 0) {
4984                 zone_rele(zone);
4985                 return (set_errno(EINTR));
4986         }
4987         if (pool_state == POOL_ENABLED) {
4988                 mutex_enter(&cpu_lock);
4989                 zone_pool_set(zone, pool_default);
4990                 /*
4991                  * The zone no longer needs to be able to see any cpus.
4992                  */
4993                 zone_pset_set(zone, ZONE_PS_INVAL);
4994                 mutex_exit(&cpu_lock);
4995         }
4996         pool_unlock();
4997 
4998         /*
4999          * ZSD shutdown callbacks can be executed multiple times, hence
5000          * it is safe to not be holding any locks across this call.
5001          */
5002         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5003 
5004         mutex_enter(&zone_status_lock);
5005         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5006                 zone_status_set(zone, ZONE_IS_DOWN);
5007         mutex_exit(&zone_status_lock);
5008 
5009         /*
5010          * Wait for kernel threads to drain.
5011          */
5012         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5013                 zone_rele(zone);
5014                 return (set_errno(EINTR));
5015         }
5016 
5017         /*
5018          * Zone can be become down/destroyable even if the above wait
5019          * returns EINTR, so any code added here may never execute.
5020          * (i.e. don't add code here)
5021          */
5022 
5023         zone_rele(zone);
5024         return (0);
5025 }
5026 
5027 /*
5028  * Log the specified zone's reference counts.  The caller should not be
5029  * holding the zone's zone_lock.
5030  */
5031 static void
5032 zone_log_refcounts(zone_t *zone)
5033 {
5034         char *buffer;
5035         char *buffer_position;
5036         uint32_t buffer_size;
5037         uint32_t index;
5038         uint_t ref;
5039         uint_t cred_ref;
5040 
5041         /*
5042          * Construct a string representing the subsystem-specific reference
5043          * counts.  The counts are printed in ascending order by index into the
5044          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5045          * square brackets [] and will only contain nonzero reference counts.
5046          *
	 * The buffer will hold two square bracket characters plus, per
	 * subsystem-specific reference count, ten digits, one colon, one
	 * space, one comma, and the characters of the subsystem's name.
	 * (Unsigned 32-bit integers have at most ten decimal digits.)  The
	 * last reference count's comma is replaced by the closing square
	 * bracket and a NUL character to terminate the string.
5053          *
5054          * NOTE: We have to grab the zone's zone_lock to create a consistent
5055          * snapshot of the zone's reference counters.
5056          *
5057          * First, figure out how much space the string buffer will need.
5058          * The buffer's size is stored in buffer_size.
5059          */
5060         buffer_size = 2;                        /* for the square brackets */
5061         mutex_enter(&zone->zone_lock);
5062         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5063         ref = zone->zone_ref;
5064         cred_ref = zone->zone_cred_ref;
5065         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5066                 if (zone->zone_subsys_ref[index] != 0)
5067                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5068                             13;
5069         if (buffer_size == 2) {
5070                 /*
5071                  * No subsystems had nonzero reference counts.  Don't bother
5072                  * with allocating a buffer; just log the general-purpose and
5073                  * credential reference counts.
5074                  */
5075                 mutex_exit(&zone->zone_lock);
5076                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5077                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5078                     "references and %u credential references are still extant",
5079                     zone->zone_name, zone->zone_id, ref, cred_ref);
5080                 return;
5081         }
5082 
5083         /*
5084          * buffer_size contains the exact number of characters that the
5085          * buffer will need.  Allocate the buffer and fill it with nonzero
5086          * subsystem-specific reference counts.  Surround the results with
5087          * square brackets afterwards.
5088          */
5089         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5090         buffer_position = &buffer[1];
5091         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5092                 /*
5093                  * NOTE: The DDI's version of sprintf() returns a pointer to
5094                  * the modified buffer rather than the number of bytes written
5095                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5096                  * Therefore, we'll use snprintf() with INT_MAX to get the
5097                  * number of bytes written.  Using INT_MAX is safe because
5098                  * the buffer is perfectly sized for the data: we'll never
5099                  * overrun the buffer.
5100                  */
5101                 if (zone->zone_subsys_ref[index] != 0)
5102                         buffer_position += snprintf(buffer_position, INT_MAX,
5103                             "%s: %u,", zone_ref_subsys_names[index],
5104                             zone->zone_subsys_ref[index]);
5105         }
5106         mutex_exit(&zone->zone_lock);
5107         buffer[0] = '[';
5108         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5109         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5110         buffer_position[-1] = ']';
5111 
5112         /*
5113          * Log the reference counts and free the message buffer.
5114          */
5115         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5116             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5117             "%u credential references are still extant %s", zone->zone_name,
5118             zone->zone_id, ref, cred_ref, buffer);
5119         kmem_free(buffer, buffer_size);
5120 }
5121 
5122 /*
 * System call entry point to finalize the zone halt process.  The caller
5124  * must have already successfully called zone_shutdown().
5125  *
5126  * Upon successful completion, the zone will have been fully destroyed:
5127  * zsched will have exited, destructor callbacks executed, and the zone
5128  * removed from the list of active zones.
5129  */
5130 static int
5131 zone_destroy(zoneid_t zoneid)
5132 {
5133         uint64_t uniqid;
5134         zone_t *zone;
5135         zone_status_t status;
5136         clock_t wait_time;
5137         boolean_t log_refcounts;
5138 
5139         if (secpolicy_zone_config(CRED()) != 0)
5140                 return (set_errno(EPERM));
5141         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5142                 return (set_errno(EINVAL));
5143 
5144         mutex_enter(&zonehash_lock);
5145         /*
5146          * Look for zone under hash lock to prevent races with other
5147          * calls to zone_destroy.
5148          */
5149         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5150                 mutex_exit(&zonehash_lock);
5151                 return (set_errno(EINVAL));
5152         }
5153 
5154         if (zone_mount_count(zone->zone_rootpath) != 0) {
5155                 mutex_exit(&zonehash_lock);
5156                 return (set_errno(EBUSY));
5157         }
5158         mutex_enter(&zone_status_lock);
5159         status = zone_status_get(zone);
5160         if (status < ZONE_IS_DOWN) {
5161                 mutex_exit(&zone_status_lock);
5162                 mutex_exit(&zonehash_lock);
5163                 return (set_errno(EBUSY));
5164         } else if (status == ZONE_IS_DOWN) {
5165                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5166         }
5167         mutex_exit(&zone_status_lock);
5168         zone_hold(zone);
5169         mutex_exit(&zonehash_lock);
5170 
5171         /*
5172          * wait for zsched to exit
5173          */
5174         zone_status_wait(zone, ZONE_IS_DEAD);
5175         zone_zsd_callbacks(zone, ZSD_DESTROY);
5176         zone->zone_netstack = NULL;
5177         uniqid = zone->zone_uniqid;
5178         zone_rele(zone);
5179         zone = NULL;    /* potentially free'd */
5180 
5181         log_refcounts = B_FALSE;
5182         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5183         mutex_enter(&zonehash_lock);
5184         for (; /* ever */; ) {
5185                 boolean_t unref;
5186                 boolean_t refs_have_been_logged;
5187 
5188                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5189                     zone->zone_uniqid != uniqid) {
5190                         /*
5191                          * The zone has gone away.  Necessary conditions
5192                          * are met, so we return success.
5193                          */
5194                         mutex_exit(&zonehash_lock);
5195                         return (0);
5196                 }
5197                 mutex_enter(&zone->zone_lock);
5198                 unref = ZONE_IS_UNREF(zone);
5199                 refs_have_been_logged = (zone->zone_flags &
5200                     ZF_REFCOUNTS_LOGGED);
5201                 mutex_exit(&zone->zone_lock);
5202                 if (unref) {
5203                         /*
5204                          * There is only one reference to the zone -- that
5205                          * added when the zone was added to the hashtables --
5206                          * and things will remain this way until we drop
5207                          * zonehash_lock... we can go ahead and cleanup the
5208                          * zone.
5209                          */
5210                         break;
5211                 }
5212 
5213                 /*
5214                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5215                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5216                  * some zone's general-purpose reference count reaches one.
5217                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5218                  * on zone_destroy_cv, then log the zone's reference counts and
5219                  * continue to wait for zone_rele() and zone_cred_rele().
5220                  */
5221                 if (!refs_have_been_logged) {
5222                         if (!log_refcounts) {
5223                                 /*
5224                                  * This thread hasn't timed out waiting on
5225                                  * zone_destroy_cv yet.  Wait wait_time clock
5226                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5227                                  * seconds) for the zone's references to clear.
5228                                  */
5229                                 ASSERT(wait_time > 0);
5230                                 wait_time = cv_reltimedwait_sig(
5231                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5232                                     TR_SEC);
5233                                 if (wait_time > 0) {
5234                                         /*
5235                                          * A thread in zone_rele() or
5236                                          * zone_cred_rele() signaled
5237                                          * zone_destroy_cv before this thread's
5238                                          * wait timed out.  The zone might have
5239                                          * only one reference left; find out!
5240                                          */
5241                                         continue;
5242                                 } else if (wait_time == 0) {
5243                                         /* The thread's process was signaled. */
5244                                         mutex_exit(&zonehash_lock);
5245                                         return (set_errno(EINTR));
5246                                 }
5247 
5248                                 /*
5249                                  * The thread timed out while waiting on
5250                                  * zone_destroy_cv.  Even though the thread
5251                                  * timed out, it has to check whether another
5252                                  * thread woke up from zone_destroy_cv and
5253                                  * destroyed the zone.
5254                                  *
5255                                  * If the zone still exists and has more than
5256                                  * one unreleased general-purpose reference,
5257                                  * then log the zone's reference counts.
5258                                  */
5259                                 log_refcounts = B_TRUE;
5260                                 continue;
5261                         }
5262 
5263                         /*
5264                          * The thread already timed out on zone_destroy_cv while
5265                          * waiting for subsystems to release the zone's last
5266                          * general-purpose references.  Log the zone's reference
5267                          * counts and wait indefinitely on zone_destroy_cv.
5268                          */
5269                         zone_log_refcounts(zone);
5270                 }
5271                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5272                         /* The thread's process was signaled. */
5273                         mutex_exit(&zonehash_lock);
5274                         return (set_errno(EINTR));
5275                 }
5276         }
5277 
5278         /*
5279          * Remove CPU cap for this zone now since we're not going to
5280          * fail below this point.
5281          */
5282         cpucaps_zone_remove(zone);
5283 
5284         /* Get rid of the zone's kstats */
5285         zone_kstat_delete(zone);
5286 
5287         /* remove the pfexecd doors */
5288         if (zone->zone_pfexecd != NULL) {
5289                 klpd_freelist(&zone->zone_pfexecd);
5290                 zone->zone_pfexecd = NULL;
5291         }
5292 
5293         /* free brand specific data */
5294         if (ZONE_IS_BRANDED(zone))
5295                 ZBROP(zone)->b_free_brand_data(zone);
5296 
5297         /* Say goodbye to brand framework. */
5298         brand_unregister_zone(zone->zone_brand);
5299 
5300         /*
5301          * It is now safe to let the zone be recreated; remove it from the
5302          * lists.  The memory will not be freed until the last cred
5303          * reference goes away.
5304          */
5305         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5306         zonecount--;
5307         /* remove from active list and hash tables */
5308         list_remove(&zone_active, zone);
5309         (void) mod_hash_destroy(zonehashbyname,
5310             (mod_hash_key_t)zone->zone_name);
5311         (void) mod_hash_destroy(zonehashbyid,
5312             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5313         if (zone->zone_flags & ZF_HASHED_LABEL)
5314                 (void) mod_hash_destroy(zonehashbylabel,
5315                     (mod_hash_key_t)zone->zone_slabel);
5316         mutex_exit(&zonehash_lock);
5317 
5318         /*
5319          * Release the root vnode; we're not using it anymore.  Nor should any
5320          * other thread that might access it exist.
5321          */
5322         if (zone->zone_rootvp != NULL) {
5323                 VN_RELE(zone->zone_rootvp);
5324                 zone->zone_rootvp = NULL;
5325         }
5326 
5327         /* add to deathrow list */
5328         mutex_enter(&zone_deathrow_lock);
5329         list_insert_tail(&zone_deathrow, zone);
5330         mutex_exit(&zone_deathrow_lock);
5331 
5332         /*
	 * Drop the last reference (which was added by zsched()); this will
	 * free the zone unless there are outstanding cred references.
5335          */
5336         zone_rele(zone);
5337         return (0);
5338 }
5339 
5340 /*
 * System call entry point for zone_getattr(2).
5342  */
5343 static ssize_t
5344 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5345 {
5346         size_t size;
5347         int error = 0, err;
5348         zone_t *zone;
5349         char *zonepath;
5350         char *outstr;
5351         zone_status_t zone_status;
5352         pid_t initpid;
5353         boolean_t global = (curzone == global_zone);
5354         boolean_t inzone = (curzone->zone_id == zoneid);
5355         ushort_t flags;
5356         zone_net_data_t *zbuf;
5357 
5358         mutex_enter(&zonehash_lock);
5359         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5360                 mutex_exit(&zonehash_lock);
5361                 return (set_errno(EINVAL));
5362         }
5363         zone_status = zone_status_get(zone);
5364         if (zone_status < ZONE_IS_INITIALIZED) {
5365                 mutex_exit(&zonehash_lock);
5366                 return (set_errno(EINVAL));
5367         }
5368         zone_hold(zone);
5369         mutex_exit(&zonehash_lock);
5370 
5371         /*
5372          * If not in the global zone, don't show information about other zones,
5373          * unless the system is labeled and the local zone's label dominates
5374          * the other zone.
5375          */
5376         if (!zone_list_access(zone)) {
5377                 zone_rele(zone);
5378                 return (set_errno(EINVAL));
5379         }
5380 
5381         switch (attr) {
5382         case ZONE_ATTR_ROOT:
5383                 if (global) {
5384                         /*
5385                          * Copy the path to trim the trailing "/" (except for
5386                          * the global zone).
5387                          */
5388                         if (zone != global_zone)
5389                                 size = zone->zone_rootpathlen - 1;
5390                         else
5391                                 size = zone->zone_rootpathlen;
5392                         zonepath = kmem_alloc(size, KM_SLEEP);
5393                         bcopy(zone->zone_rootpath, zonepath, size);
5394                         zonepath[size - 1] = '\0';
5395                 } else {
5396                         if (inzone || !is_system_labeled()) {
5397                                 /*
5398                                  * Caller is not in the global zone.
5399                                  * if the query is on the current zone
5400                                  * or the system is not labeled,
5401                                  * just return faked-up path for current zone.
5402                                  */
5403                                 zonepath = "/";
5404                                 size = 2;
5405                         } else {
5406                                 /*
5407                                  * Return related path for current zone.
5408                                  */
5409                                 int prefix_len = strlen(zone_prefix);
5410                                 int zname_len = strlen(zone->zone_name);
5411 
5412                                 size = prefix_len + zname_len + 1;
5413                                 zonepath = kmem_alloc(size, KM_SLEEP);
5414                                 bcopy(zone_prefix, zonepath, prefix_len);
5415                                 bcopy(zone->zone_name, zonepath +
5416                                     prefix_len, zname_len);
5417                                 zonepath[size - 1] = '\0';
5418                         }
5419                 }
5420                 if (bufsize > size)
5421                         bufsize = size;
5422                 if (buf != NULL) {
5423                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5424                         if (err != 0 && err != ENAMETOOLONG)
5425                                 error = EFAULT;
5426                 }
5427                 if (global || (is_system_labeled() && !inzone))
5428                         kmem_free(zonepath, size);
5429                 break;
5430 
5431         case ZONE_ATTR_NAME:
5432                 size = strlen(zone->zone_name) + 1;
5433                 if (bufsize > size)
5434                         bufsize = size;
5435                 if (buf != NULL) {
5436                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5437                         if (err != 0 && err != ENAMETOOLONG)
5438                                 error = EFAULT;
5439                 }
5440                 break;
5441 
5442         case ZONE_ATTR_STATUS:
5443                 /*
5444                  * Since we're not holding zonehash_lock, the zone status
5445                  * may be anything; leave it up to userland to sort it out.
5446                  */
5447                 size = sizeof (zone_status);
5448                 if (bufsize > size)
5449                         bufsize = size;
5450                 zone_status = zone_status_get(zone);
5451                 if (buf != NULL &&
5452                     copyout(&zone_status, buf, bufsize) != 0)
5453                         error = EFAULT;
5454                 break;
5455         case ZONE_ATTR_FLAGS:
5456                 size = sizeof (zone->zone_flags);
5457                 if (bufsize > size)
5458                         bufsize = size;
5459                 flags = zone->zone_flags;
5460                 if (buf != NULL &&
5461                     copyout(&flags, buf, bufsize) != 0)
5462                         error = EFAULT;
5463                 break;
5464         case ZONE_ATTR_PRIVSET:
5465                 size = sizeof (priv_set_t);
5466                 if (bufsize > size)
5467                         bufsize = size;
5468                 if (buf != NULL &&
5469                     copyout(zone->zone_privset, buf, bufsize) != 0)
5470                         error = EFAULT;
5471                 break;
5472         case ZONE_ATTR_UNIQID:
5473                 size = sizeof (zone->zone_uniqid);
5474                 if (bufsize > size)
5475                         bufsize = size;
5476                 if (buf != NULL &&
5477                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5478                         error = EFAULT;
5479                 break;
5480         case ZONE_ATTR_POOLID:
5481                 {
5482                         pool_t *pool;
5483                         poolid_t poolid;
5484 
5485                         if (pool_lock_intr() != 0) {
5486                                 error = EINTR;
5487                                 break;
5488                         }
5489                         pool = zone_pool_get(zone);
5490                         poolid = pool->pool_id;
5491                         pool_unlock();
5492                         size = sizeof (poolid);
5493                         if (bufsize > size)
5494                                 bufsize = size;
5495                         if (buf != NULL && copyout(&poolid, buf, size) != 0)
5496                                 error = EFAULT;
5497                 }
5498                 break;
5499         case ZONE_ATTR_SLBL:
5500                 size = sizeof (bslabel_t);
5501                 if (bufsize > size)
5502                         bufsize = size;
5503                 if (zone->zone_slabel == NULL)
5504                         error = EINVAL;
5505                 else if (buf != NULL &&
5506                     copyout(label2bslabel(zone->zone_slabel), buf,
5507                     bufsize) != 0)
5508                         error = EFAULT;
5509                 break;
5510         case ZONE_ATTR_INITPID:
5511                 size = sizeof (initpid);
5512                 if (bufsize > size)
5513                         bufsize = size;
5514                 initpid = zone->zone_proc_initpid;
5515                 if (initpid == -1) {
5516                         error = ESRCH;
5517                         break;
5518                 }
5519                 if (buf != NULL &&
5520                     copyout(&initpid, buf, bufsize) != 0)
5521                         error = EFAULT;
5522                 break;
5523         case ZONE_ATTR_BRAND:
5524                 size = strlen(zone->zone_brand->b_name) + 1;
5525 
5526                 if (bufsize > size)
5527                         bufsize = size;
5528                 if (buf != NULL) {
5529                         err = copyoutstr(zone->zone_brand->b_name, buf,
5530                             bufsize, NULL);
5531                         if (err != 0 && err != ENAMETOOLONG)
5532                                 error = EFAULT;
5533                 }
5534                 break;
5535         case ZONE_ATTR_INITNAME:
5536                 size = strlen(zone->zone_initname) + 1;
5537                 if (bufsize > size)
5538                         bufsize = size;
5539                 if (buf != NULL) {
5540                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5541                             NULL);
5542                         if (err != 0 && err != ENAMETOOLONG)
5543                                 error = EFAULT;
5544                 }
5545                 break;
5546         case ZONE_ATTR_BOOTARGS:
5547                 if (zone->zone_bootargs == NULL)
5548                         outstr = "";
5549                 else
5550                         outstr = zone->zone_bootargs;
5551                 size = strlen(outstr) + 1;
5552                 if (bufsize > size)
5553                         bufsize = size;
5554                 if (buf != NULL) {
5555                         err = copyoutstr(outstr, buf, bufsize, NULL);
5556                         if (err != 0 && err != ENAMETOOLONG)
5557                                 error = EFAULT;
5558                 }
5559                 break;
5560         case ZONE_ATTR_PHYS_MCAP:
5561                 size = sizeof (zone->zone_phys_mcap);
5562                 if (bufsize > size)
5563                         bufsize = size;
5564                 if (buf != NULL &&
5565                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5566                         error = EFAULT;
5567                 break;
5568         case ZONE_ATTR_SCHED_CLASS:
5569                 mutex_enter(&class_lock);
5570 
5571                 if (zone->zone_defaultcid >= loaded_classes)
5572                         outstr = "";
5573                 else
5574                         outstr = sclass[zone->zone_defaultcid].cl_name;
5575                 size = strlen(outstr) + 1;
5576                 if (bufsize > size)
5577                         bufsize = size;
5578                 if (buf != NULL) {
5579                         err = copyoutstr(outstr, buf, bufsize, NULL);
5580                         if (err != 0 && err != ENAMETOOLONG)
5581                                 error = EFAULT;
5582                 }
5583 
5584                 mutex_exit(&class_lock);
5585                 break;
5586         case ZONE_ATTR_HOSTID:
5587                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5588                     bufsize == sizeof (zone->zone_hostid)) {
5589                         size = sizeof (zone->zone_hostid);
5590                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5591                             bufsize) != 0)
5592                                 error = EFAULT;
5593                 } else {
5594                         error = EINVAL;
5595                 }
5596                 break;
5597         case ZONE_ATTR_FS_ALLOWED:
5598                 if (zone->zone_fs_allowed == NULL)
5599                         outstr = "";
5600                 else
5601                         outstr = zone->zone_fs_allowed;
5602                 size = strlen(outstr) + 1;
5603                 if (bufsize > size)
5604                         bufsize = size;
5605                 if (buf != NULL) {
5606                         err = copyoutstr(outstr, buf, bufsize, NULL);
5607                         if (err != 0 && err != ENAMETOOLONG)
5608                                 error = EFAULT;
5609                 }
5610                 break;
5611         case ZONE_ATTR_NETWORK:
5612                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5613                 if (copyin(buf, zbuf, bufsize) != 0) {
5614                         error = EFAULT;
5615                 } else {
5616                         error = zone_get_network(zoneid, zbuf);
5617                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5618                                 error = EFAULT;
5619                 }
5620                 kmem_free(zbuf, bufsize);
5621                 break;
5622         default:
5623                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5624                         size = bufsize;
5625                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5626                 } else {
5627                         error = EINVAL;
5628                 }
5629         }
5630         zone_rele(zone);
5631 
5632         if (error)
5633                 return (set_errno(error));
5634         return ((ssize_t)size);
5635 }
5636 
5637 /*
 * System call entry point for zone_setattr(2).
5639  */
5640 /*ARGSUSED*/
5641 static int
5642 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5643 {
5644         zone_t *zone;
5645         zone_status_t zone_status;
5646         int err = -1;
5647         zone_net_data_t *zbuf;
5648 
5649         if (secpolicy_zone_config(CRED()) != 0)
5650                 return (set_errno(EPERM));
5651 
5652         /*
5653          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5654          * global zone.
5655          */
5656         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5657                 return (set_errno(EINVAL));
5658         }
5659 
5660         mutex_enter(&zonehash_lock);
5661         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5662                 mutex_exit(&zonehash_lock);
5663                 return (set_errno(EINVAL));
5664         }
5665         zone_hold(zone);
5666         mutex_exit(&zonehash_lock);
5667 
5668         /*
5669          * At present most attributes can only be set on non-running,
5670          * non-global zones.
5671          */
5672         zone_status = zone_status_get(zone);
5673         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5674                 err = EINVAL;
5675                 goto done;
5676         }
5677 
5678         switch (attr) {
5679         case ZONE_ATTR_INITNAME:
5680                 err = zone_set_initname(zone, (const char *)buf);
5681                 break;
5682         case ZONE_ATTR_INITNORESTART:
5683                 zone->zone_restart_init = B_FALSE;
5684                 err = 0;
5685                 break;
5686         case ZONE_ATTR_BOOTARGS:
5687                 err = zone_set_bootargs(zone, (const char *)buf);
5688                 break;
5689         case ZONE_ATTR_BRAND:
5690                 err = zone_set_brand(zone, (const char *)buf);
5691                 break;
5692         case ZONE_ATTR_FS_ALLOWED:
5693                 err = zone_set_fs_allowed(zone, (const char *)buf);
5694                 break;
5695         case ZONE_ATTR_PHYS_MCAP:
5696                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5697                 break;
5698         case ZONE_ATTR_SCHED_CLASS:
5699                 err = zone_set_sched_class(zone, (const char *)buf);
5700                 break;
5701         case ZONE_ATTR_HOSTID:
5702                 if (bufsize == sizeof (zone->zone_hostid)) {
5703                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5704                                 err = 0;
5705                         else
5706                                 err = EFAULT;
5707                 } else {
5708                         err = EINVAL;
5709                 }
5710                 break;
5711         case ZONE_ATTR_NETWORK:
5712                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5713                         err = EINVAL;
5714                         break;
5715                 }
5716                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5717                 if (copyin(buf, zbuf, bufsize) != 0) {
5718                         kmem_free(zbuf, bufsize);
5719                         err = EFAULT;
5720                         break;
5721                 }
5722                 err = zone_set_network(zoneid, zbuf);
5723                 kmem_free(zbuf, bufsize);
5724                 break;
5725         default:
5726                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5727                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5728                 else
5729                         err = EINVAL;
5730         }
5731 
5732 done:
5733         zone_rele(zone);
5734         ASSERT(err != -1);
5735         return (err != 0 ? set_errno(err) : 0);
5736 }
5737 
5738 /*
5739  * Return zero if the process has at least one vnode mapped in to its
5740  * address space which shouldn't be allowed to change zones.
5741  *
5742  * Also return zero if the process has any shared mappings which reserve
5743  * swap.  This is because the counting for zone.max-swap does not allow swap
5744  * reservation to be shared between zones.  zone swap reservation is counted
5745  * on zone->zone_max_swap.
5746  */
5747 static int
5748 as_can_change_zones(void)
5749 {
5750         proc_t *pp = curproc;
5751         struct seg *seg;
5752         struct as *as = pp->p_as;
5753         vnode_t *vp;
5754         int allow = 1;
5755 
5756         ASSERT(pp->p_as != &kas);
5757         AS_LOCK_ENTER(as, RW_READER);
5758         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5759 
5760                 /*
5761                  * Cannot enter zone with shared anon memory which
5762                  * reserves swap.  See comment above.
5763                  */
5764                 if (seg_can_change_zones(seg) == B_FALSE) {
5765                         allow = 0;
5766                         break;
5767                 }
5768                 /*
5769                  * if we can't get a backing vnode for this segment then skip
5770                  * it.
5771                  */
5772                 vp = NULL;
5773                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5774                         continue;
5775                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5776                         allow = 0;
5777                         break;
5778                 }
5779         }
5780         AS_LOCK_EXIT(as);
5781         return (allow);
5782 }
5783 
5784 /*
5785  * Count swap reserved by curproc's address space
5786  */
5787 static size_t
5788 as_swresv(void)
5789 {
5790         proc_t *pp = curproc;
5791         struct seg *seg;
5792         struct as *as = pp->p_as;
5793         size_t swap = 0;
5794 
5795         ASSERT(pp->p_as != &kas);
5796         ASSERT(AS_WRITE_HELD(as));
5797         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5798                 swap += seg_swresv(seg);
5799 
5800         return (swap);
5801 }
5802 
5803 /*
 * System call entry point for zone_enter().
5805  *
5806  * The current process is injected into said zone.  In the process
5807  * it will change its project membership, privileges, rootdir/cwd,
5808  * zone-wide rctls, and pool association to match those of the zone.
5809  *
5810  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5811  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5812  * enter a zone that is "ready" or "running".
5813  */
5814 static int
5815 zone_enter(zoneid_t zoneid)
5816 {
5817         zone_t *zone;
5818         vnode_t *vp;
5819         proc_t *pp = curproc;
5820         contract_t *ct;
5821         cont_process_t *ctp;
5822         task_t *tk, *oldtk;
5823         kproject_t *zone_proj0;
5824         cred_t *cr, *newcr;
5825         pool_t *oldpool, *newpool;
5826         sess_t *sp;
5827         uid_t uid;
5828         zone_status_t status;
5829         int err = 0;
5830         rctl_entity_p_t e;
5831         size_t swap;
5832         kthread_id_t t;
5833 
5834         if (secpolicy_zone_config(CRED()) != 0)
5835                 return (set_errno(EPERM));
5836         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5837                 return (set_errno(EINVAL));
5838 
5839         /*
5840          * Stop all lwps so we don't need to hold a lock to look at
5841          * curproc->p_zone.  This needs to happen before we grab any
5842          * locks to avoid deadlock (another lwp in the process could
5843          * be waiting for the held lock).
5844          */
5845         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5846                 return (set_errno(EINTR));
5847 
5848         /*
5849          * Make sure we're not changing zones while we have files open or
5850          * mapped into our address space which shouldn't be changing zones.
5851          */
5852         if (!files_can_change_zones()) {
5853                 err = EBADF;
5854                 goto out;
5855         }
5856         if (!as_can_change_zones()) {
5857                 err = EFAULT;
5858                 goto out;
5859         }
5860 
5861         mutex_enter(&zonehash_lock);
5862         if (pp->p_zone != global_zone) {
5863                 mutex_exit(&zonehash_lock);
5864                 err = EINVAL;
5865                 goto out;
5866         }
5867 
5868         zone = zone_find_all_by_id(zoneid);
5869         if (zone == NULL) {
5870                 mutex_exit(&zonehash_lock);
5871                 err = EINVAL;
5872                 goto out;
5873         }
5874 
5875         /*
5876          * To prevent processes in a zone from holding contracts on
5877          * extrazonal resources, and to avoid process contract
5878          * memberships which span zones, contract holders and processes
5879          * which aren't the sole members of their encapsulating process
5880          * contracts are not allowed to zone_enter.
5881          */
5882         ctp = pp->p_ct_process;
5883         ct = &ctp->conp_contract;
5884         mutex_enter(&ct->ct_lock);
5885         mutex_enter(&pp->p_lock);
5886         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5887                 mutex_exit(&pp->p_lock);
5888                 mutex_exit(&ct->ct_lock);
5889                 mutex_exit(&zonehash_lock);
5890                 err = EINVAL;
5891                 goto out;
5892         }
5893 
5894         /*
5895          * Moreover, we don't allow processes whose encapsulating
5896          * process contracts have inherited extrazonal contracts.
5897          * While it would be easier to eliminate all process contracts
5898          * with inherited contracts, we need to be able to give a
5899          * restarted init (or other zone-penetrating process) its
5900          * predecessor's contracts.
5901          */
5902         if (ctp->conp_ninherited != 0) {
5903                 contract_t *next;
5904                 for (next = list_head(&ctp->conp_inherited); next;
5905                     next = list_next(&ctp->conp_inherited, next)) {
5906                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5907                                 mutex_exit(&pp->p_lock);
5908                                 mutex_exit(&ct->ct_lock);
5909                                 mutex_exit(&zonehash_lock);
5910                                 err = EINVAL;
5911                                 goto out;
5912                         }
5913                 }
5914         }
5915 
5916         mutex_exit(&pp->p_lock);
5917         mutex_exit(&ct->ct_lock);
5918 
5919         status = zone_status_get(zone);
5920         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5921                 /*
5922                  * Can't join
5923                  */
5924                 mutex_exit(&zonehash_lock);
5925                 err = EINVAL;
5926                 goto out;
5927         }
5928 
5929         /*
5930          * Make sure the new priv set is within the caller's permitted set.
5931          */
5932         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5933                 mutex_exit(&zonehash_lock);
5934                 err = EPERM;
5935                 goto out;
5936         }
5937         /*
5938          * We want to momentarily drop zonehash_lock while we optimistically
5939          * bind curproc to the pool it should be running in.  This is safe
5940          * since the zone can't disappear (we have a hold on it).
5941          */
5942         zone_hold(zone);
5943         mutex_exit(&zonehash_lock);
5944 
5945         /*
5946          * Grab pool_lock to keep the pools configuration from changing
5947          * and to stop ourselves from getting rebound to another pool
5948          * until we join the zone.
5949          */
5950         if (pool_lock_intr() != 0) {
5951                 zone_rele(zone);
5952                 err = EINTR;
5953                 goto out;
5954         }
5955         ASSERT(secpolicy_pool(CRED()) == 0);
5956         /*
5957          * Bind ourselves to the pool currently associated with the zone.
5958          */
5959         oldpool = curproc->p_pool;
5960         newpool = zone_pool_get(zone);
5961         if (pool_state == POOL_ENABLED && newpool != oldpool &&
5962             (err = pool_do_bind(newpool, P_PID, P_MYID,
5963             POOL_BIND_ALL)) != 0) {
5964                 pool_unlock();
5965                 zone_rele(zone);
5966                 goto out;
5967         }
5968 
5969         /*
5970          * Grab cpu_lock now; we'll need it later when we call
5971          * task_join().
5972          */
5973         mutex_enter(&cpu_lock);
5974         mutex_enter(&zonehash_lock);
5975         /*
5976          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5977          */
5978         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5979                 /*
5980                  * Can't join anymore.
5981                  */
5982                 mutex_exit(&zonehash_lock);
5983                 mutex_exit(&cpu_lock);
5984                 if (pool_state == POOL_ENABLED &&
5985                     newpool != oldpool)
5986                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
5987                             POOL_BIND_ALL);
5988                 pool_unlock();
5989                 zone_rele(zone);
5990                 err = EINVAL;
5991                 goto out;
5992         }
5993 
5994         /*
5995          * a_lock must be held while transferring locked memory and swap
5996          * reservation from the global zone to the non-global zone because
5997          * asynchronous faults on the process's address space can lock
5998          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5999          * segments, respectively.
6000          */
6001         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6002         swap = as_swresv();
6003         mutex_enter(&pp->p_lock);
6004         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6005         /* verify that we do not exceed any task or lwp limits */
6006         mutex_enter(&zone->zone_nlwps_lock);
6007         /* add new lwps to zone and zone's proj0 */
6008         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6009         zone->zone_nlwps += pp->p_lwpcnt;
6010         /* add 1 task to zone's proj0 */
6011         zone_proj0->kpj_ntasks += 1;
6012 
6013         zone_proj0->kpj_nprocs++;
6014         zone->zone_nprocs++;
6015         mutex_exit(&zone->zone_nlwps_lock);
6016 
6017         mutex_enter(&zone->zone_mem_lock);
6018         zone->zone_locked_mem += pp->p_locked_mem;
6019         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6020         zone->zone_max_swap += swap;
6021         mutex_exit(&zone->zone_mem_lock);
6022 
6023         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6024         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6025         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6026 
6027         /* remove lwps and process from proc's old zone and old project */
6028         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6029         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6030         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6031         pp->p_task->tk_proj->kpj_nprocs--;
6032         pp->p_zone->zone_nprocs--;
6033         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6034 
6035         mutex_enter(&pp->p_zone->zone_mem_lock);
6036         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6037         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6038         pp->p_zone->zone_max_swap -= swap;
6039         mutex_exit(&pp->p_zone->zone_mem_lock);
6040 
6041         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6042         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6043         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6044 
6045         pp->p_flag |= SZONETOP;
6046         pp->p_zone = zone;
6047         mutex_exit(&pp->p_lock);
6048         AS_LOCK_EXIT(pp->p_as);
6049 
6050         /*
6051          * Joining the zone cannot fail from now on.
6052          *
6053          * This means that a lot of the following code can be commonized and
6054          * shared with zsched().
6055          */
6056 
6057         /*
6058          * If the process contract fmri was inherited, we need to
6059          * flag this so that any contract status will not leak
6060          * extra zone information (svc_fmri in this case).
6061          */
6062         if (ctp->conp_svc_ctid != ct->ct_id) {
6063                 mutex_enter(&ct->ct_lock);
6064                 ctp->conp_svc_zone_enter = ct->ct_id;
6065                 mutex_exit(&ct->ct_lock);
6066         }
6067 
6068         /*
6069          * Reset the encapsulating process contract's zone.
6070          */
6071         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6072         contract_setzuniqid(ct, zone->zone_uniqid);
6073 
6074         /*
6075          * Create a new task and associate the process with the project keyed
6076          * by (projid,zoneid).
6077          *
6078          * We might as well be in project 0; the global zone's projid doesn't
6079          * make much sense in a zone anyhow.
6080          *
6081          * This also increments zone_ntasks, and returns with p_lock held.
6082          */
6083         tk = task_create(0, zone);
6084         oldtk = task_join(tk, 0);
6085         mutex_exit(&cpu_lock);
6086 
6087         /*
6088          * call RCTLOP_SET functions on this proc
6089          */
6090         e.rcep_p.zone = zone;
6091         e.rcep_t = RCENTITY_ZONE;
6092         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6093             RCD_CALLBACK);
6094         mutex_exit(&pp->p_lock);
6095 
6096         /*
6097          * We don't need to hold any of zsched's locks here; not only do we
6098          * know the process and zone aren't going away, we know zsched's
6099          * session isn't changing either.
6100          *
6101          * By joining zsched's session here, we mimic the behavior in the
6102          * global zone of init's sid being the pid of sched.  We extend this
6103          * to all zlogin-like zone_enter()'ing processes as well.
6104          */
6105         mutex_enter(&pidlock);
6106         sp = zone->zone_zsched->p_sessp;
6107         sess_hold(zone->zone_zsched);
6108         mutex_enter(&pp->p_lock);
6109         pgexit(pp);
6110         sess_rele(pp->p_sessp, B_TRUE);
6111         pp->p_sessp = sp;
6112         pgjoin(pp, zone->zone_zsched->p_pidp);
6113 
6114         /*
6115          * If any threads are scheduled to be placed on the zone's wait queue
6116          * they should abandon the idea, since the wait queue is changing.
6117          * We need to be holding pidlock & p_lock to do this.
6118          */
6119         if ((t = pp->p_tlist) != NULL) {
6120                 do {
6121                         thread_lock(t);
6122                         /*
6123                          * Kick this thread so that it doesn't sit
6124                          * on the wrong wait queue.
6125                          */
6126                         if (ISWAITING(t))
6127                                 setrun_locked(t);
6128 
6129                         if (t->t_schedflag & TS_ANYWAITQ)
6130                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6131 
6132                         thread_unlock(t);
6133                 } while ((t = t->t_forw) != pp->p_tlist);
6134         }
6135 
6136         /*
6137          * If there is a default scheduling class for the zone and it is not
6138          * the class we are currently in, change all of the threads in the
6139          * process to the new class.  We need to be holding pidlock & p_lock
6140          * when we call parmsset so this is a good place to do it.
6141          */
6142         if (zone->zone_defaultcid > 0 &&
6143             zone->zone_defaultcid != curthread->t_cid) {
6144                 pcparms_t pcparms;
6145 
6146                 pcparms.pc_cid = zone->zone_defaultcid;
6147                 pcparms.pc_clparms[0] = 0;
6148 
6149                 /*
6150                  * If setting the class fails, we still want to enter the zone.
6151                  */
6152                 if ((t = pp->p_tlist) != NULL) {
6153                         do {
6154                                 (void) parmsset(&pcparms, t);
6155                         } while ((t = t->t_forw) != pp->p_tlist);
6156                 }
6157         }
6158 
6159         mutex_exit(&pp->p_lock);
6160         mutex_exit(&pidlock);
6161 
6162         mutex_exit(&zonehash_lock);
6163         /*
6164          * We're firmly in the zone; let pools progress.
6165          */
6166         pool_unlock();
6167         task_rele(oldtk);
6168         /*
6169          * We don't need to retain a hold on the zone since we already
6170          * incremented zone_ntasks, so the zone isn't going anywhere.
6171          */
6172         zone_rele(zone);
6173 
6174         /*
6175          * Chroot
6176          */
6177         vp = zone->zone_rootvp;
6178         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6179         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6180 
6181         /*
6182          * Change process credentials
6183          */
6184         newcr = cralloc();
6185         mutex_enter(&pp->p_crlock);
6186         cr = pp->p_cred;
6187         crcopy_to(cr, newcr);
6188         crsetzone(newcr, zone);
6189         pp->p_cred = newcr;
6190 
6191         /*
6192          * Restrict all process privilege sets to zone limit
6193          */
6194         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6195         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6196         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6197         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6198         mutex_exit(&pp->p_crlock);
6199         crset(pp, newcr);
6200 
6201         /*
6202          * Adjust upcount to reflect zone entry.
6203          */
6204         uid = crgetruid(newcr);
6205         mutex_enter(&pidlock);
6206         upcount_dec(uid, GLOBAL_ZONEID);
6207         upcount_inc(uid, zoneid);
6208         mutex_exit(&pidlock);
6209 
6210         /*
6211          * Set up core file path and content.
6212          */
6213         set_core_defaults();
6214 
6215 out:
6216         /*
6217          * Let the other lwps continue.
6218          */
6219         mutex_enter(&pp->p_lock);
6220         if (curthread != pp->p_agenttp)
6221                 continuelwps(pp);
6222         mutex_exit(&pp->p_lock);
6223 
6224         return (err != 0 ? set_errno(err) : 0);
6225 }
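
     /*
      * An illustrative userland sketch of how this entry point is typically
      * reached, in the style of zlogin(1).  The zone name "myzone" is
      * hypothetical and error handling is elided; zone_enter() below is
      * assumed to be the libc wrapper around this system call:
      *
      *         zoneid_t zid;
      *
      *         if ((zid = getzoneidbyname("myzone")) == -1)
      *                 err(1, "no such zone");
      *         if (zone_enter(zid) != 0)
      *                 err(1, "zone_enter");
      *         (the process and all of its lwps now run inside "myzone")
      */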
6226 
6227 /*
6228  * System call entry point for zone_list(2).
6229  *
6230  * Processes running in a (non-global) zone only see themselves.
6231  * On labeled systems, they see all zones whose label they dominate.
6232  */
6233 static int
6234 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6235 {
6236         zoneid_t *zoneids;
6237         zone_t *zone, *myzone;
6238         uint_t user_nzones, real_nzones;
6239         uint_t domi_nzones;
6240         int error;
6241 
6242         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6243                 return (set_errno(EFAULT));
6244 
6245         myzone = curproc->p_zone;
6246         if (myzone != global_zone) {
6247                 bslabel_t *mybslab;
6248 
6249                 if (!is_system_labeled()) {
6250                         /* just return current zone */
6251                         real_nzones = domi_nzones = 1;
6252                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6253                         zoneids[0] = myzone->zone_id;
6254                 } else {
6255                         /* return all zones that are dominated */
6256                         mutex_enter(&zonehash_lock);
6257                         real_nzones = zonecount;
6258                         domi_nzones = 0;
6259                         if (real_nzones > 0) {
6260                                 zoneids = kmem_alloc(real_nzones *
6261                                     sizeof (zoneid_t), KM_SLEEP);
6262                                 mybslab = label2bslabel(myzone->zone_slabel);
6263                                 for (zone = list_head(&zone_active);
6264                                     zone != NULL;
6265                                     zone = list_next(&zone_active, zone)) {
6266                                         if (zone->zone_id == GLOBAL_ZONEID)
6267                                                 continue;
6268                                         if (zone != myzone &&
6269                                             (zone->zone_flags & ZF_IS_SCRATCH))
6270                                                 continue;
6271                                         /*
6272                                          * Note that a label always dominates
6273                                          * itself, so myzone is always included
6274                                          * in the list.
6275                                          */
6276                                         if (bldominates(mybslab,
6277                                             label2bslabel(zone->zone_slabel))) {
6278                                                 zoneids[domi_nzones++] =
6279                                                     zone->zone_id;
6280                                         }
6281                                 }
6282                         }
6283                         mutex_exit(&zonehash_lock);
6284                 }
6285         } else {
6286                 mutex_enter(&zonehash_lock);
6287                 real_nzones = zonecount;
6288                 domi_nzones = 0;
6289                 if (real_nzones > 0) {
6290                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6291                             KM_SLEEP);
6292                         for (zone = list_head(&zone_active); zone != NULL;
6293                             zone = list_next(&zone_active, zone))
6294                                 zoneids[domi_nzones++] = zone->zone_id;
6295                         ASSERT(domi_nzones == real_nzones);
6296                 }
6297                 mutex_exit(&zonehash_lock);
6298         }
6299 
6300         /*
6301          * If the user has allocated space for fewer entries than we found,
6302          * then return only up to that limit.  Either way, tell the caller
6303          * exactly how many we found.
6304          */
6305         if (domi_nzones < user_nzones)
6306                 user_nzones = domi_nzones;
6307         error = 0;
6308         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6309                 error = EFAULT;
6310         } else if (zoneidlist != NULL && user_nzones != 0) {
6311                 if (copyout(zoneids, zoneidlist,
6312                     user_nzones * sizeof (zoneid_t)) != 0)
6313                         error = EFAULT;
6314         }
6315 
6316         if (real_nzones > 0)
6317                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6318 
6319         if (error != 0)
6320                 return (set_errno(error));
6321         else
6322                 return (0);
6323 }
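
     /*
      * A minimal userland sketch of the sizing convention above, assuming
      * the zone_list() libc wrapper (error handling elided):
      *
      *         uint_t nzones = 0;
      *         zoneid_t *ids;
      *
      *         (void) zone_list(NULL, &nzones);          (learn the count)
      *         ids = malloc(nzones * sizeof (zoneid_t));
      *         (void) zone_list(ids, &nzones);           (fetch up to nzones IDs)
      *
      * The zone count can change between the two calls, which is why the
      * interface always reports exactly how many zones were found.
      */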
6324 
6325 /*
6326  * System call entry point for zone_lookup(2).
6327  *
6328  * Non-global zones are only able to see themselves and (on labeled systems)
6329  * the zones they dominate.
6330  */
6331 static zoneid_t
6332 zone_lookup(const char *zone_name)
6333 {
6334         char *kname;
6335         zone_t *zone;
6336         zoneid_t zoneid;
6337         int err;
6338 
6339         if (zone_name == NULL) {
6340                 /* return caller's zone id */
6341                 return (getzoneid());
6342         }
6343 
6344         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6345         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6346                 kmem_free(kname, ZONENAME_MAX);
6347                 return (set_errno(err));
6348         }
6349 
6350         mutex_enter(&zonehash_lock);
6351         zone = zone_find_all_by_name(kname);
6352         kmem_free(kname, ZONENAME_MAX);
6353         /*
6354          * In a non-global zone, a process can only look up the global zone
6355          * and its own name.  Under Trusted Extensions, zone label dominance
6356          * rules apply.
6356          */
6357         if (zone == NULL ||
6358             zone_status_get(zone) < ZONE_IS_READY ||
6359             !zone_list_access(zone)) {
6360                 mutex_exit(&zonehash_lock);
6361                 return (set_errno(EINVAL));
6362         } else {
6363                 zoneid = zone->zone_id;
6364                 mutex_exit(&zonehash_lock);
6365                 return (zoneid);
6366         }
6367 }
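
     /*
      * From userland this is most commonly reached through
      * getzoneidbyname(3C), e.g. (the zone name is hypothetical):
      *
      *         zoneid_t zid = getzoneidbyname("myzone");
      *         zoneid_t self = getzoneidbyname(NULL);    (caller's own zone)
      */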
6368 
6369 static int
6370 zone_version(int *version_arg)
6371 {
6372         int version = ZONE_SYSCALL_API_VERSION;
6373 
6374         if (copyout(&version, version_arg, sizeof (int)) != 0)
6375                 return (set_errno(EFAULT));
6376         return (0);
6377 }
6378 
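     /*
      * System call entry point for the zone(2) multiplexed system call:
      * cmd selects the operation, and arg1..arg4 are interpreted on a
      * per-command basis by the cases below.
      */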
6379 /* ARGSUSED */
6380 long
6381 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6382 {
6383         zone_def zs;
6384         int err;
6385 
6386         switch (cmd) {
6387         case ZONE_CREATE:
6388                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6389                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6390                                 return (set_errno(EFAULT));
6391                         }
6392                 } else {
6393 #ifdef _SYSCALL32_IMPL
6394                         zone_def32 zs32;
6395 
6396                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6397                                 return (set_errno(EFAULT));
6398                         }
6399                         zs.zone_name =
6400                             (const char *)(unsigned long)zs32.zone_name;
6401                         zs.zone_root =
6402                             (const char *)(unsigned long)zs32.zone_root;
6403                         zs.zone_privs =
6404                             (const struct priv_set *)
6405                             (unsigned long)zs32.zone_privs;
6406                         zs.zone_privssz = zs32.zone_privssz;
6407                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6408                         zs.rctlbufsz = zs32.rctlbufsz;
6409                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6410                         zs.zfsbufsz = zs32.zfsbufsz;
6411                         zs.extended_error =
6412                             (int *)(unsigned long)zs32.extended_error;
6413                         zs.match = zs32.match;
6414                         zs.doi = zs32.doi;
6415                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6416                         zs.flags = zs32.flags;
6417 #else
6418                         panic("get_udatamodel() returned bogus result\n");
6419 #endif
6420                 }
6421 
6422                 return (zone_create(zs.zone_name, zs.zone_root,
6423                     zs.zone_privs, zs.zone_privssz,
6424                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6425                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6426                     zs.extended_error, zs.match, zs.doi,
6427                     zs.label, zs.flags));
6428         case ZONE_BOOT:
6429                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6430         case ZONE_DESTROY:
6431                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6432         case ZONE_GETATTR:
6433                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6434                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6435         case ZONE_SETATTR:
6436                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6437                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6438         case ZONE_ENTER:
6439                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6440         case ZONE_LIST:
6441                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6442         case ZONE_SHUTDOWN:
6443                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6444         case ZONE_LOOKUP:
6445                 return (zone_lookup((const char *)arg1));
6446         case ZONE_VERSION:
6447                 return (zone_version((int *)arg1));
6448         case ZONE_ADD_DATALINK:
6449                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6450                     (datalink_id_t)(uintptr_t)arg2));
6451         case ZONE_DEL_DATALINK:
6452                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6453                     (datalink_id_t)(uintptr_t)arg2));
6454         case ZONE_CHECK_DATALINK: {
6455                 zoneid_t        zoneid;
6456                 boolean_t       need_copyout;
6457 
6458                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6459                         return (set_errno(EFAULT));
6460                 need_copyout = (zoneid == ALL_ZONES);
6461                 err = zone_check_datalink(&zoneid,
6462                     (datalink_id_t)(uintptr_t)arg2);
6463                 if (err == 0 && need_copyout) {
6464                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6465                                 err = EFAULT;
6466                 }
6467                 return (err == 0 ? 0 : set_errno(err));
6468         }
6469         case ZONE_LIST_DATALINK:
6470                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6471                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6472         default:
6473                 return (set_errno(EINVAL));
6474         }
6475 }
6476 
6477 struct zarg {
6478         zone_t *zone;
6479         zone_cmd_arg_t arg;
6480 };
6481 
6482 static int
6483 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6484 {
6485         char *buf;
6486         size_t buflen;
6487         int error;
6488 
6489         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6490         buf = kmem_alloc(buflen, KM_SLEEP);
6491         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6492         error = door_ki_open(buf, doorp);
6493         kmem_free(buf, buflen);
6494         return (error);
6495 }
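
     /*
      * ZONE_DOOR_PATH is a printf-style template; with the usual definition
      * in sys/zone.h the resulting rendezvous point is of the form
      * "/var/run/zones/<name>.zoneadmd_door" (path shown for illustration).
      */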
6496 
6497 static void
6498 zone_release_door(door_handle_t *doorp)
6499 {
6500         door_ki_rele(*doorp);
6501         *doorp = NULL;
6502 }
6503 
6504 static void
6505 zone_ki_call_zoneadmd(struct zarg *zargp)
6506 {
6507         door_handle_t door = NULL;
6508         door_arg_t darg, save_arg;
6509         char *zone_name;
6510         size_t zone_namelen;
6511         zoneid_t zoneid;
6512         zone_t *zone;
6513         zone_cmd_arg_t arg;
6514         uint64_t uniqid;
6515         size_t size;
6516         int error;
6517         int retry;
6518 
6519         zone = zargp->zone;
6520         arg = zargp->arg;
6521         kmem_free(zargp, sizeof (*zargp));
6522 
6523         zone_namelen = strlen(zone->zone_name) + 1;
6524         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6525         bcopy(zone->zone_name, zone_name, zone_namelen);
6526         zoneid = zone->zone_id;
6527         uniqid = zone->zone_uniqid;
6528         /*
6529          * zoneadmd may be down, but at least we can empty out the zone.
6530          * We can ignore the return value of zone_empty() since we're called
6531          * from a kernel thread and know we won't be delivered any signals.
6532          */
6533         ASSERT(curproc == &p0);
6534         (void) zone_empty(zone);
6535         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6536         zone_rele(zone);
6537 
6538         size = sizeof (arg);
6539         darg.rbuf = (char *)&arg;
6540         darg.data_ptr = (char *)&arg;
6541         darg.rsize = size;
6542         darg.data_size = size;
6543         darg.desc_ptr = NULL;
6544         darg.desc_num = 0;
6545 
6546         save_arg = darg;
6547         /*
6548          * Since we're not holding a reference to the zone, any number of
6549          * things can go wrong, including the zone disappearing before we get a
6550          * chance to talk to zoneadmd.
6551          */
6552         for (retry = 0; /* forever */; retry++) {
6553                 if (door == NULL &&
6554                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6555                         goto next;
6556                 }
6557                 ASSERT(door != NULL);
6558 
6559                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6560                     SIZE_MAX, 0)) == 0) {
6561                         break;
6562                 }
6563                 switch (error) {
6564                 case EINTR:
6565                         /* FALLTHROUGH */
6566                 case EAGAIN:    /* process may be forking */
6567                         /*
6568                          * Back off for a bit
6569                          */
6570                         break;
6571                 case EBADF:
6572                         zone_release_door(&door);
6573                         if (zone_lookup_door(zone_name, &door) != 0) {
6574                                 /*
6575                                  * zoneadmd may be dead, but it may come back to
6576                                  * life later.
6577                                  */
6578                                 break;
6579                         }
6580                         break;
6581                 default:
6582                         cmn_err(CE_WARN,
6583                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6584                             error);
6585                         goto out;
6586                 }
6587 next:
6588                 /*
6589                  * If this isn't the same zone_t that we originally had in mind,
6590                  * then this is the same as if two kadmin requests come in at
6591                  * the same time: the first one wins.  This means we lose, so we
6592                  * bail.
6593                  */
6594                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6595                         /*
6596                          * Problem is solved.
6597                          */
6598                         break;
6599                 }
6600                 if (zone->zone_uniqid != uniqid) {
6601                         /*
6602                          * zoneid recycled
6603                          */
6604                         zone_rele(zone);
6605                         break;
6606                 }
6607                 /*
6608                  * We could zone_status_timedwait(), but there doesn't seem to
6609                  * be much point in doing that (plus, it would mean that
6610                  * zone_free() isn't called until this thread exits).
6611                  */
6612                 zone_rele(zone);
6613                 delay(hz);
6614                 darg = save_arg;
6615         }
6616 out:
6617         if (door != NULL) {
6618                 zone_release_door(&door);
6619         }
6620         kmem_free(zone_name, zone_namelen);
6621         thread_exit();
6622 }
6623 
6624 /*
6625  * Entry point for uadmin() to tell the zone to go away or reboot.  Analogous
6626  * to kadmin().  The caller is a process in the zone.
6627  *
6628  * In order to shut down the zone, we will hand off control to zoneadmd
6629  * (running in the global zone) via a door.  We make a half-hearted attempt
6630  * at killing all processes in the zone, create a kernel thread to contact
6631  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6632  * a form of generation number used to let zoneadmd (as well as
6633  * zone_destroy()) know exactly which zone they're talking about.
6634  */
6635 int
6636 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6637 {
6638         struct zarg *zargp;
6639         zone_cmd_t zcmd;
6640         zone_t *zone;
6641 
6642         zone = curproc->p_zone;
6643         ASSERT(getzoneid() != GLOBAL_ZONEID);
6644 
6645         switch (cmd) {
6646         case A_SHUTDOWN:
6647                 switch (fcn) {
6648                 case AD_HALT:
6649                 case AD_POWEROFF:
6650                         zcmd = Z_HALT;
6651                         break;
6652                 case AD_BOOT:
6653                         zcmd = Z_REBOOT;
6654                         break;
6655                 case AD_IBOOT:
6656                 case AD_SBOOT:
6657                 case AD_SIBOOT:
6658                 case AD_NOSYNC:
6659                         return (ENOTSUP);
6660                 default:
6661                         return (EINVAL);
6662                 }
6663                 break;
6664         case A_REBOOT:
6665                 zcmd = Z_REBOOT;
6666                 break;
6667         case A_FTRACE:
6668         case A_REMOUNT:
6669         case A_FREEZE:
6670         case A_DUMP:
6671         case A_CONFIG:
6672                 return (ENOTSUP);
6673         default:
6674                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6675                 return (EINVAL);
6676         }
6677 
6678         if (secpolicy_zone_admin(credp, B_FALSE))
6679                 return (EPERM);
6680         mutex_enter(&zone_status_lock);
6681 
6682         /*
6683          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6684          * is in the zone.
6685          */
6686         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6687         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6688                 /*
6689                  * This zone is already on its way down.
6690                  */
6691                 mutex_exit(&zone_status_lock);
6692                 return (0);
6693         }
6694         /*
6695          * Prevent future zone_enter()s
6696          */
6697         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6698         mutex_exit(&zone_status_lock);
6699 
6700         /*
6701          * Kill everyone now and call zoneadmd later.
6702          * zone_ki_call_zoneadmd() will do a more thorough job of this
6703          * later.
6704          */
6705         killall(zone->zone_id);
6706         /*
6707          * Now, create the thread to contact zoneadmd and do the rest of the
6708          * work.  This thread can't be created in our zone, or
6709          * zone_destroy() would deadlock.
6710          */
6711         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6712         zargp->arg.cmd = zcmd;
6713         zargp->arg.uniqid = zone->zone_uniqid;
6714         zargp->zone = zone;
6715         (void) strcpy(zargp->arg.locale, "C");
6716         /* mdep was already copied in for us by uadmin */
6717         if (mdep != NULL)
6718                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6719                     sizeof (zargp->arg.bootbuf));
6720         zone_hold(zone);
6721 
6722         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6723             TS_RUN, minclsyspri);
6724         exit(CLD_EXITED, 0);
6725 
6726         return (EINVAL);
6727 }
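
     /*
      * A sketch of the flow above (not new behavior): a process inside a
      * zone that calls
      *
      *         (void) uadmin(A_SHUTDOWN, AD_BOOT, 0);
      *
      * arrives here with cmd == A_SHUTDOWN and fcn == AD_BOOT, so zcmd is
      * set to Z_REBOOT, everyone in the zone is killed, and the
      * zone_ki_call_zoneadmd() thread asks zoneadmd to reboot the zone.
      */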
6728 
6729 /*
6730  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6731  * status to ZONE_IS_SHUTTING_DOWN.
6732  *
6733  * This function also shuts down all running zones to ensure that they won't
6734  * fork new processes.
6735  */
6736 void
6737 zone_shutdown_global(void)
6738 {
6739         zone_t *current_zonep;
6740 
6741         ASSERT(INGLOBALZONE(curproc));
6742         mutex_enter(&zonehash_lock);
6743         mutex_enter(&zone_status_lock);
6744 
6745         /* Modify the global zone's status first. */
6746         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6747         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6748 
6749         /*
6750          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6751          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6752          * could cause assertions to fail (e.g., assertions about a zone's
6753          * state during initialization, readying, or booting) or produce races.
6754          * We'll let threads continue to initialize and ready new zones: they'll
6755          * fail to boot the new zones when they see that the global zone is
6756          * shutting down.
6757          */
6758         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6759             current_zonep = list_next(&zone_active, current_zonep)) {
6760                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6761                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6762         }
6763         mutex_exit(&zone_status_lock);
6764         mutex_exit(&zonehash_lock);
6765 }
6766 
6767 /*
6768  * Returns true if the named dataset is visible in the current zone.
6769  * The 'write' parameter is set to 1 if the dataset is also writable.
6770  */
6771 int
6772 zone_dataset_visible(const char *dataset, int *write)
6773 {
6774         static int zfstype = -1;
6775         zone_dataset_t *zd;
6776         size_t len;
6777         zone_t *zone = curproc->p_zone;
6778         const char *name = NULL;
6779         vfs_t *vfsp = NULL;
6780 
6781         if (dataset[0] == '\0')
6782                 return (0);
6783 
6784         /*
6785          * Walk the list once, looking for datasets which match exactly, or
6786          * specify a dataset underneath an exported dataset.  If found, return
6787          * true and note that it is writable.
6788          */
6789         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6790             zd = list_next(&zone->zone_datasets, zd)) {
6791 
6792                 len = strlen(zd->zd_dataset);
6793                 if (strlen(dataset) >= len &&
6794                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6795                     (dataset[len] == '\0' || dataset[len] == '/' ||
6796                     dataset[len] == '@')) {
6797                         if (write)
6798                                 *write = 1;
6799                         return (1);
6800                 }
6801         }
6802 
6803         /*
6804          * Walk the list a second time, searching for datasets which are parents
6805          * of exported datasets.  These should be visible, but read-only.
6806          *
6807          * Note that we also have to support forms such as 'pool/dataset/', with
6808          * a trailing slash.
6809          */
6810         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6811             zd = list_next(&zone->zone_datasets, zd)) {
6812 
6813                 len = strlen(dataset);
6814                 if (dataset[len - 1] == '/')
6815                         len--;  /* Ignore trailing slash */
6816                 if (len < strlen(zd->zd_dataset) &&
6817                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6818                     zd->zd_dataset[len] == '/') {
6819                         if (write)
6820                                 *write = 0;
6821                         return (1);
6822                 }
6823         }
6824 
6825         /*
6826          * We reach here if the given dataset is not found in the zone_dataset
6827          * list.  Check if this dataset was added as a filesystem (i.e. "add fs")
6828          * instead of being delegated.  To do so, we search for the dataset in
6829          * the zone_vfslist of this zone.  If found, return true and note that
6830          * it is not writable.
6831          */
6832 
6833         /*
6834          * Initialize zfstype if it is not initialized yet.
6835          */
6836         if (zfstype == -1) {
6837                 struct vfssw *vswp = vfs_getvfssw("zfs");
6838                 zfstype = vswp - vfssw;
6839                 vfs_unrefvfssw(vswp);
6840         }
6841 
6842         vfs_list_read_lock();
6843         vfsp = zone->zone_vfslist;
6844         do {
6845                 ASSERT(vfsp);
6846                 if (vfsp->vfs_fstype == zfstype) {
6847                         name = refstr_value(vfsp->vfs_resource);
6848 
6849                         /*
6850                          * Check if we have an exact match.
6851                          */
6852                         if (strcmp(dataset, name) == 0) {
6853                                 vfs_list_unlock();
6854                                 if (write)
6855                                         *write = 0;
6856                                 return (1);
6857                         }
6858                         /*
6859                          * We need to check if we are looking for parents of
6860                          * a dataset. These should be visible, but read-only.
6861                          */
6862                         len = strlen(dataset);
6863                         if (dataset[len - 1] == '/')
6864                                 len--;
6865 
6866                         if (len < strlen(name) &&
6867                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6868                                 vfs_list_unlock();
6869                                 if (write)
6870                                         *write = 0;
6871                                 return (1);
6872                         }
6873                 }
6874                 vfsp = vfsp->vfs_zone_next;
6875         } while (vfsp != zone->zone_vfslist);
6876 
6877         vfs_list_unlock();
6878         return (0);
6879 }
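
     /*
      * A worked example of the rules above: if "pool/export/zone1" (a
      * hypothetical dataset) has been delegated to the zone, then
      *
      *         "pool/export/zone1"             visible, writable (exact match)
      *         "pool/export/zone1/home"        visible, writable (underneath)
      *         "pool/export"                   visible, read-only (parent)
      *         "pool/other"                    not visible
      */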
6880 
6881 /*
6882  * zone_find_by_any_path() -
6883  *
6884  * kernel-private routine similar to zone_find_by_path(), but which
6885  * effectively compares against zone paths rather than zonerootpath
6886  * (i.e., the last component of the zonerootpath, which should be "root/",
6887  * is not compared).  This is done in order to accurately identify all
6888  * paths, whether zone-visible or not, including those which are parallel
6889  * to /root/, such as /dev/, /home/, etc...
6890  *
6891  * If the specified path does not fall under any zone path then the global
6892  * zone is returned.
6893  *
6894  * The treat_abs parameter indicates whether the path should be treated as
6895  * an absolute path even though it does not begin with "/".  (This supports
6896  * NFS mount syntax such as host:any/path.)
6897  *
6898  * The caller is responsible for zone_rele of the returned zone.
6899  */
6900 zone_t *
6901 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6902 {
6903         zone_t *zone;
6904         int path_offset = 0;
6905 
6906         if (path == NULL) {
6907                 zone_hold(global_zone);
6908                 return (global_zone);
6909         }
6910 
6911         if (*path != '/') {
6912                 ASSERT(treat_abs);
6913                 path_offset = 1;
6914         }
6915 
6916         mutex_enter(&zonehash_lock);
6917         for (zone = list_head(&zone_active); zone != NULL;
6918             zone = list_next(&zone_active, zone)) {
6919                 char    *c;
6920                 size_t  pathlen;
6921                 char *rootpath_start;
6922 
6923                 if (zone == global_zone)        /* skip global zone */
6924                         continue;
6925 
6926                 /* scan backwards to find start of last component */
6927                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6928                 do {
6929                         c--;
6930                 } while (*c != '/');
6931 
6932                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
6933                 rootpath_start = (zone->zone_rootpath + path_offset);
6934                 if (strncmp(path, rootpath_start, pathlen) == 0)
6935                         break;
6936         }
6937         if (zone == NULL)
6938                 zone = global_zone;
6939         zone_hold(zone);
6940         mutex_exit(&zonehash_lock);
6941         return (zone);
6942 }
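
     /*
      * For example, given a zone whose rootpath is "/zones/foo/root/" (a
      * hypothetical configuration), any path beginning with "/zones/foo/"
      * matches: "/zones/foo/root/etc" and "/zones/foo/dev" both resolve to
      * that zone, while a path under no zone's directory (e.g.
      * "/export/home") falls through to the global zone.
      */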
6943 
6944 /*
6945  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6946  * zone_dl_t pointer if found, and NULL otherwise.
6947  */
6948 static zone_dl_t *
6949 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6950 {
6951         zone_dl_t *zdl;
6952 
6953         ASSERT(mutex_owned(&zone->zone_lock));
6954         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6955             zdl = list_next(&zone->zone_dl_list, zdl)) {
6956                 if (zdl->zdl_id == linkid)
6957                         break;
6958         }
6959         return (zdl);
6960 }
6961 
6962 static boolean_t
6963 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6964 {
6965         boolean_t exists;
6966 
6967         mutex_enter(&zone->zone_lock);
6968         exists = (zone_find_dl(zone, linkid) != NULL);
6969         mutex_exit(&zone->zone_lock);
6970         return (exists);
6971 }
6972 
6973 /*
6974  * Add a datalink ID to the zone.
6975  */
6976 static int
6977 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6978 {
6979         zone_dl_t *zdl;
6980         zone_t *zone;
6981         zone_t *thiszone;
6982 
6983         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6984                 return (set_errno(ENXIO));
6985 
6986         /* Verify that the datalink ID doesn't already belong to a zone. */
6987         mutex_enter(&zonehash_lock);
6988         for (zone = list_head(&zone_active); zone != NULL;
6989             zone = list_next(&zone_active, zone)) {
6990                 if (zone_dl_exists(zone, linkid)) {
6991                         mutex_exit(&zonehash_lock);
6992                         zone_rele(thiszone);
6993                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6994                 }
6995         }
6996 
6997         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6998         zdl->zdl_id = linkid;
6999         zdl->zdl_net = NULL;
7000         mutex_enter(&thiszone->zone_lock);
7001         list_insert_head(&thiszone->zone_dl_list, zdl);
7002         mutex_exit(&thiszone->zone_lock);
7003         mutex_exit(&zonehash_lock);
7004         zone_rele(thiszone);
7005         return (0);
7006 }
7007 
7008 static int
7009 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7010 {
7011         zone_dl_t *zdl;
7012         zone_t *zone;
7013         int err = 0;
7014 
7015         if ((zone = zone_find_by_id(zoneid)) == NULL)
7016                 return (set_errno(EINVAL));
7017 
7018         mutex_enter(&zone->zone_lock);
7019         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7020                 err = ENXIO;
7021         } else {
7022                 list_remove(&zone->zone_dl_list, zdl);
7023                 nvlist_free(zdl->zdl_net);
7024                 kmem_free(zdl, sizeof (zone_dl_t));
7025         }
7026         mutex_exit(&zone->zone_lock);
7027         zone_rele(zone);
7028         return (err == 0 ? 0 : set_errno(err));
7029 }
7030 
7031 /*
7032  * If *zoneidp is ALL_ZONES, look up which zone has been assigned the linkid
7033  * and return that zone's ID in *zoneidp.  Otherwise just check whether the
7034  * specified zone has been assigned the supplied linkid.
7035  */
7036 int
7037 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7038 {
7039         zone_t *zone;
7040         int err = ENXIO;
7041 
7042         if (*zoneidp != ALL_ZONES) {
7043                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7044                         if (zone_dl_exists(zone, linkid))
7045                                 err = 0;
7046                         zone_rele(zone);
7047                 }
7048                 return (err);
7049         }
7050 
7051         mutex_enter(&zonehash_lock);
7052         for (zone = list_head(&zone_active); zone != NULL;
7053             zone = list_next(&zone_active, zone)) {
7054                 if (zone_dl_exists(zone, linkid)) {
7055                         *zoneidp = zone->zone_id;
7056                         err = 0;
7057                         break;
7058                 }
7059         }
7060         mutex_exit(&zonehash_lock);
7061         return (err);
7062 }
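
     /*
      * For example, an in-kernel caller that wants to discover which zone
      * owns a link ("linkid" below is a placeholder) primes *zoneidp with
      * ALL_ZONES:
      *
      *         zoneid_t owner = ALL_ZONES;
      *
      *         if (zone_check_datalink(&owner, linkid) == 0)
      *                 (owner now holds the assigned zone's ID)
      */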
7063 
7064 /*
7065  * Get the list of datalink IDs assigned to a zone.
7066  *
7067  * On input, *nump is the number of datalink IDs that can fit in the supplied
7068  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7069  * that were placed in the array if the array was large enough, or to the
7070  * number of datalink IDs that the function needs to place in the array if the
7071  * array is too small.
7072  */
7073 static int
7074 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7075 {
7076         uint_t num, dlcount;
7077         zone_t *zone;
7078         zone_dl_t *zdl;
7079         datalink_id_t *idptr = idarray;
7080 
7081         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7082                 return (set_errno(EFAULT));
7083         if ((zone = zone_find_by_id(zoneid)) == NULL)
7084                 return (set_errno(ENXIO));
7085 
7086         num = 0;
7087         mutex_enter(&zone->zone_lock);
7088         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7089             zdl = list_next(&zone->zone_dl_list, zdl)) {
7090                 /*
7091                  * If the list is bigger than what the caller supplied, just
7092                  * count, don't do copyout.
7093                  */
7094                 if (++num > dlcount)
7095                         continue;
7096                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7097                         mutex_exit(&zone->zone_lock);
7098                         zone_rele(zone);
7099                         return (set_errno(EFAULT));
7100                 }
7101                 idptr++;
7102         }
7103         mutex_exit(&zone->zone_lock);
7104         zone_rele(zone);
7105 
7106         /* Whether the count went up or down, the caller should be notified. */
7107         if (num != dlcount) {
7108                 if (copyout(&num, nump, sizeof (num)) != 0)
7109                         return (set_errno(EFAULT));
7110         }
7111         return (0);
7112 }
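
     /*
      * A minimal sketch of the sizing handshake from the caller's side (the
      * userland wrapper name and error handling are assumptions):
      *
      *         int n = 0;
      *         datalink_id_t *ids;
      *
      *         (void) zone_list_datalink(zid, &n, NULL);   (learn the count)
      *         ids = malloc(n * sizeof (datalink_id_t));
      *         (void) zone_list_datalink(zid, &n, ids);    (fetch the IDs)
      */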
7113 
7114 /*
7115  * Public interface for looking up a zone by zoneid.  It's a customized
7116  * version for netstack_zone_create(), and may only be called from the ZSD
7117  * create callbacks: it doesn't take a reference on the zone structure, so
7118  * if it were called elsewhere the zone could disappear once zonehash_lock
7119  * is dropped.
7120  *
7121  * Furthermore it
7122  * 1. Doesn't check the status of the zone.
7123  * 2. May be called even before zone_init(); in that case the address of
7124  *    zone0 is returned directly, and netstack_zone_create() will only
7125  *    assign a value to zone0.zone_netstack, which won't break anything.
7126  * 3. Returns without the zone being held.
7127  */
7128 zone_t *
7129 zone_find_by_id_nolock(zoneid_t zoneid)
7130 {
7131         zone_t *zone;
7132 
7133         mutex_enter(&zonehash_lock);
7134         if (zonehashbyid == NULL)
7135                 zone = &zone0;
7136         else
7137                 zone = zone_find_all_by_id(zoneid);
7138         mutex_exit(&zonehash_lock);
7139         return (zone);
7140 }
7141 
7142 /*
7143  * Walk the datalinks for a given zone
7144  */
7145 int
7146 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7147     void *data)
7148 {
7149         zone_t          *zone;
7150         zone_dl_t       *zdl;
7151         datalink_id_t   *idarray;
7152         uint_t          idcount = 0;
7153         int             i, ret = 0;
7154 
7155         if ((zone = zone_find_by_id(zoneid)) == NULL)
7156                 return (ENOENT);
7157 
7158         /*
7159          * We first build an array of linkids so that we can walk these and
7160          * execute the callback with the zone_lock dropped.
7161          */
7162         mutex_enter(&zone->zone_lock);
7163         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7164             zdl = list_next(&zone->zone_dl_list, zdl)) {
7165                 idcount++;
7166         }
7167 
7168         if (idcount == 0) {
7169                 mutex_exit(&zone->zone_lock);
7170                 zone_rele(zone);
7171                 return (0);
7172         }
7173 
7174         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7175         if (idarray == NULL) {
7176                 mutex_exit(&zone->zone_lock);
7177                 zone_rele(zone);
7178                 return (ENOMEM);
7179         }
7180 
7181         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7182             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7183                 idarray[i] = zdl->zdl_id;
7184         }
7185 
7186         mutex_exit(&zone->zone_lock);
7187 
7188         for (i = 0; i < idcount && ret == 0; i++) {
7189                 if ((ret = (*cb)(idarray[i], data)) != 0)
7190                         break;
7191         }
7192 
7193         zone_rele(zone);
7194         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7195         return (ret);
7196 }
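
     /*
      * A minimal sketch of a walker callback (print_linkid is hypothetical):
      *
      *         static int
      *         print_linkid(datalink_id_t linkid, void *arg)
      *         {
      *                 cmn_err(CE_NOTE, "link %u", linkid);
      *                 return (0);     (nonzero would stop the walk)
      *         }
      *
      *         (void) zone_datalink_walk(zoneid, print_linkid, NULL);
      */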
7197 
7198 static char *
7199 zone_net_type2name(int type)
7200 {
7201         switch (type) {
7202         case ZONE_NETWORK_ADDRESS:
7203                 return (ZONE_NET_ADDRNAME);
7204         case ZONE_NETWORK_DEFROUTER:
7205                 return (ZONE_NET_RTRNAME);
7206         default:
7207                 return (NULL);
7208         }
7209 }
7210 
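     /*
      * Set a network configuration attribute (an address or default router,
      * per zone_net_type2name()) for a datalink assigned to a non-global
      * zone; the value is recorded on the zone_dl_t as an nvlist entry.
      */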
7211 static int
7212 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7213 {
7214         zone_t *zone;
7215         zone_dl_t *zdl;
7216         nvlist_t *nvl;
7217         int err = 0;
7218         uint8_t *new = NULL;
7219         char *nvname;
7220         int bufsize;
7221         datalink_id_t linkid = znbuf->zn_linkid;
7222 
7223         if (secpolicy_zone_config(CRED()) != 0)
7224                 return (set_errno(EPERM));
7225 
7226         if (zoneid == GLOBAL_ZONEID)
7227                 return (set_errno(EINVAL));
7228 
7229         nvname = zone_net_type2name(znbuf->zn_type);
7230         bufsize = znbuf->zn_len;
7231         new = znbuf->zn_val;
7232         if (nvname == NULL)
7233                 return (set_errno(EINVAL));
7234 
7235         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7236                 return (set_errno(EINVAL));
7237         }
7238 
7239         mutex_enter(&zone->zone_lock);
7240         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7241                 err = ENXIO;
7242                 goto done;
7243         }
7244         if ((nvl = zdl->zdl_net) == NULL) {
7245                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7246                         err = ENOMEM;
7247                         goto done;
7248                 } else {
7249                         zdl->zdl_net = nvl;
7250                 }
7251         }
7252         if (nvlist_exists(nvl, nvname)) {
7253                 err = EINVAL;
7254                 goto done;
7255         }
7256         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7257         ASSERT(err == 0);
7258 done:
7259         mutex_exit(&zone->zone_lock);
7260         zone_rele(zone);
7261         if (err != 0)
7262                 return (set_errno(err));
7263         else
7264                 return (0);
7265 }
7266 
7267 static int
7268 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7269 {
7270         zone_t *zone;
7271         zone_dl_t *zdl;
7272         nvlist_t *nvl;
7273         uint8_t *ptr;
7274         uint_t psize;
7275         int err = 0;
7276         char *nvname;
7277         int bufsize;
7278         void *buf;
7279         datalink_id_t linkid = znbuf->zn_linkid;
7280 
7281         if (zoneid == GLOBAL_ZONEID)
7282                 return (set_errno(EINVAL));
7283 
7284         nvname = zone_net_type2name(znbuf->zn_type);
7285         bufsize = znbuf->zn_len;
7286         buf = znbuf->zn_val;
7287 
7288         if (nvname == NULL)
7289                 return (set_errno(EINVAL));
7290         if ((zone = zone_find_by_id(zoneid)) == NULL)
7291                 return (set_errno(EINVAL));
7292 
7293         mutex_enter(&zone->zone_lock);
7294         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7295                 err = ENXIO;
7296                 goto done;
7297         }
7298         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7299                 err = ENOENT;
7300                 goto done;
7301         }
7302         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7303         ASSERT(err == 0);
7304 
7305         if (psize > bufsize) {
7306                 err = ENOBUFS;
7307                 goto done;
7308         }
7309         znbuf->zn_len = psize;
7310         bcopy(ptr, buf, psize);
7311 done:
7312         mutex_exit(&zone->zone_lock);
7313         zone_rele(zone);
7314         if (err != 0)
7315                 return (set_errno(err));
7316         else
7317                 return (0);
7318 }
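
     /*
      * A note on the buffer-sizing contract above (field names are those of
      * zone_net_data_t): the caller fills in zn_linkid, sets zn_type to
      * ZONE_NETWORK_ADDRESS or ZONE_NETWORK_DEFROUTER, and sets zn_len to
      * the capacity of zn_val.  On success zn_len is rewritten to the actual
      * payload size; if the stored value is larger than the supplied buffer,
      * ENOBUFS is returned and nothing is copied out.
      */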