/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015, Joyent Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: initialization is complete except that the ZSD
 *   constructor callbacks have not yet run.  It is not possible to enter
 *   the zone, but attributes can be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
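 *
 *   For example, a thread that must not proceed until a given zone has
 *   finished booting might block as follows (an illustrative sketch,
 *   not a verbatim excerpt from this file; the caller is assumed to
 *   hold a reference on the zone):
 *
 *       zone_status_wait(zone, ZONE_IS_RUNNING);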
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
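 *
 *   A typical consumer registers a key and callbacks once, then uses
 *   the key for lookups.  A minimal sketch, in which the foo_* names
 *   are hypothetical and not part of this file:
 *
 *       static zone_key_t foo_zone_key;
 *
 *       static void *
 *       foo_zone_init(zoneid_t zoneid)
 *       {
 *               return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP));
 *       }
 *
 *       static void
 *       foo_zone_fini(zoneid_t zoneid, void *data)
 *       {
 *               kmem_free(data, sizeof (foo_zone_data_t));
 *       }
 *
 *       zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 *       ...
 *       foo_zone_data_t *fzd = zone_getspecific(foo_zone_key, zone);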
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
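 *
 *   The hold implied by a successful lookup must be balanced with a
 *   zone_rele() once the caller is done with the zone, e.g.
 *   (illustrative sketch):
 *
 *       zone_t *zone;
 *
 *       if ((zone = zone_find_by_id(zoneid)) != NULL) {
 *               ... operate on the held zone ...
 *               zone_rele(zone);
 *       }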
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just zone.max-lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *       zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *       zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
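 *
 *   For example, code that needs both the zone list and a zone's state
 *   honors the ordering above by acquiring zonehash_lock before
 *   zone_status_lock (an illustrative sketch):
 *
 *       mutex_enter(&zonehash_lock);
 *       mutex_enter(&zone_status_lock);
 *       ... examine the zone while both locks are held ...
 *       mutex_exit(&zone_status_lock);
 *       mutex_exit(&zonehash_lock);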
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 *   A sketch of how these are consumed from userland follows this
 *   comment.
 */
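
/*
 * For instance, a userland consumer might enumerate the running zones
 * through the libc wrappers over these syscalls (an illustrative
 * sketch against <zone.h>; error handling is omitted and the in/out
 * semantics of the count argument are assumed):
 *
 *       uint_t nzones = 0;
 *       zoneid_t *ids;
 *
 *       (void) zone_list(NULL, &nzones);
 *       ids = malloc(nzones * sizeof (zoneid_t));
 *       (void) zone_list(ids, &nzones);
 */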

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created/destroyed such
 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong
 * zone's mount list.  Since a zone can't reside on an NFS file system,
 * we don't have to worry about the zonepath itself.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone. This synchronization is on a per-zone basis, so
 * activity for one zone will not interfere with activity for another zone.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone state transitions, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is an unfair bias towards the
 * "current" operation.  This means that zone halt may starve if
 * there is a rapid succession of new mounts coming in to the zone.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(zone_t *zp)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress > 0) {
                if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers
         * (remotely possible if two threads enter zone_shutdown at the same
         * time).
         */
        zp->zone_mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&zp->zone_mount_lock);
        return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (++zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * The VFS layer is busy with a mount; this zone should wait until all
 * of its mounts are completed to progress.
 */
void
mount_in_progress(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress < 0)
                cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
        zp->zone_mounts_in_progress++;
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (--zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}
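
/*
 * Taken together, a zone state transition that must exclude new mounts
 * brackets its work with block_mounts() and resume_mounts(), along the
 * lines of this illustrative sketch (the pattern zone_shutdown() uses;
 * not a verbatim excerpt, and error handling is elided):
 *
 *       if (block_mounts(zone) == 0)
 *               return (EINTR);        (interrupted by a signal)
 *       ... perform the state transition ...
 *       resume_mounts(zone);
 */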

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys. Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock, and zsd_flags is used to ensure that by the time
 * zone_key_create() (and zone_create()) or zone_key_delete() (and
 * zone_destroy()) returns, all the necessary callbacks have completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking, as sketched below.
 */
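
/*
 * For example, a consumer wanting to initialize its zone-specific data
 * exactly once on first use would wrap the lookup and update in its own
 * lock.  A minimal sketch, where foo_lock, foo_key and foo_alloc() are
 * hypothetical names, not part of this file:
 *
 *       mutex_enter(&foo_lock);
 *       if ((data = zone_getspecific(foo_key, zone)) == NULL) {
 *               data = foo_alloc();
 *               (void) zone_setspecific(foo_key, zone, data);
 *       }
 *       mutex_exit(&foo_lock);
 */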

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t  key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * zone_zsd_configure() already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now; make it
         * globally visible.  Specifically, zone_getspecific() will
         * always successfully return the zone specific data associated
         * with the key.
         */
        *keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
        struct zsd_entry *zsdp = NULL;
        zone_t *zone;

        mutex_enter(&zsd_key_lock);
        zsdp = zsd_find_mru(&zsd_registered_keys, key);
        if (zsdp == NULL) {
                mutex_exit(&zsd_key_lock);
                return (-1);
        }
        list_remove(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find_mru(&zone->zone_zsd, key);
                if (del == NULL) {
                        /*
                         * Somebody else got here first, e.g. the zone going
                         * away.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
                ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                if (del->zsd_shutdown != NULL &&
                    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                        del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(zsd__shutdown__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                if (del->zsd_destroy != NULL &&
                    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                        del->zsd_flags |= ZSD_DESTROY_NEEDED;
                        DTRACE_PROBE2(zsd__destroy__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);
        kmem_free(zsdp, sizeof (*zsdp));

        /* Now call the shutdown and destroy callback for this key */
        zsd_apply_all_zones(zsd_apply_shutdown, key);
        zsd_apply_all_zones(zsd_apply_destroy, key);

        /* Now we can free up the zsdp structures in each zone */
        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find(&zone->zone_zsd, key);
                if (del != NULL) {
                        list_remove(&zone->zone_zsd, del);
                        ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
                        kmem_free(del, sizeof (*del));
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        return (0);
}
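
/*
 * A module that registered a key typically deletes it on its unload
 * path, after which no callbacks for that key will run.  A minimal
 * sketch, where foo_zone_key is a hypothetical key registered earlier
 * via zone_key_create():
 *
 *       (void) zone_key_delete(foo_zone_key);
 */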

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        if (t != NULL) {
                /*
                 * Replace old value with new
                 */
                t->zsd_data = (void *)data;
                mutex_exit(&zone->zone_lock);
                return (0);
        }
        mutex_exit(&zone->zone_lock);
        return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
        struct zsd_entry *t;
        void *data;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        data = (t == NULL ? NULL : t->zsd_data);
        mutex_exit(&zone->zone_lock);
        return (data);
}

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys). The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;

        ASSERT(MUTEX_HELD(&zonehash_lock));
        ASSERT(list_head(&zone->zone_zsd) == NULL);
        mutex_enter(&zone->zone_lock);
        mutex_enter(&zsd_key_lock);
        for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
            zsdp = list_next(&zsd_registered_keys, zsdp)) {
                /*
                 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
                 * should not have added anything to it.
                 */
                ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = zsdp->zsd_key;
                t->zsd_create = zsdp->zsd_create;
                t->zsd_shutdown = zsdp->zsd_shutdown;
                t->zsd_destroy = zsdp->zsd_destroy;
                if (zsdp->zsd_create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, zsdp->zsd_key);
                }
                list_insert_tail(&zone->zone_zsd, t);
        }
        mutex_exit(&zsd_key_lock);
        mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
        struct zsd_entry *t;

        ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
        ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
        ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

        /*
         * Run the callback solely based on what is registered for the zone
         * in zone_zsd. The global list can change independently of this
         * as keys are registered and unregistered and we don't register new
         * callbacks for a zone that is in the process of going away.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL;
            t = list_next(&zone->zone_zsd, t)) {
                zone_key_t key = t->zsd_key;

                /* Skip if no callbacks registered */

                if (ct == ZSD_SHUTDOWN) {
                        if (t->zsd_shutdown != NULL &&
                            (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                                t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                                DTRACE_PROBE2(zsd__shutdown__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                } else {
                        if (t->zsd_destroy != NULL &&
                            (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                                t->zsd_flags |= ZSD_DESTROY_NEEDED;
                                DTRACE_PROBE2(zsd__destroy__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                }
        }
        mutex_exit(&zone->zone_lock);

        /* Now call the shutdown and destroy callback for this key */
        zsd_apply_all_keys(zsd_apply_shutdown, zone);
        zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
        struct zsd_entry *t, *next;

        /*
         * Free all the zsd_entry's we had on this zone.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
                next = list_next(&zone->zone_zsd, t);
                list_remove(&zone->zone_zsd, t);
                ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_zsd);
        mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2) iterations.
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        zone = list_head(&zone_active);
        while (zone != NULL) {
                if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
                        /* Lock dropped - restart at head */
                        zone = list_head(&zone_active);
                } else {
                        zone = list_next(&zone_active, zone);
                }
        }
        mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards. This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2) iterations.
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = list_head(&zone->zone_zsd);
        while (t != NULL) {
                if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
                        /* Lock dropped - restart at head */
                        t = list_head(&zone->zone_zsd);
                } else {
                        t = list_next(&zone->zone_zsd, t);
                }
        }
        mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        void *result;
        struct zsd_entry *t;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                t->zsd_flags &= ~ZSD_CREATE_NEEDED;
                t->zsd_flags |= ZSD_CREATE_INPROGRESS;
                DTRACE_PROBE2(zsd__create__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);

                dropped = B_TRUE;
                ASSERT(t->zsd_create != NULL);
                DTRACE_PROBE2(zsd__create__start,
                    zone_t *, zone, zone_key_t, key);

                result = (*t->zsd_create)(zone->zone_id);

                DTRACE_PROBE2(zsd__create__end,
                    zone_t *, zone, void *, result);

                ASSERT(result != NULL);
                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = result;
                t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
                t->zsd_flags |= ZSD_CREATE_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__create__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
                t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(zsd__shutdown__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_shutdown != NULL);
                data = t->zsd_data;

                DTRACE_PROBE2(zsd__shutdown__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_shutdown)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__shutdown__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
                t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__shutdown__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
                t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
                DTRACE_PROBE2(zsd__destroy__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_destroy != NULL);
                data = t->zsd_data;
                DTRACE_PROBE2(zsd__destroy__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_destroy)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__destroy__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = NULL;
                t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
                t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__destroy__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_CREATE_NEEDED) {
                DTRACE_PROBE2(zsd__wait__for__creator,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
                DTRACE_PROBE2(zsd__wait__for__inprogress,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
        zone_dataset_t *t, *next;

        for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
                next = list_next(&zone->zone_datasets, t);
                list_remove(&zone->zone_datasets, t);
                kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);

        e->rcep_p.zone->zone_shares = nv;
        return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};

/*
 * zone.cpu-cap resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (cpucaps_zone_get(p->p_zone));
}

/*ARGSUSED*/
static int
zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        zone_t *zone = e->rcep_p.zone;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);

        if (zone == NULL)
                return (0);

        /*
         * set cap to the new value.
         */
        return (cpucaps_zone_set(zone, nv));
}

static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,
        rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
        rctl_qty_t nlwps;
        zone_t *zone = p->p_zone;

        ASSERT(MUTEX_HELD(&p->p_lock));

        mutex_enter(&zone->zone_nlwps_lock);
        nlwps = zone->zone_nlwps;
        mutex_exit(&zone->zone_nlwps_lock);

        return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
        rctl_qty_t nlwps;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);
        ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
        nlwps = e->rcep_p.zone->zone_nlwps;

        if (nlwps + incr > rcntl->rcv_value)
                return (1);

        return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);
        e->rcep_p.zone->zone_nlwps_ctl = nv;
        return (0);
}

static rctl_ops_t zone_lwps_ops = {
        rcop_no_action,
        zone_lwps_usage,
        zone_lwps_set,
        zone_lwps_test,
};

/*ARGSUSED*/
static rctl_qty_t
zone_procs_usage(rctl_t *r, proc_t *p)
{
        rctl_qty_t nprocs;
        zone_t *zone = p->p_zone;

        ASSERT(MUTEX_HELD(&p->p_lock));

        mutex_enter(&zone->zone_nlwps_lock);
        nprocs = zone->zone_nprocs;
        mutex_exit(&zone->zone_nlwps_lock);

        return (nprocs);
}

/*ARGSUSED*/
static int
zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
        rctl_qty_t nprocs;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);
        ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
        nprocs = e->rcep_p.zone->zone_nprocs;

        if (nprocs + incr > rcntl->rcv_value)
                return (1);

        return (0);
}

/*ARGSUSED*/
static int
zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);
        e->rcep_p.zone->zone_nprocs_ctl = nv;
        return (0);
}

static rctl_ops_t zone_procs_ops = {
        rcop_no_action,
        zone_procs_usage,
        zone_procs_set,
        zone_procs_test,
};

/*ARGSUSED*/
static rctl_qty_t
zone_shmmax_usage(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (p->p_zone->zone_shmmax);
}

/*ARGSUSED*/
static int
zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
        rctl_qty_t v;
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        v = e->rcep_p.zone->zone_shmmax + incr;
        if (v > rval->rcv_value)
                return (1);
        return (0);
}

static rctl_ops_t zone_shmmax_ops = {
        rcop_no_action,
        zone_shmmax_usage,
        rcop_no_set,
        zone_shmmax_test
};
1518 
1519 /*ARGSUSED*/
1520 static rctl_qty_t
1521 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 {
1523         ASSERT(MUTEX_HELD(&p->p_lock));
1524         return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 }
1526 
1527 /*ARGSUSED*/
1528 static int
1529 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530     rctl_qty_t incr, uint_t flags)
1531 {
1532         rctl_qty_t v;
1533         ASSERT(MUTEX_HELD(&p->p_lock));
1534         ASSERT(e->rcep_t == RCENTITY_ZONE);
1535         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536         if (v > rval->rcv_value)
1537                 return (1);
1538         return (0);
1539 }
1540 
1541 static rctl_ops_t zone_shmmni_ops = {
1542         rcop_no_action,
1543         zone_shmmni_usage,
1544         rcop_no_set,
1545         zone_shmmni_test
1546 };
1547 
1548 /*ARGSUSED*/
1549 static rctl_qty_t
1550 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 {
1552         ASSERT(MUTEX_HELD(&p->p_lock));
1553         return (p->p_zone->zone_ipc.ipcq_semmni);
1554 }
1555 
1556 /*ARGSUSED*/
1557 static int
1558 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559     rctl_qty_t incr, uint_t flags)
1560 {
1561         rctl_qty_t v;
1562         ASSERT(MUTEX_HELD(&p->p_lock));
1563         ASSERT(e->rcep_t == RCENTITY_ZONE);
1564         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565         if (v > rval->rcv_value)
1566                 return (1);
1567         return (0);
1568 }
1569 
1570 static rctl_ops_t zone_semmni_ops = {
1571         rcop_no_action,
1572         zone_semmni_usage,
1573         rcop_no_set,
1574         zone_semmni_test
1575 };
1576 
1577 /*ARGSUSED*/
1578 static rctl_qty_t
1579 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 {
1581         ASSERT(MUTEX_HELD(&p->p_lock));
1582         return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 }
1584 
1585 /*ARGSUSED*/
1586 static int
1587 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588     rctl_qty_t incr, uint_t flags)
1589 {
1590         rctl_qty_t v;
1591         ASSERT(MUTEX_HELD(&p->p_lock));
1592         ASSERT(e->rcep_t == RCENTITY_ZONE);
1593         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594         if (v > rval->rcv_value)
1595                 return (1);
1596         return (0);
1597 }
1598 
1599 static rctl_ops_t zone_msgmni_ops = {
1600         rcop_no_action,
1601         zone_msgmni_usage,
1602         rcop_no_set,
1603         zone_msgmni_test
1604 };
1605 
1606 /*ARGSUSED*/
1607 static rctl_qty_t
1608 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 {
1610         rctl_qty_t q;
1611         ASSERT(MUTEX_HELD(&p->p_lock));
1612         mutex_enter(&p->p_zone->zone_mem_lock);
1613         q = p->p_zone->zone_locked_mem;
1614         mutex_exit(&p->p_zone->zone_mem_lock);
1615         return (q);
1616 }
1617 
1618 /*ARGSUSED*/
1619 static int
1620 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 {
1623         rctl_qty_t q;
1624         zone_t *z;
1625 
1626         z = e->rcep_p.zone;
1627         ASSERT(MUTEX_HELD(&p->p_lock));
1628         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629         q = z->zone_locked_mem;
1630         if (q + incr > rcntl->rcv_value)
1631                 return (1);
1632         return (0);
1633 }
1634 
1635 /*ARGSUSED*/
1636 static int
1637 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638     rctl_qty_t nv)
1639 {
1640         ASSERT(MUTEX_HELD(&p->p_lock));
1641         ASSERT(e->rcep_t == RCENTITY_ZONE);
1642         if (e->rcep_p.zone == NULL)
1643                 return (0);
1644         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645         return (0);
1646 }
1647 
1648 static rctl_ops_t zone_locked_mem_ops = {
1649         rcop_no_action,
1650         zone_locked_mem_usage,
1651         zone_locked_mem_set,
1652         zone_locked_mem_test
1653 };
1654 
1655 /*ARGSUSED*/
1656 static rctl_qty_t
1657 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 {
1659         rctl_qty_t q;
1660         zone_t *z = p->p_zone;
1661 
1662         ASSERT(MUTEX_HELD(&p->p_lock));
1663         mutex_enter(&z->zone_mem_lock);
1664         q = z->zone_max_swap;
1665         mutex_exit(&z->zone_mem_lock);
1666         return (q);
1667 }
1668 
1669 /*ARGSUSED*/
1670 static int
1671 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 {
1674         rctl_qty_t q;
1675         zone_t *z;
1676 
1677         z = e->rcep_p.zone;
1678         ASSERT(MUTEX_HELD(&p->p_lock));
1679         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680         q = z->zone_max_swap;
1681         if (q + incr > rcntl->rcv_value)
1682                 return (1);
1683         return (0);
1684 }
1685 
1686 /*ARGSUSED*/
1687 static int
1688 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689     rctl_qty_t nv)
1690 {
1691         ASSERT(MUTEX_HELD(&p->p_lock));
1692         ASSERT(e->rcep_t == RCENTITY_ZONE);
1693         if (e->rcep_p.zone == NULL)
1694                 return (0);
1695         e->rcep_p.zone->zone_max_swap_ctl = nv;
1696         return (0);
1697 }
1698 
1699 static rctl_ops_t zone_max_swap_ops = {
1700         rcop_no_action,
1701         zone_max_swap_usage,
1702         zone_max_swap_set,
1703         zone_max_swap_test
1704 };
1705 
1706 /*ARGSUSED*/
1707 static rctl_qty_t
1708 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 {
1710         rctl_qty_t q;
1711         zone_t *z = p->p_zone;
1712 
1713         ASSERT(MUTEX_HELD(&p->p_lock));
1714         mutex_enter(&z->zone_rctl_lock);
1715         q = z->zone_max_lofi;
1716         mutex_exit(&z->zone_rctl_lock);
1717         return (q);
1718 }
1719 
1720 /*ARGSUSED*/
1721 static int
1722 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 {
1725         rctl_qty_t q;
1726         zone_t *z;
1727 
1728         z = e->rcep_p.zone;
1729         ASSERT(MUTEX_HELD(&p->p_lock));
1730         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731         q = z->zone_max_lofi;
1732         if (q + incr > rcntl->rcv_value)
1733                 return (1);
1734         return (0);
1735 }
1736 
1737 /*ARGSUSED*/
1738 static int
1739 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740     rctl_qty_t nv)
1741 {
1742         ASSERT(MUTEX_HELD(&p->p_lock));
1743         ASSERT(e->rcep_t == RCENTITY_ZONE);
1744         if (e->rcep_p.zone == NULL)
1745                 return (0);
1746         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747         return (0);
1748 }
1749 
1750 static rctl_ops_t zone_max_lofi_ops = {
1751         rcop_no_action,
1752         zone_max_lofi_usage,
1753         zone_max_lofi_set,
1754         zone_max_lofi_test
1755 };
1756 
1757 /*
1758  * Helper function to stamp the zone with a unique ID; unrelated to
      * zone branding (see zone_set_brand()).
1759  */
1760 static void
1761 zone_uniqid(zone_t *zone)
1762 {
1763         static uint64_t uniqid = 0;
1764 
1765         ASSERT(MUTEX_HELD(&zonehash_lock));
1766         zone->zone_uniqid = uniqid++;
1767 }
1768 
1769 /*
1770  * Returns a held pointer to the "kcred" for the specified zone.
1771  */
1772 struct cred *
1773 zone_get_kcred(zoneid_t zoneid)
1774 {
1775         zone_t *zone;
1776         cred_t *cr;
1777 
1778         if ((zone = zone_find_by_id(zoneid)) == NULL)
1779                 return (NULL);
1780         cr = zone->zone_kcred;
1781         crhold(cr);
1782         zone_rele(zone);
1783         return (cr);
1784 }
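
     /*
      * Illustrative caller sketch (an addition, not in the original source):
      * the cred returned above is held, so a caller releases it with
      * crfree() when finished:
      *
      *         cred_t *cr;
      *
      *         if ((cr = zone_get_kcred(zoneid)) != NULL) {
      *                 ... act using the zone's kcred ...
      *                 crfree(cr);
      *         }
      */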
1785 
1786 static int
1787 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 {
1789         zone_t *zone = ksp->ks_private;
1790         zone_kstat_t *zk = ksp->ks_data;
1791 
1792         if (rw == KSTAT_WRITE)
1793                 return (EACCES);
1794 
1795         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797         return (0);
1798 }
1799 
1800 static int
1801 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 {
1803         zone_t *zone = ksp->ks_private;
1804         zone_kstat_t *zk = ksp->ks_data;
1805 
1806         if (rw == KSTAT_WRITE)
1807                 return (EACCES);
1808 
1809         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811         return (0);
1812 }
1813 
1814 static int
1815 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 {
1817         zone_t *zone = ksp->ks_private;
1818         zone_kstat_t *zk = ksp->ks_data;
1819 
1820         if (rw == KSTAT_WRITE)
1821                 return (EACCES);
1822 
1823         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825         return (0);
1826 }
1827 
1828 static kstat_t *
1829 zone_kstat_create_common(zone_t *zone, char *name,
1830     int (*updatefunc) (kstat_t *, int))
1831 {
1832         kstat_t *ksp;
1833         zone_kstat_t *zk;
1834 
1835         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837             KSTAT_FLAG_VIRTUAL);
1838 
1839         if (ksp == NULL)
1840                 return (NULL);
1841 
1842         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848         ksp->ks_update = updatefunc;
1849         ksp->ks_private = zone;
1850         kstat_install(ksp);
1851         return (ksp);
1852 }
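
     /*
      * Editorial note (an assumption about naming, not stated in this file):
      * rctl_kstat_create_zone() publishes these under the "caps" kstat
      * module, keyed by zone, so the triple created above surfaces as e.g.
      * caps:0:lockedmem_zone_0 with "usage" and "value" statistics.
      */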
1853
1855 static int
1856 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 {
1858         zone_t *zone = ksp->ks_private;
1859         zone_mcap_kstat_t *zmp = ksp->ks_data;
1860 
1861         if (rw == KSTAT_WRITE)
1862                 return (EACCES);
1863 
1864         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869 
1870         return (0);
1871 }
1872 
1873 static kstat_t *
1874 zone_mcap_kstat_create(zone_t *zone)
1875 {
1876         kstat_t *ksp;
1877         zone_mcap_kstat_t *zmp;
1878 
1879         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883                 return (NULL);
1884 
1885         if (zone->zone_id != GLOBAL_ZONEID)
1886                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1887 
1888         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890         ksp->ks_lock = &zone->zone_mcap_lock;
1891         zone->zone_mcap_stats = zmp;
1892 
1893         /* The kstat "name" field is not large enough for a full zonename */
1894         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901             KSTAT_DATA_UINT64);
1902 
1903         ksp->ks_update = zone_mcap_kstat_update;
1904         ksp->ks_private = zone;
1905 
1906         kstat_install(ksp);
1907         return (ksp);
1908 }
1909 
1910 static int
1911 zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 {
1913         zone_t *zone = ksp->ks_private;
1914         zone_misc_kstat_t *zmp = ksp->ks_data;
1915         hrtime_t hrtime;
1916         uint64_t tmp;
1917 
1918         if (rw == KSTAT_WRITE)
1919                 return (EACCES);
1920 
1921         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
1922         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1923         scalehrtime(&hrtime);
1924         zmp->zm_stime.value.ui64 = hrtime;
1925 
1926         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
1927         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1928         scalehrtime(&hrtime);
1929         zmp->zm_utime.value.ui64 = hrtime;
1930 
1931         tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
1932         hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1933         scalehrtime(&hrtime);
1934         zmp->zm_wtime.value.ui64 = hrtime;
1935 
1936         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1937         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1938         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1939 
1940         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1941         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1942         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1943         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1944 
1945         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1946 
1947         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1948         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1949 
1950         return (0);
1951 }
1952 
1953 static kstat_t *
1954 zone_misc_kstat_create(zone_t *zone)
1955 {
1956         kstat_t *ksp;
1957         zone_misc_kstat_t *zmp;
1958 
1959         if ((ksp = kstat_create_zone("zones", zone->zone_id,
1960             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1961             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1962             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1963                 return (NULL);
1964 
1965         if (zone->zone_id != GLOBAL_ZONEID)
1966                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1967 
1968         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1969         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1970         ksp->ks_lock = &zone->zone_misc_lock;
1971         zone->zone_misc_stats = zmp;
1972 
1973         /* The kstat "name" field is not large enough for a full zonename */
1974         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1975         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1976         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1977         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1978         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1979         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1980         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1981         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1982             KSTAT_DATA_UINT32);
1983         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1984         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1985             KSTAT_DATA_UINT32);
1986         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1987         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1988         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1989             KSTAT_DATA_UINT32);
1990         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1991         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1992 
1993         ksp->ks_update = zone_misc_kstat_update;
1994         ksp->ks_private = zone;
1995 
1996         kstat_install(ksp);
1997         return (ksp);
1998 }
1999 
2000 static void
2001 zone_kstat_create(zone_t *zone)
2002 {
2003         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2004             "lockedmem", zone_lockedmem_kstat_update);
2005         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2006             "swapresv", zone_swapresv_kstat_update);
2007         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2008             "nprocs", zone_nprocs_kstat_update);
2009 
2010         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2011                 zone->zone_mcap_stats = kmem_zalloc(
2012                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2013         }
2014 
2015         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2016                 zone->zone_misc_stats = kmem_zalloc(
2017                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2018         }
2019 }
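
     /*
      * Editorial note: if a kstat cannot be created above, the stats buffer
      * is still allocated, so counter-update paths elsewhere in the kernel
      * can proceed unconditionally without checking whether the kstat
      * itself exists.
      */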
2020 
2021 static void
2022 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2023 {
2024         void *data;
2025 
2026         if (*pkstat != NULL) {
2027                 data = (*pkstat)->ks_data;
2028                 kstat_delete(*pkstat);
2029                 kmem_free(data, datasz);
2030                 *pkstat = NULL;
2031         }
2032 }
2033 
2034 static void
2035 zone_kstat_delete(zone_t *zone)
2036 {
2037         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2038             sizeof (zone_kstat_t));
2039         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2040             sizeof (zone_kstat_t));
2041         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2042             sizeof (zone_kstat_t));
2043         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2044             sizeof (zone_mcap_kstat_t));
2045         zone_kstat_delete_common(&zone->zone_misc_ksp,
2046             sizeof (zone_misc_kstat_t));
2047 }
2048 
2049 /*
2050  * Called very early in boot to initialize the ZSD list so that
2051  * zone_key_create() can be called before zone_init().  It also initializes
2052  * portions of zone0 which may be used before zone_init() is called.  The
2053  * variable "global_zone" will be set when zone0 is fully initialized by
2054  * zone_init().
2055  */
2056 void
2057 zone_zsd_init(void)
2058 {
2059         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2060         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2061         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2062             offsetof(struct zsd_entry, zsd_linkage));
2063         list_create(&zone_active, sizeof (zone_t),
2064             offsetof(zone_t, zone_linkage));
2065         list_create(&zone_deathrow, sizeof (zone_t),
2066             offsetof(zone_t, zone_linkage));
2067 
2068         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2069         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2070         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2071         zone0.zone_shares = 1;
2072         zone0.zone_nlwps = 0;
2073         zone0.zone_nlwps_ctl = INT_MAX;
2074         zone0.zone_nprocs = 0;
2075         zone0.zone_nprocs_ctl = INT_MAX;
2076         zone0.zone_locked_mem = 0;
2077         zone0.zone_locked_mem_ctl = UINT64_MAX;
2078         ASSERT(zone0.zone_max_swap == 0);
2079         zone0.zone_max_swap_ctl = UINT64_MAX;
2080         zone0.zone_max_lofi = 0;
2081         zone0.zone_max_lofi_ctl = UINT64_MAX;
2082         zone0.zone_shmmax = 0;
2083         zone0.zone_ipc.ipcq_shmmni = 0;
2084         zone0.zone_ipc.ipcq_semmni = 0;
2085         zone0.zone_ipc.ipcq_msgmni = 0;
2086         zone0.zone_name = GLOBAL_ZONENAME;
2087         zone0.zone_nodename = utsname.nodename;
2088         zone0.zone_domain = srpc_domain;
2089         zone0.zone_hostid = HW_INVALID_HOSTID;
2090         zone0.zone_fs_allowed = NULL;
2091         psecflags_default(&zone0.zone_secflags);
2092         zone0.zone_ref = 1;
2093         zone0.zone_id = GLOBAL_ZONEID;
2094         zone0.zone_status = ZONE_IS_RUNNING;
2095         zone0.zone_rootpath = "/";
2096         zone0.zone_rootpathlen = 2;
2097         zone0.zone_psetid = ZONE_PS_INVAL;
2098         zone0.zone_ncpus = 0;
2099         zone0.zone_ncpus_online = 0;
2100         zone0.zone_proc_initpid = 1;
2101         zone0.zone_initname = initname;
2102         zone0.zone_lockedmem_kstat = NULL;
2103         zone0.zone_swapresv_kstat = NULL;
2104         zone0.zone_nprocs_kstat = NULL;
2105 
2106         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2107             offsetof(zone_ref_t, zref_linkage));
2108         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2109             offsetof(struct zsd_entry, zsd_linkage));
2110         list_insert_head(&zone_active, &zone0);
2111 
2112         /*
2113          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2114          * to anything meaningful.  It is assigned to be 'rootdir' in
2115          * vfs_mountroot().
2116          */
2117         zone0.zone_rootvp = NULL;
2118         zone0.zone_vfslist = NULL;
2119         zone0.zone_bootargs = initargs;
2120         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2121         /*
2122          * The global zone has all privileges
2123          */
2124         priv_fillset(zone0.zone_privset);
2125         /*
2126          * Add p0 to the global zone
2127          */
2128         zone0.zone_zsched = &p0;
2129         p0.p_zone = &zone0;
2130 }
2131 
2132 /*
2133  * Compute a hash value based on the contents of the label and the DOI.  The
2134  * hash algorithm is somewhat arbitrary, but is based on the observation that
2135  * humans will likely pick labels that differ by amounts that work out to be
2136  * multiples of the number of hash chains, and thus stirring in some primes
2137  * should help.
2138  */
2139 static uint_t
2140 hash_bylabel(void *hdata, mod_hash_key_t key)
2141 {
2142         const ts_label_t *lab = (ts_label_t *)key;
2143         const uint32_t *up, *ue;
2144         uint_t hash;
2145         int i;
2146 
2147         _NOTE(ARGUNUSED(hdata));
2148 
2149         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2150         /* we depend on alignment of label, but not representation */
2151         up = (const uint32_t *)&lab->tsl_label;
2152         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2153         i = 1;
2154         while (up < ue) {
2155                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2156                 hash += *up + (*up << ((i % 16) + 1));
2157                 up++;
2158                 i++;
2159         }
2160         return (hash);
2161 }
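
     /*
      * Editorial note: the expressions above are strength-reduced
      * multiplications: lab->tsl_doi + (lab->tsl_doi << 1) is 3 * tsl_doi,
      * and *up + (*up << n) is (2^n + 1) * *up; for n in [1, 16] this
      * includes the primes 3, 5, 17, 257, and 65537.
      */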
2162 
2163 /*
2164  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2165  * equal).  This may need to be changed if less than / greater than is ever
2166  * needed.
2167  */
2168 static int
2169 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2170 {
2171         ts_label_t *lab1 = (ts_label_t *)key1;
2172         ts_label_t *lab2 = (ts_label_t *)key2;
2173 
2174         return (label_equal(lab1, lab2) ? 0 : 1);
2175 }
2176 
2177 /*
2178  * Called by main() to initialize the zones framework.
2179  */
2180 void
2181 zone_init(void)
2182 {
2183         rctl_dict_entry_t *rde;
2184         rctl_val_t *dval;
2185         rctl_set_t *set;
2186         rctl_alloc_gp_t *gp;
2187         rctl_entity_p_t e;
2188         int res;
2189 
2190         ASSERT(curproc == &p0);
2191 
2192         /*
2193          * Create ID space for zone IDs.  ID 0 is reserved for the
2194          * global zone.
2195          */
2196         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2197 
2198         /*
2199          * Initialize generic zone resource controls, if any.
2200          */
2201         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2202             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2203             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2204             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2205 
2206         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2207             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2208             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2209             RCTL_GLOBAL_INFINITE,
2210             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2211 
2212         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2213             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2214             INT_MAX, INT_MAX, &zone_lwps_ops);
2215 
2216         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2217             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2218             INT_MAX, INT_MAX, &zone_procs_ops);
2219 
2220         /*
2221          * System V IPC resource controls
2222          */
2223         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2224             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2225             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2226 
2227         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2228             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2229             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2230 
2231         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2232             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2233             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2234 
2235         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2236             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2237             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2238 
2239         /*
2240          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2241          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2242          */
2243         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2244         bzero(dval, sizeof (rctl_val_t));
2245         dval->rcv_value = 1;
2246         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2247         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2248         dval->rcv_action_recip_pid = -1;
2249 
2250         rde = rctl_dict_lookup("zone.cpu-shares");
2251         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2252 
2253         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2254             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2255             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2256             &zone_locked_mem_ops);
2257 
2258         rc_zone_max_swap = rctl_register("zone.max-swap",
2259             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2260             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2261             &zone_max_swap_ops);
2262 
2263         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2264             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2265             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2266             &zone_max_lofi_ops);
2267 
2268         /*
2269          * Initialize the ``global zone''.
2270          */
2271         set = rctl_set_create();
2272         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2273         mutex_enter(&p0.p_lock);
2274         e.rcep_p.zone = &zone0;
2275         e.rcep_t = RCENTITY_ZONE;
2276         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);
2278 
2279         zone0.zone_nlwps = p0.p_lwpcnt;
2280         zone0.zone_nprocs = 1;
2281         zone0.zone_ntasks = 1;
2282         mutex_exit(&p0.p_lock);
2283         zone0.zone_restart_init = B_TRUE;
2284         zone0.zone_brand = &native_brand;
2285         rctl_prealloc_destroy(gp);
2286         /*
2287          * pool_default hasn't been initialized yet, so we let pool_init()
2288          * take care of making sure the global zone is in the default pool.
2289          */
2290 
2291         /*
2292          * Initialize global zone kstats
2293          */
2294         zone_kstat_create(&zone0);
2295 
2296         /*
2297          * Initialize zone label.
2298          * MLPs are initialized when tnzonecfg is loaded.
2299          */
2300         zone0.zone_slabel = l_admin_low;
2301         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2302         label_hold(l_admin_low);
2303 
2304         /*
2305  * Initialize the lock for the database structure used by mntfs.
2306          */
2307         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2308 
2309         zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2310 
2311         mutex_enter(&zonehash_lock);
2312         zone_uniqid(&zone0);
2313         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2314 
2315         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2316             mod_hash_null_valdtor);
2317         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2318             zone_hash_size, mod_hash_null_valdtor);
2319         /*
2320          * maintain zonehashbylabel only for labeled systems
2321          */
2322         if (is_system_labeled())
2323                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2324                     zone_hash_size, mod_hash_null_keydtor,
2325                     mod_hash_null_valdtor, hash_bylabel, NULL,
2326                     hash_labelkey_cmp, KM_SLEEP);
2327         zonecount = 1;
2328 
2329         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2330             (mod_hash_val_t)&zone0);
2331         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2332             (mod_hash_val_t)&zone0);
2333         if (is_system_labeled()) {
2334                 zone0.zone_flags |= ZF_HASHED_LABEL;
2335                 (void) mod_hash_insert(zonehashbylabel,
2336                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2337         }
2338         mutex_exit(&zonehash_lock);
2339 
2340         /*
2341          * We avoid setting zone_kcred until now, since kcred is initialized
2342          * sometime after zone_zsd_init() and before zone_init().
2343          */
2344         zone0.zone_kcred = kcred;
2345         /*
2346          * The global zone is fully initialized (except for zone_rootvp which
2347          * will be set when the root filesystem is mounted).
2348          */
2349         global_zone = &zone0;
2350 
2351         /*
2352          * Set up an event channel on which to send zone status change
              * notifications.
2353          */
2354         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2355             EVCH_CREAT);
2356 
2357         if (res)
2358                 panic("sysevent_evc_bind failed during zone setup.\n");
2360 }
2361 
2362 static void
2363 zone_free(zone_t *zone)
2364 {
2365         ASSERT(zone != global_zone);
2366         ASSERT(zone->zone_ntasks == 0);
2367         ASSERT(zone->zone_nlwps == 0);
2368         ASSERT(zone->zone_nprocs == 0);
2369         ASSERT(zone->zone_cred_ref == 0);
2370         ASSERT(zone->zone_kcred == NULL);
2371         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2372             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2373         ASSERT(list_is_empty(&zone->zone_ref_list));
2374 
2375         /*
2376          * Remove any zone caps.
2377          */
2378         cpucaps_zone_remove(zone);
2379 
2380         ASSERT(zone->zone_cpucap == NULL);
2381 
2382         /* remove from deathrow list */
2383         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2384                 ASSERT(zone->zone_ref == 0);
2385                 mutex_enter(&zone_deathrow_lock);
2386                 list_remove(&zone_deathrow, zone);
2387                 mutex_exit(&zone_deathrow_lock);
2388         }
2389 
2390         list_destroy(&zone->zone_ref_list);
2391         zone_free_zsd(zone);
2392         zone_free_datasets(zone);
2393         list_destroy(&zone->zone_dl_list);
2394 
2395         cpu_uarray_free(zone->zone_ustate);
2396 
2397         if (zone->zone_rootvp != NULL) {
2398                 vnode_t *vp = zone->zone_rootvp;
2399 
2400                 mutex_enter(&vp->v_lock);
2401                 vp->v_flag &= ~VZONEROOT;
2402                 mutex_exit(&vp->v_lock);
2403                 VN_RELE(vp);
2404                 /* No need to worry about NULL-ing out zone_rootvp. */
2405         }
2406         if (zone->zone_rootpath != NULL)
2407                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2408         if (zone->zone_name != NULL)
2409                 kmem_free(zone->zone_name, ZONENAME_MAX);
2410         if (zone->zone_slabel != NULL)
2411                 label_rele(zone->zone_slabel);
2412         if (zone->zone_nodename != NULL)
2413                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2414         if (zone->zone_domain != NULL)
2415                 kmem_free(zone->zone_domain, _SYS_NMLN);
2416         if (zone->zone_privset != NULL)
2417                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2418         if (zone->zone_rctls != NULL)
2419                 rctl_set_free(zone->zone_rctls);
2420         if (zone->zone_bootargs != NULL)
2421                 strfree(zone->zone_bootargs);
2422         if (zone->zone_initname != NULL)
2423                 strfree(zone->zone_initname);
2424         if (zone->zone_fs_allowed != NULL)
2425                 strfree(zone->zone_fs_allowed);
2426         if (zone->zone_pfexecd != NULL)
2427                 klpd_freelist(&zone->zone_pfexecd);
2428         id_free(zoneid_space, zone->zone_id);
2429         mutex_destroy(&zone->zone_lock);
2430         cv_destroy(&zone->zone_cv);
2431         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2432         rw_destroy(&zone->zone_mntfs_db_lock);
2433         kmem_free(zone, sizeof (zone_t));
2434 }
2435 
2436 /*
2437  * See block comment at the top of this file for information about zone
2438  * status values.
2439  */
2440 /*
2441  * Convenience function for setting zone status.
2442  */
2443 static void
2444 zone_status_set(zone_t *zone, zone_status_t status)
2445 {
2447         nvlist_t *nvl = NULL;

2448         ASSERT(MUTEX_HELD(&zone_status_lock));
2449         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2450             status >= zone_status_get(zone));
2451 
2452         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2453             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2454             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2455             zone_status_table[status]) ||
2456             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2457             zone_status_table[zone->zone_status]) ||
2458             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2459             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2460             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2461             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2462 #ifdef DEBUG
2463                 (void) printf(
2464                     "Failed to allocate and send zone state change event.\n");
2465 #endif
2466         }
2467         nvlist_free(nvl);
2468 
2469         zone->zone_status = status;
2470 
2471         cv_broadcast(&zone->zone_cv);
2472 }
2473 
2474 /*
2475  * Public function to retrieve the zone status.  The zone status may
2476  * change after it is retrieved.
2477  */
2478 zone_status_t
2479 zone_status_get(zone_t *zone)
2480 {
2481         return (zone->zone_status);
2482 }
2483 
2484 static int
2485 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2486 {
2487         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2488         int err = 0;
2489 
2490         ASSERT(zone != global_zone);
2491         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2492                 goto done;      /* EFAULT or ENAMETOOLONG */
2493 
2494         if (zone->zone_bootargs != NULL)
2495                 strfree(zone->zone_bootargs);
2496 
2497         zone->zone_bootargs = strdup(buf);
2498 
2499 done:
2500         kmem_free(buf, BOOTARGS_MAX);
2501         return (err);
2502 }
2503 
2504 static int
2505 zone_set_brand(zone_t *zone, const char *brand)
2506 {
2507         struct brand_attr *attrp;
2508         brand_t *bp;
2509 
2510         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2511         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2512                 kmem_free(attrp, sizeof (struct brand_attr));
2513                 return (EFAULT);
2514         }
2515 
2516         bp = brand_register_zone(attrp);
2517         kmem_free(attrp, sizeof (struct brand_attr));
2518         if (bp == NULL)
2519                 return (EINVAL);
2520 
2521         /*
2522          * This is the only place where a zone can change its brand.
2523          * We already need to hold zone_status_lock to check the zone
2524          * status, so we'll just use that lock to serialize zone
2525          * branding requests as well.
2526          */
2527         mutex_enter(&zone_status_lock);
2528 
2529         /* Re-branding is not allowed, and the zone must not have booted yet */
2530         if ((ZONE_IS_BRANDED(zone)) ||
2531             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2532                 mutex_exit(&zone_status_lock);
2533                 brand_unregister_zone(bp);
2534                 return (EINVAL);
2535         }
2536 
2537         /* set up the brand specific data */
2538         zone->zone_brand = bp;
2539         ZBROP(zone)->b_init_brand_data(zone);
2540 
2541         mutex_exit(&zone_status_lock);
2542         return (0);
2543 }
2544 
2545 static int
2546 zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2547 {
2548         int err = 0;
2549         psecflags_t psf;
2550 
2551         ASSERT(zone != global_zone);
2552 
2553         if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
2554                 return (err);
2555 
2556         if (zone_status_get(zone) > ZONE_IS_READY)
2557                 return (EINVAL);
2558 
2559         if (!psecflags_validate(&psf))
2560                 return (EINVAL);
2561 
2562         (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2563 
2564         /* Set security flags on the zone's zsched */
2565         (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2566             sizeof (zone->zone_zsched->p_secflags));
2567 
2568         return (0);
2569 }
2570 
2571 static int
2572 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2573 {
2574         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2575         int err = 0;
2576 
2577         ASSERT(zone != global_zone);
2578         if ((err = copyinstr(zone_fs_allowed, buf,
2579             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2580                 goto done;
2581 
2582         if (zone->zone_fs_allowed != NULL)
2583                 strfree(zone->zone_fs_allowed);
2584 
2585         zone->zone_fs_allowed = strdup(buf);
2586 
2587 done:
2588         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2589         return (err);
2590 }
2591 
2592 static int
2593 zone_set_initname(zone_t *zone, const char *zone_initname)
2594 {
2595         char initname[INITNAME_SZ];
2596         size_t len;
2597         int err = 0;
2598 
2599         ASSERT(zone != global_zone);
2600         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2601                 return (err);   /* EFAULT or ENAMETOOLONG */
2602 
2603         if (zone->zone_initname != NULL)
2604                 strfree(zone->zone_initname);
2605 
2606         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2607         (void) strcpy(zone->zone_initname, initname);
2608         return (0);
2609 }
2610 
2611 static int
2612 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2613 {
2614         uint64_t mcap;
2615         int err = 0;
2616 
2617         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2618                 zone->zone_phys_mcap = mcap;
2619 
2620         return (err);
2621 }
2622 
2623 static int
2624 zone_set_sched_class(zone_t *zone, const char *new_class)
2625 {
2626         char sched_class[PC_CLNMSZ];
2627         id_t classid;
2628         int err;
2629 
2630         ASSERT(zone != global_zone);
2631         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2632                 return (err);   /* EFAULT or ENAMETOOLONG */
2633 
2634         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2635                 return (set_errno(EINVAL));
2636         zone->zone_defaultcid = classid;
2637         ASSERT(zone->zone_defaultcid > 0 &&
2638             zone->zone_defaultcid < loaded_classes);
2639 
2640         return (0);
2641 }
2642 
2643 /*
2644  * Block indefinitely waiting for (zone_status >= status)
2645  */
2646 void
2647 zone_status_wait(zone_t *zone, zone_status_t status)
2648 {
2649         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2650 
2651         mutex_enter(&zone_status_lock);
2652         while (zone->zone_status < status) {
2653                 cv_wait(&zone->zone_cv, &zone_status_lock);
2654         }
2655         mutex_exit(&zone_status_lock);
2656 }
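
     /*
      * Illustrative sketch (an addition, not in the original source): a
      * caller holding a zone reference can block until the zone has
      * finished booting with:
      *
      *         zone_hold(zone);
      *         zone_status_wait(zone, ZONE_IS_RUNNING);
      *         zone_rele(zone);
      */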
2657 
2658 /*
2659  * Private CPR-safe version of zone_status_wait().
2660  */
2661 static void
2662 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2663 {
2664         callb_cpr_t cprinfo;
2665 
2666         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2667 
2668         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2669             str);
2670         mutex_enter(&zone_status_lock);
2671         while (zone->zone_status < status) {
2672                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2673                 cv_wait(&zone->zone_cv, &zone_status_lock);
2674                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2675         }
2676         /*
2677          * zone_status_lock is implicitly released by the following.
2678          */
2679         CALLB_CPR_EXIT(&cprinfo);
2680 }
2681 
2682 /*
2683  * Block until zone enters requested state or signal is received.  Return (0)
2684  * if signaled, non-zero otherwise.
2685  */
2686 int
2687 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2688 {
2689         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2690 
2691         mutex_enter(&zone_status_lock);
2692         while (zone->zone_status < status) {
2693                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2694                         mutex_exit(&zone_status_lock);
2695                         return (0);
2696                 }
2697         }
2698         mutex_exit(&zone_status_lock);
2699         return (1);
2700 }
2701 
2702 /*
2703  * Block until the zone enters the requested state or the timeout expires,
2704  * whichever happens first.  Return (-1) if operation timed out, time remaining
2705  * otherwise.
2706  */
2707 clock_t
2708 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2709 {
2710         clock_t timeleft = 0;
2711 
2712         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2713 
2714         mutex_enter(&zone_status_lock);
2715         while (zone->zone_status < status && timeleft != -1) {
2716                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2717         }
2718         mutex_exit(&zone_status_lock);
2719         return (timeleft);
2720 }
2721 
2722 /*
2723  * Block until the zone enters the requested state, the current process is
2724  * signaled, or the timeout expires, whichever happens first.  Return (-1) if
2725  * operation timed out, 0 if signaled, time remaining otherwise.
2726  */
2727 clock_t
2728 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2729 {
2730         clock_t timeleft = tim - ddi_get_lbolt();
2731 
2732         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2733 
2734         mutex_enter(&zone_status_lock);
2735         while (zone->zone_status < status) {
2736                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2737                     tim);
2738                 if (timeleft <= 0)
2739                         break;
2740         }
2741         mutex_exit(&zone_status_lock);
2742         return (timeleft);
2743 }
2744 
2745 /*
2746  * Zones have two reference counts: one for references from credential
2747  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2748  * This is so we can allow a zone to be rebooted while there are still
2749  * outstanding cred references, since certain drivers cache dblks (which
2750  * implicitly results in cached creds).  We wait for zone_ref to drop to
2751  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2752  * later freed when the zone_cred_ref drops to 0, though nothing other
2753  * than the zone id and privilege set should be accessed once the zone
2754  * is "dead".
2755  *
2756  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2757  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2758  * to 0.  This can be useful to flush out other sources of cached creds
2759  * that may be less innocuous than the driver case.
2760  *
2761  * Zones also provide a tracked reference counting mechanism in which zone
2762  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2763  * debuggers determine the sources of leaked zone references.  See
2764  * zone_hold_ref() and zone_rele_ref() below for more information.
2765  */
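
     /*
      * An illustrative tracked-reference lifecycle (a sketch, not taken from
      * the original source; ZONE_REF_NFS stands in for whichever
      * zone_ref_subsys_t constant the caller owns), using the routines
      * defined below:
      *
      *         zone_ref_t ref;
      *
      *         zone_init_ref(&ref);
      *         zone_hold_ref(zone, &ref, ZONE_REF_NFS);
      *         ... long-lived use of the zone ...
      *         zone_rele_ref(&ref, ZONE_REF_NFS);
      */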
2766 
2767 int zone_wait_for_cred = 0;
2768 
2769 static void
2770 zone_hold_locked(zone_t *z)
2771 {
2772         ASSERT(MUTEX_HELD(&z->zone_lock));
2773         z->zone_ref++;
2774         ASSERT(z->zone_ref != 0);
2775 }
2776 
2777 /*
2778  * Increment the specified zone's reference count.  The zone's zone_t structure
2779  * will not be freed as long as the zone's reference count is nonzero.
2780  * Decrement the zone's reference count via zone_rele().
2781  *
2782  * NOTE: This function should only be used to hold zones for short periods of
2783  * time.  Use zone_hold_ref() if the zone must be held for a long time.
2784  */
2785 void
2786 zone_hold(zone_t *z)
2787 {
2788         mutex_enter(&z->zone_lock);
2789         zone_hold_locked(z);
2790         mutex_exit(&z->zone_lock);
2791 }
2792 
2793 /*
2794  * If the non-cred ref count drops to 1 and either the cred ref count
2795  * is 0 or we aren't waiting for cred references, the zone is ready to
2796  * be destroyed.
2797  */
2798 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2799             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2800 
2801 /*
2802  * Common zone reference release function invoked by zone_rele() and
2803  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2804  * zone's subsystem-specific reference counters are not affected by the
2805  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2806  * removed from the specified zone's reference list.  ref must be non-NULL iff
2807  * subsys is not ZONE_REF_NUM_SUBSYS.
2808  */
2809 static void
2810 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2811 {
2812         boolean_t wakeup;
2813 
2814         mutex_enter(&z->zone_lock);
2815         ASSERT(z->zone_ref != 0);
2816         z->zone_ref--;
2817         if (subsys != ZONE_REF_NUM_SUBSYS) {
2818                 ASSERT(z->zone_subsys_ref[subsys] != 0);
2819                 z->zone_subsys_ref[subsys]--;
2820                 list_remove(&z->zone_ref_list, ref);
2821         }
2822         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2823                 /* no more refs, free the structure */
2824                 mutex_exit(&z->zone_lock);
2825                 zone_free(z);
2826                 return;
2827         }
2828         /* signal zone_destroy so the zone can finish halting */
2829         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2830         mutex_exit(&z->zone_lock);
2831 
2832         if (wakeup) {
2833                 /*
2834                  * Grabbing zonehash_lock here effectively synchronizes with
2835                  * zone_destroy() to avoid missed signals.
2836                  */
2837                 mutex_enter(&zonehash_lock);
2838                 cv_broadcast(&zone_destroy_cv);
2839                 mutex_exit(&zonehash_lock);
2840         }
2841 }
2842 
2843 /*
2844  * Decrement the specified zone's reference count.  The specified zone will
2845  * cease to exist after this function returns if the reference count drops to
2846  * zero.  This function should be paired with zone_hold().
2847  */
2848 void
2849 zone_rele(zone_t *z)
2850 {
2851         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2852 }
2853 
2854 /*
2855  * Initialize a zone reference structure.  This function must be invoked for
2856  * a reference structure before the structure is passed to zone_hold_ref().
2857  */
2858 void
2859 zone_init_ref(zone_ref_t *ref)
2860 {
2861         ref->zref_zone = NULL;
2862         list_link_init(&ref->zref_linkage);
2863 }
2864 
2865 /*
2866  * Acquire a reference to zone z.  The caller must specify the
2867  * zone_ref_subsys_t constant associated with its subsystem.  The specified
2868  * zone_ref_t structure will represent a reference to the specified zone.  Use
2869  * zone_rele_ref() to release the reference.
2870  *
2871  * The referenced zone_t structure will not be freed as long as the zone_t's
2872  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2873  * references.
2874  *
2875  * NOTE: The zone_ref_t structure must be initialized before it is used.
2876  * See zone_init_ref() above.
2877  */
2878 void
2879 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2880 {
2881         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2882 
2883         /*
2884          * Prevent consumers from reusing a reference structure before
2885          * releasing it.
2886          */
2887         VERIFY(ref->zref_zone == NULL);
2888 
2889         ref->zref_zone = z;
2890         mutex_enter(&z->zone_lock);
2891         zone_hold_locked(z);
2892         z->zone_subsys_ref[subsys]++;
2893         ASSERT(z->zone_subsys_ref[subsys] != 0);
2894         list_insert_head(&z->zone_ref_list, ref);
2895         mutex_exit(&z->zone_lock);
2896 }
2897 
2898 /*
2899  * Release the zone reference represented by the specified zone_ref_t.
2900  * The reference is invalid after it's released; however, the zone_ref_t
2901  * structure can be reused without having to invoke zone_init_ref().
2902  * subsys should be the same value that was passed to zone_hold_ref()
2903  * when the reference was acquired.
2904  */
2905 void
2906 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2907 {
2908         zone_rele_common(ref->zref_zone, ref, subsys);
2909 
2910         /*
2911          * Set the zone_ref_t's zref_zone field to NULL to generate panics
2912          * when consumers dereference the reference.  This helps us catch
2913          * consumers who use released references.  Furthermore, this lets
2914          * consumers reuse the zone_ref_t structure without having to
2915          * invoke zone_init_ref().
2916          */
2917         ref->zref_zone = NULL;
2918 }
2919 
2920 void
2921 zone_cred_hold(zone_t *z)
2922 {
2923         mutex_enter(&z->zone_lock);
2924         z->zone_cred_ref++;
2925         ASSERT(z->zone_cred_ref != 0);
2926         mutex_exit(&z->zone_lock);
2927 }
2928 
2929 void
2930 zone_cred_rele(zone_t *z)
2931 {
2932         boolean_t wakeup;
2933 
2934         mutex_enter(&z->zone_lock);
2935         ASSERT(z->zone_cred_ref != 0);
2936         z->zone_cred_ref--;
2937         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2938                 /* no more refs, free the structure */
2939                 mutex_exit(&z->zone_lock);
2940                 zone_free(z);
2941                 return;
2942         }
2943         /*
2944          * If zone_destroy is waiting for the cred references to drain
2945          * out, and they have, signal it.
2946          */
2947         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2948             zone_status_get(z) >= ZONE_IS_DEAD);
2949         mutex_exit(&z->zone_lock);
2950 
2951         if (wakeup) {
2952                 /*
2953                  * Grabbing zonehash_lock here effectively synchronizes with
2954                  * zone_destroy() to avoid missed signals.
2955                  */
2956                 mutex_enter(&zonehash_lock);
2957                 cv_broadcast(&zone_destroy_cv);
2958                 mutex_exit(&zonehash_lock);
2959         }
2960 }
2961 
2962 void
2963 zone_task_hold(zone_t *z)
2964 {
2965         mutex_enter(&z->zone_lock);
2966         z->zone_ntasks++;
2967         ASSERT(z->zone_ntasks != 0);
2968         mutex_exit(&z->zone_lock);
2969 }
2970 
2971 void
2972 zone_task_rele(zone_t *zone)
2973 {
2974         uint_t refcnt;
2975 
2976         mutex_enter(&zone->zone_lock);
2977         ASSERT(zone->zone_ntasks != 0);
2978         refcnt = --zone->zone_ntasks;
2979         if (refcnt > 1) {       /* Common case */
2980                 mutex_exit(&zone->zone_lock);
2981                 return;
2982         }
2983         zone_hold_locked(zone); /* so we can use the zone_t later */
2984         mutex_exit(&zone->zone_lock);
2985         if (refcnt == 1) {
2986                 /*
2987                  * See if the zone is shutting down.
2988                  */
2989                 mutex_enter(&zone_status_lock);
2990                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2991                         goto out;
2992                 }
2993 
2994                 /*
2995                  * Make sure the ntasks didn't change since we
2996                  * dropped zone_lock.
2997                  */
2998                 mutex_enter(&zone->zone_lock);
2999                 if (refcnt != zone->zone_ntasks) {
3000                         mutex_exit(&zone->zone_lock);
3001                         goto out;
3002                 }
3003                 mutex_exit(&zone->zone_lock);
3004 
3005                 /*
3006                  * No more user processes in the zone.  The zone is empty.
3007                  */
3008                 zone_status_set(zone, ZONE_IS_EMPTY);
3009                 goto out;
3010         }
3011 
3012         ASSERT(refcnt == 0);
3013         /*
3014          * zsched has exited; the zone is dead.
3015          */
3016         zone->zone_zsched = NULL;            /* paranoia */
3017         mutex_enter(&zone_status_lock);
3018         zone_status_set(zone, ZONE_IS_DEAD);
3019 out:
3020         mutex_exit(&zone_status_lock);
3021         zone_rele(zone);
3022 }
3023 
3024 zoneid_t
3025 getzoneid(void)
3026 {
3027         return (curproc->p_zone->zone_id);
3028 }
3029 
3030 /*
3031  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3032  * check the validity of a zone's state.
3033  */
3034 static zone_t *
3035 zone_find_all_by_id(zoneid_t zoneid)
3036 {
3037         mod_hash_val_t hv;
3038         zone_t *zone = NULL;
3039 
3040         ASSERT(MUTEX_HELD(&zonehash_lock));
3041 
3042         if (mod_hash_find(zonehashbyid,
3043             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3044                 zone = (zone_t *)hv;
3045         return (zone);
3046 }
3047 
3048 static zone_t *
3049 zone_find_all_by_label(const ts_label_t *label)
3050 {
3051         mod_hash_val_t hv;
3052         zone_t *zone = NULL;
3053 
3054         ASSERT(MUTEX_HELD(&zonehash_lock));
3055 
3056         /*
3057          * zonehashbylabel is not maintained for unlabeled systems
3058          */
3059         if (!is_system_labeled())
3060                 return (NULL);
3061         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3062                 zone = (zone_t *)hv;
3063         return (zone);
3064 }
3065 
3066 static zone_t *
3067 zone_find_all_by_name(char *name)
3068 {
3069         mod_hash_val_t hv;
3070         zone_t *zone = NULL;
3071 
3072         ASSERT(MUTEX_HELD(&zonehash_lock));
3073 
3074         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3075                 zone = (zone_t *)hv;
3076         return (zone);
3077 }
3078 
3079 /*
3080  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3081  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3082  * Caller must call zone_rele() once it is done with the zone.
3083  *
3084  * The zone may begin the zone_destroy() sequence immediately after this
3085  * function returns, but may be safely used until zone_rele() is called.
3086  */
3087 zone_t *
3088 zone_find_by_id(zoneid_t zoneid)
3089 {
3090         zone_t *zone;
3091         zone_status_t status;
3092 
3093         mutex_enter(&zonehash_lock);
3094         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3095                 mutex_exit(&zonehash_lock);
3096                 return (NULL);
3097         }
3098         status = zone_status_get(zone);
3099         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3100                 /*
3101                  * For all practical purposes the zone doesn't exist.
3102                  */
3103                 mutex_exit(&zonehash_lock);
3104                 return (NULL);
3105         }
3106         zone_hold(zone);
3107         mutex_exit(&zonehash_lock);
3108         return (zone);
3109 }
3110 
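     /*
      * A sketch of the expected lookup/release pattern (error handling
      * is up to the caller):
      *
      *         zone_t *zp;
      *
      *         if ((zp = zone_find_by_id(zoneid)) == NULL)
      *                 return (EINVAL);
      *         ... use zp; it remains valid until released ...
      *         zone_rele(zp);
      */
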
3111 /*
3112  * Similar to zone_find_by_id, but using zone label as the key.
3113  */
3114 zone_t *
3115 zone_find_by_label(const ts_label_t *label)
3116 {
3117         zone_t *zone;
3118         zone_status_t status;
3119 
3120         mutex_enter(&zonehash_lock);
3121         if ((zone = zone_find_all_by_label(label)) == NULL) {
3122                 mutex_exit(&zonehash_lock);
3123                 return (NULL);
3124         }
3125 
3126         status = zone_status_get(zone);
3127         if (status > ZONE_IS_DOWN) {
3128                 /*
3129                  * For all practical purposes the zone doesn't exist.
3130                  */
3131                 mutex_exit(&zonehash_lock);
3132                 return (NULL);
3133         }
3134         zone_hold(zone);
3135         mutex_exit(&zonehash_lock);
3136         return (zone);
3137 }
3138 
3139 /*
3140  * Similar to zone_find_by_id, but using zone name as the key.
3141  */
3142 zone_t *
3143 zone_find_by_name(char *name)
3144 {
3145         zone_t *zone;
3146         zone_status_t status;
3147 
3148         mutex_enter(&zonehash_lock);
3149         if ((zone = zone_find_all_by_name(name)) == NULL) {
3150                 mutex_exit(&zonehash_lock);
3151                 return (NULL);
3152         }
3153         status = zone_status_get(zone);
3154         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3155                 /*
3156                  * For all practical purposes the zone doesn't exist.
3157                  */
3158                 mutex_exit(&zonehash_lock);
3159                 return (NULL);
3160         }
3161         zone_hold(zone);
3162         mutex_exit(&zonehash_lock);
3163         return (zone);
3164 }
3165 
3166 /*
3167  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3168  * if there is a zone "foo" rooted at /foo/root, and the path argument
3169  * is "/foo/root/proc", it will return the held zone_t corresponding to
3170  * zone "foo".
3171  *
3172  * zone_find_by_path() always returns a non-NULL value, since at the
3173  * very least every path will be contained in the global zone.
3174  *
3175  * As with the other zone_find_by_*() functions, the caller is
3176  * responsible for zone_rele()ing the return value of this function.
3177  */
3178 zone_t *
3179 zone_find_by_path(const char *path)
3180 {
3181         zone_t *zone;
3182         zone_t *zret = NULL;
3183         zone_status_t status;
3184 
3185         if (path == NULL) {
3186                 /*
3187                  * Call from rootconf().
3188                  */
3189                 zone_hold(global_zone);
3190                 return (global_zone);
3191         }
3192         ASSERT(*path == '/');
3193         mutex_enter(&zonehash_lock);
3194         for (zone = list_head(&zone_active); zone != NULL;
3195             zone = list_next(&zone_active, zone)) {
3196                 if (ZONE_PATH_VISIBLE(path, zone))
3197                         zret = zone;
3198         }
3199         ASSERT(zret != NULL);
3200         status = zone_status_get(zret);
3201         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3202                 /*
3203                  * For all practical purposes the zone doesn't exist.
3204                  */
3205                 zret = global_zone;
3206         }
3207         zone_hold(zret);
3208         mutex_exit(&zonehash_lock);
3209         return (zret);
3210 }
3211 
3212 /*
3213  * Public interface for updating per-zone load averages.  Called once per
3214  * second.
3215  *
3216  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3217  */
3218 void
3219 zone_loadavg_update(void)
3220 {
3221         zone_t *zp;
3222         zone_status_t status;
3223         struct loadavg_s *lavg;
3224         hrtime_t zone_total;
3225         uint64_t tmp;
3226         int i;
3227         hrtime_t hr_avg;
3228         int nrun;
3229         static int64_t f[3] = { 135, 27, 9 };
3230         int64_t q, r;
3231 
3232         mutex_enter(&zonehash_lock);
3233         for (zp = list_head(&zone_active); zp != NULL;
3234             zp = list_next(&zone_active, zp)) {
3235                 mutex_enter(&zp->zone_lock);
3236 
3237                 /* Skip zones that are on the way down or not yet up */
3238                 status = zone_status_get(zp);
3239                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3240                         /* For all practical purposes the zone doesn't exist. */
3241                         mutex_exit(&zp->zone_lock);
3242                         continue;
3243                 }
3244 
3245                 /*
3246                  * Update the 10 second moving average data in zone_loadavg.
3247                  */
3248                 lavg = &zp->zone_loadavg;
3249 
3250                 tmp = cpu_uarray_sum_all(zp->zone_ustate);
3251                 zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3252 
3253                 scalehrtime(&zone_total);
3254 
3255                 /* The zone_total should always be increasing. */
3256                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3257                     zone_total - lavg->lg_total : 0;
3258                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3259                 /* lg_total holds the previous one-second total */
3260                 lavg->lg_total = zone_total;
3261 
3262                 /*
3263                  * To simplify the calculation, we don't calculate the load avg.
3264                  * until the zone has been up for at least 10 seconds and our
3265                  * moving average is thus full.
3266                  */
3267                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3268                         lavg->lg_len++;
3269                         mutex_exit(&zp->zone_lock);
3270                         continue;
3271                 }
3272 
3273                 /* Now calculate the 1-, 5- and 15-minute load averages. */
3274                 hr_avg = 0;
3275                 for (i = 0; i < S_LOADAVG_SZ; i++)
3276                         hr_avg += lavg->lg_loads[i];
3277                 hr_avg = hr_avg / S_LOADAVG_SZ;
3278                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3279 
3280                 /* Compute load avg. See comment in calcloadavg() */
3281                 for (i = 0; i < 3; i++) {
3282                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3283                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3284                         zp->zone_hp_avenrun[i] +=
3285                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3286 
3287                         /* avenrun[] can only hold 31 bits of load avg. */
3288                         if (zp->zone_hp_avenrun[i] <
3289                             ((uint64_t)1<<(31+16-FSHIFT)))
3290                                 zp->zone_avenrun[i] = (int32_t)
3291                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3292                         else
3293                                 zp->zone_avenrun[i] = 0x7fffffff;
3294                 }
3295 
3296                 mutex_exit(&zp->zone_lock);
3297         }
3298         mutex_exit(&zonehash_lock);
3299 }
3300 
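     /*
      * Conceptually, the decay step above computes, once per second and
      * for each averaging interval T in {60, 300, 900} seconds:
      *
      *         avg += (sample - avg) * f[i] / 2^13
      *
      * where f[i] = (1 - exp(-1/T)) * 2^13, which yields the constants
      * f[] = { 135, 27, 9 } (see the comment in calcloadavg()).  The q/r
      * split and the shifts are fixed-point bookkeeping (16 fractional
      * bits) that keeps the intermediate products from overflowing
      * 64 bits.
      */
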
3301 /*
3302  * Get the number of cpus visible to this zone.  The system-wide global
3303  * 'ncpus' is returned if pools are disabled, the caller is in the
3304  * global zone, or a NULL zone argument is passed in.
3305  */
3306 int
3307 zone_ncpus_get(zone_t *zone)
3308 {
3309         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3310 
3311         return (myncpus != 0 ? myncpus : ncpus);
3312 }
3313 
3314 /*
3315  * Get the number of online cpus visible to this zone.  The system-wide
3316  * global 'ncpus_online' is returned if pools are disabled, the caller
3317  * is in the global zone, or a NULL zone argument is passed in.
3318  */
3319 int
3320 zone_ncpus_online_get(zone_t *zone)
3321 {
3322         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3323 
3324         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3325 }
3326 
3327 /*
3328  * Return the pool to which the zone is currently bound.
3329  */
3330 pool_t *
3331 zone_pool_get(zone_t *zone)
3332 {
3333         ASSERT(pool_lock_held());
3334 
3335         return (zone->zone_pool);
3336 }
3337 
3338 /*
3339  * Set the zone's pool pointer and update the zone's visibility to match
3340  * the resources in the new pool.
3341  */
3342 void
3343 zone_pool_set(zone_t *zone, pool_t *pool)
3344 {
3345         ASSERT(pool_lock_held());
3346         ASSERT(MUTEX_HELD(&cpu_lock));
3347 
3348         zone->zone_pool = pool;
3349         zone_pset_set(zone, pool->pool_pset->pset_id);
3350 }
3351 
3352 /*
3353  * Return the cached value of the id of the processor set to which the
3354  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3355  * facility is disabled.
3356  */
3357 psetid_t
3358 zone_pset_get(zone_t *zone)
3359 {
3360         ASSERT(MUTEX_HELD(&cpu_lock));
3361 
3362         return (zone->zone_psetid);
3363 }
3364 
3365 /*
3366  * Set the cached value of the id of the processor set to which the zone
3367  * is currently bound.  Also update the zone's visibility to match the
3368  * resources in the new processor set.
3369  */
3370 void
3371 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3372 {
3373         psetid_t oldpsetid;
3374 
3375         ASSERT(MUTEX_HELD(&cpu_lock));
3376         oldpsetid = zone_pset_get(zone);
3377 
3378         if (oldpsetid == newpsetid)
3379                 return;
3380         /*
3381          * Global zone sees all.
3382          */
3383         if (zone != global_zone) {
3384                 zone->zone_psetid = newpsetid;
3385                 if (newpsetid != ZONE_PS_INVAL)
3386                         pool_pset_visibility_add(newpsetid, zone);
3387                 if (oldpsetid != ZONE_PS_INVAL)
3388                         pool_pset_visibility_remove(oldpsetid, zone);
3389         }
3390         /*
3391          * Disabling pools, so we should start using the global values
3392          * for ncpus and ncpus_online.
3393          */
3394         if (newpsetid == ZONE_PS_INVAL) {
3395                 zone->zone_ncpus = 0;
3396                 zone->zone_ncpus_online = 0;
3397         }
3398 }
3399 
3400 /*
3401  * Walk the list of active zones and issue the provided callback for
3402  * each of them.
3403  *
3404  * Caller must not be holding any locks that may be acquired under
3405  * zonehash_lock.  See comment at the beginning of the file for a list of
3406  * common locks and their interactions with zones.
3407  */
3408 int
3409 zone_walk(int (*cb)(zone_t *, void *), void *data)
3410 {
3411         zone_t *zone;
3412         int ret = 0;
3413         zone_status_t status;
3414 
3415         mutex_enter(&zonehash_lock);
3416         for (zone = list_head(&zone_active); zone != NULL;
3417             zone = list_next(&zone_active, zone)) {
3418                 /*
3419                  * Skip zones that shouldn't be externally visible.
3420                  */
3421                 status = zone_status_get(zone);
3422                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3423                         continue;
3424                 /*
3425                  * Bail immediately if any callback invocation returns a
3426                  * non-zero value.
3427                  */
3428                 ret = (*cb)(zone, data);
3429                 if (ret != 0)
3430                         break;
3431         }
3432         mutex_exit(&zonehash_lock);
3433         return (ret);
3434 }
3435 
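     /*
      * A hypothetical callback, counting the visible zones; returning
      * non-zero from the callback would terminate the walk early:
      *
      *         static int
      *         count_zones_cb(zone_t *zp, void *arg)
      *         {
      *                 (*(uint_t *)arg)++;
      *                 return (0);
      *         }
      *
      *         uint_t n = 0;
      *         (void) zone_walk(count_zones_cb, &n);
      */
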
3436 static int
3437 zone_set_root(zone_t *zone, const char *upath)
3438 {
3439         vnode_t *vp;
3440         int trycount;
3441         int error = 0;
3442         char *path;
3443         struct pathname upn, pn;
3444         size_t pathlen;
3445 
3446         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3447                 return (error);
3448 
3449         pn_alloc(&pn);
3450 
3451         /* prevent infinite loop */
3452         trycount = 10;
3453         for (;;) {
3454                 if (--trycount <= 0) {
3455                         error = ESTALE;
3456                         goto out;
3457                 }
3458 
3459                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3460                         /*
3461                          * VOP_ACCESS() may cover 'vp' with a new
3462                          * filesystem, if 'vp' is an autoFS vnode.
3463                          * Get the new 'vp' if so.
3464                          */
3465                         if ((error =
3466                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3467                             (!vn_ismntpt(vp) ||
3468                             (error = traverse(&vp)) == 0)) {
3469                                 pathlen = pn.pn_pathlen + 2;
3470                                 path = kmem_alloc(pathlen, KM_SLEEP);
3471                                 (void) strncpy(path, pn.pn_path,
3472                                     pn.pn_pathlen + 1);
3473                                 path[pathlen - 2] = '/';
3474                                 path[pathlen - 1] = '\0';
3475                                 pn_free(&pn);
3476                                 pn_free(&upn);
3477 
3478                                 /* Success! */
3479                                 break;
3480                         }
3481                         VN_RELE(vp);
3482                 }
3483                 if (error != ESTALE)
3484                         goto out;
3485         }
3486 
3487         ASSERT(error == 0);
3488         mutex_enter(&vp->v_lock);
3489         if (vp->v_flag & VZONEROOT) {
3490                 /* Wow, someone's already using this zone root! */
3491                 error = EEXIST; /* XXX KEBE ASKS, better errno? */
3492                 mutex_exit(&vp->v_lock);
3493                 VN_RELE(vp);
3494                 goto out;
3495         }
3496         vp->v_flag |= VZONEROOT;
3497         mutex_exit(&vp->v_lock);
3498         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3499         zone->zone_rootpath = path;
3500         zone->zone_rootpathlen = pathlen;
3501         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3502                 zone->zone_flags |= ZF_IS_SCRATCH;
3503         return (0);
3504 
3505 out:
3506         pn_free(&pn);
3507         pn_free(&upn);
3508         return (error);
3509 }
3510 
3511 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3512                         ((c) >= 'a' && (c) <= 'z') || \
3513                         ((c) >= 'A' && (c) <= 'Z'))
3514 
3515 static int
3516 zone_set_name(zone_t *zone, const char *uname)
3517 {
3518         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3519         size_t len;
3520         int i, err;
3521 
3522         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3523                 kmem_free(kname, ZONENAME_MAX);
3524                 return (err);   /* EFAULT or ENAMETOOLONG */
3525         }
3526 
3527         /* must be less than ZONENAME_MAX */
3528         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3529                 kmem_free(kname, ZONENAME_MAX);
3530                 return (EINVAL);
3531         }
3532 
3533         /*
3534          * Name must start with an alphanumeric and must contain only
3535          * alphanumerics, '-', '_' and '.'.
3536          */
3537         if (!isalnum(kname[0])) {
3538                 kmem_free(kname, ZONENAME_MAX);
3539                 return (EINVAL);
3540         }
3541         for (i = 1; i < len - 1; i++) {
3542                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3543                     kname[i] != '.') {
3544                         kmem_free(kname, ZONENAME_MAX);
3545                         return (EINVAL);
3546                 }
3547         }
3548 
3549         zone->zone_name = kname;
3550         return (0);
3551 }
3552 
3553 /*
3554  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3555  * is NULL or points to a zone with no hostid emulation, then the machine's
3556  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3557  * zero if neither the zone nor the host machine (global zone) has a hostid.
3558  * It returns HW_INVALID_HOSTID if it attempts to return the machine's hostid
3559  * and that hostid is invalid.
3560  */
3561 uint32_t
3562 zone_get_hostid(zone_t *zonep)
3563 {
3564         unsigned long machine_hostid;
3565 
3566         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3567                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3568                         return (HW_INVALID_HOSTID);
3569                 return ((uint32_t)machine_hostid);
3570         }
3571         return (zonep->zone_hostid);
3572 }
3573 
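     /*
      * For example (a sketch), a caller wanting the hostid visible to
      * the current process's zone might do:
      *
      *         uint32_t hostid = zone_get_hostid(curproc->p_zone);
      *
      *         if (hostid == HW_INVALID_HOSTID)
      *                 ... no valid hostid is available ...
      */
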
3574 /*
3575  * Similar to thread_create(), but makes sure the thread is in the appropriate
3576  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3577  */
3578 /*ARGSUSED*/
3579 kthread_t *
3580 zthread_create(
3581     caddr_t stk,
3582     size_t stksize,
3583     void (*proc)(),
3584     void *arg,
3585     size_t len,
3586     pri_t pri)
3587 {
3588         kthread_t *t;
3589         zone_t *zone = curproc->p_zone;
3590         proc_t *pp = zone->zone_zsched;
3591 
3592         zone_hold(zone);        /* Reference to be dropped when thread exits */
3593 
3594         /*
3595          * No-one should be trying to create threads if the zone is shutting
3596          * down and there aren't any kernel threads around.  See comment
3597          * in zthread_exit().
3598          */
3599         ASSERT(!(zone->zone_kthreads == NULL &&
3600             zone_status_get(zone) >= ZONE_IS_EMPTY));
3601         /*
3602          * Create a thread, but don't let it run until we've finished setting
3603          * things up.
3604          */
3605         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3606         ASSERT(t->t_forw == NULL);
3607         mutex_enter(&zone_status_lock);
3608         if (zone->zone_kthreads == NULL) {
3609                 t->t_forw = t->t_back = t;
3610         } else {
3611                 kthread_t *tx = zone->zone_kthreads;
3612 
3613                 t->t_forw = tx;
3614                 t->t_back = tx->t_back;
3615                 tx->t_back->t_forw = t;
3616                 tx->t_back = t;
3617         }
3618         zone->zone_kthreads = t;
3619         mutex_exit(&zone_status_lock);
3620 
3621         mutex_enter(&pp->p_lock);
3622         t->t_proc_flag |= TP_ZTHREAD;
3623         project_rele(t->t_proj);
3624         t->t_proj = project_hold(pp->p_task->tk_proj);
3625 
3626         /*
3627          * Setup complete, let it run.
3628          */
3629         thread_lock(t);
3630         t->t_schedflag |= TS_ALLSTART;
3631         setrun_locked(t);
3632         thread_unlock(t);
3633 
3634         mutex_exit(&pp->p_lock);
3635 
3636         return (t);
3637 }
3638 
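     /*
      * A sketch of the intended pairing with zthread_exit() below; the
      * worker function is hypothetical:
      *
      *         static void
      *         my_zone_worker(void *arg)
      *         {
      *                 ... perform per-zone work ...
      *                 zthread_exit();
      *         }
      *
      *         (void) zthread_create(NULL, 0, my_zone_worker, arg, 0,
      *             minclsyspri);
      */
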
3639 /*
3640  * Similar to thread_exit().  Must be called by threads created via
3641  * zthread_create().
3642  */
3643 void
3644 zthread_exit(void)
3645 {
3646         kthread_t *t = curthread;
3647         proc_t *pp = curproc;
3648         zone_t *zone = pp->p_zone;
3649 
3650         mutex_enter(&zone_status_lock);
3651 
3652         /*
3653          * Reparent to p0
3654          */
3655         kpreempt_disable();
3656         mutex_enter(&pp->p_lock);
3657         t->t_proc_flag &= ~TP_ZTHREAD;
3658         t->t_procp = &p0;
3659         hat_thread_exit(t);
3660         mutex_exit(&pp->p_lock);
3661         kpreempt_enable();
3662 
3663         if (t->t_back == t) {
3664                 ASSERT(t->t_forw == t);
3665                 /*
3666                  * If the zone is empty, once the thread count
3667                  * goes to zero no further kernel threads can be
3668                  * created.  This is because if the creator is a process
3669                  * in the zone, then it must have exited before the zone
3670                  * state could be set to ZONE_IS_EMPTY.
3671                  * Otherwise, if the creator is a kernel thread in the
3672                  * zone, the thread count is non-zero.
3673                  *
3674                  * This really means that non-zone kernel threads should
3675                  * not create zone kernel threads.
3676                  */
3677                 zone->zone_kthreads = NULL;
3678                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3679                         zone_status_set(zone, ZONE_IS_DOWN);
3680                         /*
3681                          * Remove any CPU caps on this zone.
3682                          */
3683                         cpucaps_zone_remove(zone);
3684                 }
3685         } else {
3686                 t->t_forw->t_back = t->t_back;
3687                 t->t_back->t_forw = t->t_forw;
3688                 if (zone->zone_kthreads == t)
3689                         zone->zone_kthreads = t->t_forw;
3690         }
3691         mutex_exit(&zone_status_lock);
3692         zone_rele(zone);
3693         thread_exit();
3694         /* NOTREACHED */
3695 }
3696 
3697 static void
3698 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3699 {
3700         vnode_t *oldvp;
3701 
3702         /* we're going to hold a reference here to the directory */
3703         VN_HOLD(vp);
3704 
3705         /* update abs cwd/root path; see c2/audit.c */
3706         if (AU_AUDITING())
3707                 audit_chdirec(vp, vpp);
3708 
3709         mutex_enter(&pp->p_lock);
3710         oldvp = *vpp;
3711         *vpp = vp;
3712         mutex_exit(&pp->p_lock);
3713         if (oldvp != NULL)
3714                 VN_RELE(oldvp);
3715 }
3716 
3717 /*
3718  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3719  */
3720 static int
3721 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3722 {
3723         nvpair_t *nvp = NULL;
3724         boolean_t priv_set = B_FALSE;
3725         boolean_t limit_set = B_FALSE;
3726         boolean_t action_set = B_FALSE;
3727 
3728         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3729                 const char *name;
3730                 uint64_t ui64;
3731 
3732                 name = nvpair_name(nvp);
3733                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3734                         return (EINVAL);
3735                 (void) nvpair_value_uint64(nvp, &ui64);
3736                 if (strcmp(name, "privilege") == 0) {
3737                         /*
3738                          * Currently only privileged values are allowed, but
3739                          * this may change in the future.
3740                          */
3741                         if (ui64 != RCPRIV_PRIVILEGED)
3742                                 return (EINVAL);
3743                         rv->rcv_privilege = ui64;
3744                         priv_set = B_TRUE;
3745                 } else if (strcmp(name, "limit") == 0) {
3746                         rv->rcv_value = ui64;
3747                         limit_set = B_TRUE;
3748                 } else if (strcmp(name, "action") == 0) {
3749                         if (ui64 != RCTL_LOCAL_NOACTION &&
3750                             ui64 != RCTL_LOCAL_DENY)
3751                                 return (EINVAL);
3752                         rv->rcv_flagaction = ui64;
3753                         action_set = B_TRUE;
3754                 } else {
3755                         return (EINVAL);
3756                 }
3757         }
3758 
3759         if (!(priv_set && limit_set && action_set))
3760                 return (EINVAL);
3761         rv->rcv_action_signal = 0;
3762         rv->rcv_action_recipient = NULL;
3763         rv->rcv_action_recip_pid = -1;
3764         rv->rcv_firing_time = 0;
3765 
3766         return (0);
3767 }
3768 
3769 /*
3770  * Non-global zone version of start_init.
3771  */
3772 void
3773 zone_start_init(void)
3774 {
3775         proc_t *p = ttoproc(curthread);
3776         zone_t *z = p->p_zone;
3777 
3778         ASSERT(!INGLOBALZONE(curproc));
3779 
3780         /*
3781          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3782          * storing just the pid of init is sufficient.
3783          */
3784         z->zone_proc_initpid = p->p_pid;
3785 
3786         /*
3787          * We maintain zone_boot_err so that we can return the cause of the
3788          * failure back to the caller of the zone_boot syscall.
3789          */
3790         p->p_zone->zone_boot_err = start_init_common();
3791 
3792         /*
3793          * We will prevent booting zones from becoming running zones if the
3794          * global zone is shutting down.
3795          */
3796         mutex_enter(&zone_status_lock);
3797         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3798             ZONE_IS_SHUTTING_DOWN) {
3799                 /*
3800                  * Make sure we are still in the booting state-- we could have
3801                  * raced and already be shutting down, or even further along.
3802                  */
3803                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3804                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3805                 }
3806                 mutex_exit(&zone_status_lock);
3807                 /* It's gone bad, dispose of the process */
3808                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3809                         mutex_enter(&p->p_lock);
3810                         ASSERT(p->p_flag & SEXITLWPS);
3811                         lwp_exit();
3812                 }
3813         } else {
3814                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3815                         zone_status_set(z, ZONE_IS_RUNNING);
3816                 mutex_exit(&zone_status_lock);
3817                 /* cause the process to return to userland. */
3818                 lwp_rtt();
3819         }
3820 }
3821 
3822 struct zsched_arg {
3823         zone_t *zone;
3824         nvlist_t *nvlist;
3825 };
3826 
3827 /*
3828  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3829  * anything to do with scheduling, but rather with the fact that
3830  * per-zone kernel threads are parented to zsched, just like regular
3831  * kernel threads are parented to sched (p0).
3832  *
3833  * zsched is also responsible for launching init for the zone.
3834  */
3835 static void
3836 zsched(void *arg)
3837 {
3838         struct zsched_arg *za = arg;
3839         proc_t *pp = curproc;
3840         proc_t *initp = proc_init;
3841         zone_t *zone = za->zone;
3842         cred_t *cr, *oldcred;
3843         rctl_set_t *set;
3844         rctl_alloc_gp_t *gp;
3845         contract_t *ct = NULL;
3846         task_t *tk, *oldtk;
3847         rctl_entity_p_t e;
3848         kproject_t *pj;
3849 
3850         nvlist_t *nvl = za->nvlist;
3851         nvpair_t *nvp = NULL;
3852 
3853         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3854         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3855         PTOU(pp)->u_argc = 0;
3856         PTOU(pp)->u_argv = 0;
3857         PTOU(pp)->u_envp = 0;
3858         PTOU(pp)->u_commpagep = 0;
3859         closeall(P_FINFO(pp));
3860 
3861         /*
3862          * We are this zone's "zsched" process.  As the zone isn't generally
3863          * visible yet we don't need to grab any locks before initializing its
3864          * zone_proc pointer.
3865          */
3866         zone_hold(zone);  /* this hold is released by zone_destroy() */
3867         zone->zone_zsched = pp;
3868         mutex_enter(&pp->p_lock);
3869         pp->p_zone = zone;
3870         mutex_exit(&pp->p_lock);
3871 
3872         /*
3873          * Disassociate process from its 'parent'; parent ourselves to init
3874          * (pid 1) and change other values as needed.
3875          */
3876         sess_create();
3877 
3878         mutex_enter(&pidlock);
3879         proc_detach(pp);
3880         pp->p_ppid = 1;
3881         pp->p_flag |= SZONETOP;
3882         pp->p_ancpid = 1;
3883         pp->p_parent = initp;
3884         pp->p_psibling = NULL;
3885         if (initp->p_child)
3886                 initp->p_child->p_psibling = pp;
3887         pp->p_sibling = initp->p_child;
3888         initp->p_child = pp;
3889 
3890         /* Decrement what newproc() incremented. */
3891         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3892         /*
3893          * Our credentials are about to become kcred-like, so we don't care
3894          * about the caller's ruid.
3895          */
3896         upcount_inc(crgetruid(kcred), zone->zone_id);
3897         mutex_exit(&pidlock);
3898 
3899         /*
3900          * getting out of global zone, so decrement lwp and process counts
3901          */
3902         pj = pp->p_task->tk_proj;
3903         mutex_enter(&global_zone->zone_nlwps_lock);
3904         pj->kpj_nlwps -= pp->p_lwpcnt;
3905         global_zone->zone_nlwps -= pp->p_lwpcnt;
3906         pj->kpj_nprocs--;
3907         global_zone->zone_nprocs--;
3908         mutex_exit(&global_zone->zone_nlwps_lock);
3909 
3910         /*
3911          * Decrement locked memory counts on old zone and project.
3912          */
3913         mutex_enter(&global_zone->zone_mem_lock);
3914         global_zone->zone_locked_mem -= pp->p_locked_mem;
3915         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3916         mutex_exit(&global_zone->zone_mem_lock);
3917 
3918         /*
3919          * Create and join a new task in project '0' of this zone.
3920          *
3921          * We don't need to call holdlwps() since we know we're the only lwp in
3922          * this process.
3923          *
3924          * task_join() returns with p_lock held.
3925          */
3926         tk = task_create(0, zone);
3927         mutex_enter(&cpu_lock);
3928         oldtk = task_join(tk, 0);
3929 
3930         pj = pp->p_task->tk_proj;
3931 
3932         mutex_enter(&zone->zone_mem_lock);
3933         zone->zone_locked_mem += pp->p_locked_mem;
3934         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3935         mutex_exit(&zone->zone_mem_lock);
3936 
3937         /*
3938          * add lwp and process counts to zsched's zone, and increment
3939          * project's task and process count due to the task created in
3940          * the above task_create.
3941          */
3942         mutex_enter(&zone->zone_nlwps_lock);
3943         pj->kpj_nlwps += pp->p_lwpcnt;
3944         pj->kpj_ntasks += 1;
3945         zone->zone_nlwps += pp->p_lwpcnt;
3946         pj->kpj_nprocs++;
3947         zone->zone_nprocs++;
3948         mutex_exit(&zone->zone_nlwps_lock);
3949 
3950         mutex_exit(&curproc->p_lock);
3951         mutex_exit(&cpu_lock);
3952         task_rele(oldtk);
3953 
3954         /*
3955          * The process was created by a process in the global zone, hence the
3956          * credentials are wrong.  We might as well have kcred-ish credentials.
3957          */
3958         cr = zone->zone_kcred;
3959         crhold(cr);
3960         mutex_enter(&pp->p_crlock);
3961         oldcred = pp->p_cred;
3962         pp->p_cred = cr;
3963         mutex_exit(&pp->p_crlock);
3964         crfree(oldcred);
3965 
3966         /*
3967          * Hold credentials again (for thread)
3968          */
3969         crhold(cr);
3970 
3971         /*
3972          * p_lwpcnt can't change since this is a kernel process.
3973          */
3974         crset(pp, cr);
3975 
3976         /*
3977          * Chroot
3978          */
3979         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3980         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3981 
3982         /*
3983          * Initialize zone's rctl set.
3984          */
3985         set = rctl_set_create();
3986         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3987         mutex_enter(&pp->p_lock);
3988         e.rcep_p.zone = zone;
3989         e.rcep_t = RCENTITY_ZONE;
3990         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3991         mutex_exit(&pp->p_lock);
3992         rctl_prealloc_destroy(gp);
3993 
3994         /*
3995          * Apply the rctls passed in to zone_create().  This is basically a list
3996          * assignment: all of the old values are removed and the new ones
3997          * inserted.  That is, if an empty list is passed in, all values are
3998          * removed.
3999          */
4000         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4001                 rctl_dict_entry_t *rde;
4002                 rctl_hndl_t hndl;
4003                 char *name;
4004                 nvlist_t **nvlarray;
4005                 uint_t i, nelem;
4006                 int error;      /* For ASSERT()s */
4007 
4008                 name = nvpair_name(nvp);
4009                 hndl = rctl_hndl_lookup(name);
4010                 ASSERT(hndl != -1);
4011                 rde = rctl_dict_lookup_hndl(hndl);
4012                 ASSERT(rde != NULL);
4013 
4014                 for (; /* ever */; ) {
4015                         rctl_val_t oval;
4016 
4017                         mutex_enter(&pp->p_lock);
4018                         error = rctl_local_get(hndl, NULL, &oval, pp);
4019                         mutex_exit(&pp->p_lock);
4020                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4021                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4022                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4023                                 break;
4024                         mutex_enter(&pp->p_lock);
4025                         error = rctl_local_delete(hndl, &oval, pp);
4026                         mutex_exit(&pp->p_lock);
4027                         ASSERT(error == 0);
4028                 }
4029                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4030                 ASSERT(error == 0);
4031                 for (i = 0; i < nelem; i++) {
4032                         rctl_val_t *nvalp;
4033 
4034                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4035                         error = nvlist2rctlval(nvlarray[i], nvalp);
4036                         ASSERT(error == 0);
4037                         /*
4038                          * rctl_local_insert can fail if the value being
4039                          * inserted is a duplicate; this is OK.
4040                          */
4041                         mutex_enter(&pp->p_lock);
4042                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4043                                 kmem_cache_free(rctl_val_cache, nvalp);
4044                         mutex_exit(&pp->p_lock);
4045                 }
4046         }
4047 
4048         /*
4049          * Tell the world that we're done setting up.
4050          *
4051          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4052          * and atomically set the zone's processor set visibility.  Once
4053          * we drop pool_lock() this zone will automatically get updated
4054          * to reflect any future changes to the pools configuration.
4055          *
4056          * Note that after we drop the locks below (zonehash_lock in
4057          * particular) other operations such as a zone_getattr call can
4058          * now proceed and observe the zone. That is the reason for doing a
4059          * state transition to the INITIALIZED state.
4060          */
4061         pool_lock();
4062         mutex_enter(&cpu_lock);
4063         mutex_enter(&zonehash_lock);
4064         zone_uniqid(zone);
4065         zone_zsd_configure(zone);
4066         if (pool_state == POOL_ENABLED)
4067                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4068         mutex_enter(&zone_status_lock);
4069         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4070         zone_status_set(zone, ZONE_IS_INITIALIZED);
4071         mutex_exit(&zone_status_lock);
4072         mutex_exit(&zonehash_lock);
4073         mutex_exit(&cpu_lock);
4074         pool_unlock();
4075 
4076         /* Now call the create callback for this key */
4077         zsd_apply_all_keys(zsd_apply_create, zone);
4078 
4079         /* The callbacks are complete. Mark ZONE_IS_READY */
4080         mutex_enter(&zone_status_lock);
4081         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4082         zone_status_set(zone, ZONE_IS_READY);
4083         mutex_exit(&zone_status_lock);
4084 
4085         /*
4086          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4087          * we launch init, and set the state to running.
4088          */
4089         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4090 
4091         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4092                 id_t cid;
4093 
4094                 /*
4095                  * Ok, this is a little complicated.  We need to grab the
4096                  * zone's pool's scheduling class ID; note that by now, we
4097                  * are already bound to a pool if we need to be (zoneadmd
4098                  * will have done that to us while we're in the READY
4099                  * state).  *But* the scheduling class for the zone's 'init'
4100                  * must be explicitly passed to newproc, which doesn't
4101                  * respect pool bindings.
4102                  *
4103                  * We hold the pool_lock across the call to newproc() to
4104                  * close the obvious race: the pool's scheduling class
4105                  * could change before we manage to create the LWP with
4106                  * classid 'cid'.
4107                  */
4108                 pool_lock();
4109                 if (zone->zone_defaultcid > 0)
4110                         cid = zone->zone_defaultcid;
4111                 else
4112                         cid = pool_get_class(zone->zone_pool);
4113                 if (cid == -1)
4114                         cid = defaultcid;
4115 
4116                 /*
4117                  * If this fails, zone_boot will ultimately fail.  The
4118                  * state of the zone will be set to SHUTTING_DOWN-- userland
4119                  * will have to tear down the zone, and fail, or try again.
4120                  */
4121                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4122                     minclsyspri - 1, &ct, 0)) != 0) {
4123                         mutex_enter(&zone_status_lock);
4124                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4125                         mutex_exit(&zone_status_lock);
4126                 } else {
4127                         zone->zone_boot_time = gethrestime_sec();
4128                 }
4129 
4130                 pool_unlock();
4131         }
4132 
4133         /*
4134          * Wait for zone_destroy() to be called.  This is what we spend
4135          * most of our life doing.
4136          */
4137         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4138 
4139         if (ct)
4140                 /*
4141                  * At this point the process contract should be empty.
4142                  * (Though if it isn't, it's not the end of the world.)
4143                  */
4144                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4145 
4146         /*
4147          * Allow kcred to be freed when all referring processes
4148          * (including this one) go away.  We can't just do this in
4149          * zone_free because we need to wait for the zone_cred_ref to
4150          * drop to 0 before calling zone_free, and the existence of
4151          * zone_kcred will prevent that.  Thus, we call crfree here to
4152          * balance the crdup in zone_create.  The crhold calls earlier
4153          * in zsched will be dropped when the thread and process exit.
4154          */
4155         crfree(zone->zone_kcred);
4156         zone->zone_kcred = NULL;
4157 
4158         exit(CLD_EXITED, 0);
4159 }
4160 
4161 /*
4162  * Helper function to determine if there are any submounts of the
4163  * provided path.  Used to make sure the zone doesn't "inherit" any
4164  * mounts from before it is created.
4165  */
4166 static uint_t
4167 zone_mount_count(const char *rootpath)
4168 {
4169         vfs_t *vfsp;
4170         uint_t count = 0;
4171         size_t rootpathlen = strlen(rootpath);
4172 
4173         /*
4174          * Holding zonehash_lock prevents race conditions with
4175          * vfs_list_add()/vfs_list_remove() since we serialize with
4176          * zone_find_by_path().
4177          */
4178         ASSERT(MUTEX_HELD(&zonehash_lock));
4179         /*
4180          * The rootpath must end with a '/'
4181          */
4182         ASSERT(rootpath[rootpathlen - 1] == '/');
4183 
4184         /*
4185          * This intentionally does not count the rootpath itself if that
4186          * happens to be a mount point.
4187          */
4188         vfs_list_read_lock();
4189         vfsp = rootvfs;
4190         do {
4191                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4192                     rootpathlen) == 0)
4193                         count++;
4194                 vfsp = vfsp->vfs_next;
4195         } while (vfsp != rootvfs);
4196         vfs_list_unlock();
4197         return (count);
4198 }
4199 
4200 /*
4201  * Helper function to make sure that a zone created on 'rootpath'
4202  * wouldn't end up containing other zones' rootpaths.
4203  */
4204 static boolean_t
4205 zone_is_nested(const char *rootpath)
4206 {
4207         zone_t *zone;
4208         size_t rootpathlen = strlen(rootpath);
4209         size_t len;
4210 
4211         ASSERT(MUTEX_HELD(&zonehash_lock));
4212 
4213         /*
4214          * zone_set_root() appended '/' and '\0' at the end of rootpath
4215          */
4216         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4217             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4218                 return (B_TRUE);
4219 
4220         for (zone = list_head(&zone_active); zone != NULL;
4221             zone = list_next(&zone_active, zone)) {
4222                 if (zone == global_zone)
4223                         continue;
4224                 len = strlen(zone->zone_rootpath);
4225                 if (strncmp(rootpath, zone->zone_rootpath,
4226                     MIN(rootpathlen, len)) == 0)
4227                         return (B_TRUE);
4228         }
4229         return (B_FALSE);
4230 }
4231 
4232 static int
4233 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4234     size_t zone_privssz)
4235 {
4236         priv_set_t *privs;
4237 
4238         if (zone_privssz < sizeof (priv_set_t))
4239                 return (ENOMEM);
4240 
4241         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4242 
4243         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4244                 kmem_free(privs, sizeof (priv_set_t));
4245                 return (EFAULT);
4246         }
4247 
4248         zone->zone_privset = privs;
4249         return (0);
4250 }
4251 
4252 /*
4253  * We make creative use of nvlists to pass in rctls from userland.  The list is
4254  * a list of the following structures:
4255  *
4256  * (name = rctl_name, value = nvpair_list_array)
4257  *
4258  * Where each element of the nvpair_list_array is of the form:
4259  *
4260  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4261  *      (name = "limit", value = uint64_t),
4262  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4263  */
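     /*
      * As a non-authoritative sketch, a list containing a single
      * "zone.max-lwps" value could be built and packed with libnvpair
      * roughly as follows (buf and buflen are hypothetical locals):
      *
      *         nvlist_t *nvl, *rv;
      *         char *buf = NULL;
      *         size_t buflen = 0;
      *
      *         (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
      *         (void) nvlist_alloc(&rv, NV_UNIQUE_NAME, 0);
      *         (void) nvlist_add_uint64(rv, "privilege", RCPRIV_PRIVILEGED);
      *         (void) nvlist_add_uint64(rv, "limit", 1000);
      *         (void) nvlist_add_uint64(rv, "action", RCTL_LOCAL_DENY);
      *         (void) nvlist_add_nvlist_array(nvl, "zone.max-lwps", &rv, 1);
      *         (void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
      */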
4264 static int
4265 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4266 {
4267         nvpair_t *nvp = NULL;
4268         nvlist_t *nvl = NULL;
4269         char *kbuf;
4270         int error;
4271         rctl_val_t rv;
4272 
4273         *nvlp = NULL;
4274 
4275         if (buflen == 0)
4276                 return (0);
4277 
4278         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4279                 return (ENOMEM);
4280         if (copyin(ubuf, kbuf, buflen)) {
4281                 error = EFAULT;
4282                 goto out;
4283         }
4284         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4285                 /*
4286                  * nvlist_unpack() may have allocated and freed nvl while
4287                  * leaving the pointer set to non-NULL, so we reset it here.
4288                  */
4289                 nvl = NULL;
4290                 error = EINVAL;
4291                 goto out;
4292         }
4293         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4294                 rctl_dict_entry_t *rde;
4295                 rctl_hndl_t hndl;
4296                 nvlist_t **nvlarray;
4297                 uint_t i, nelem;
4298                 char *name;
4299 
4300                 error = EINVAL;
4301                 name = nvpair_name(nvp);
4302                 if (strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 ||
4303                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4304                         goto out;
4305                 }
4306                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4307                         goto out;
4308                 }
4309                 rde = rctl_dict_lookup_hndl(hndl);
4310                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4311                 ASSERT(error == 0);
4312                 for (i = 0; i < nelem; i++) {
4313                         if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
4314                                 goto out;
4315                 }
4316                 if (rctl_invalid_value(rde, &rv)) {
4317                         error = EINVAL;
4318                         goto out;
4319                 }
4320         }
4321         error = 0;
4322         *nvlp = nvl;
4323 out:
4324         kmem_free(kbuf, buflen);
4325         if (error && nvl != NULL)
4326                 nvlist_free(nvl);
4327         return (error);
4328 }
4329 
4330 int
4331 zone_create_error(int er_error, int er_ext, int *er_out)
4332 {
4333         if (er_out != NULL) {
4334                 if (copyout(&er_ext, er_out, sizeof (int))) {
4335                         return (set_errno(EFAULT));
4336                 }
4337         }
4338         return (set_errno(er_error));
4339 }
4340 
4341 static int
4342 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4343 {
4344         ts_label_t *tsl;
4345         bslabel_t blab;
4346 
4347         /* Get label from user */
4348         if (copyin(lab, &blab, sizeof (blab)) != 0)
4349                 return (EFAULT);
4350         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4351         if (tsl == NULL)
4352                 return (ENOMEM);
4353 
4354         zone->zone_slabel = tsl;
4355         return (0);
4356 }
4357 
4358 /*
4359  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4360  */
4361 static int
4362 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4363 {
4364         char *kbuf;
4365         char *dataset, *next;
4366         zone_dataset_t *zd;
4367         size_t len;
4368 
4369         if (ubuf == NULL || buflen == 0)
4370                 return (0);
4371 
4372         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4373                 return (ENOMEM);
4374 
4375         if (copyin(ubuf, kbuf, buflen) != 0) {
4376                 kmem_free(kbuf, buflen);
4377                 return (EFAULT);
4378         }
4379 
4380         dataset = next = kbuf;
4381         for (;;) {
4382                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4383 
4384                 next = strchr(dataset, ',');
4385 
4386                 if (next == NULL)
4387                         len = strlen(dataset);
4388                 else
4389                         len = next - dataset;
4390 
4391                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4392                 bcopy(dataset, zd->zd_dataset, len);
4393                 zd->zd_dataset[len] = '\0';
4394 
4395                 list_insert_head(&zone->zone_datasets, zd);
4396 
4397                 if (next == NULL)
4398                         break;
4399 
4400                 dataset = next + 1;
4401         }
4402 
4403         kmem_free(kbuf, buflen);
4404         return (0);
4405 }
4406 
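     /*
      * For example, passing parse_zfs() a (hypothetical) buffer containing
      * "tank/zones/foo,tank/shared" would yield two zone_dataset_t
      * entries, "tank/zones/foo" and "tank/shared", on zone_datasets.
      */
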
4407 /*
4408  * System call to create/initialize a new zone named 'zone_name', rooted
4409  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4410  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4411  * with labeling set by 'match', 'doi', and 'label'.
4412  *
4413  * If extended error is non-null, we may use it to return more detailed
4414  * error information.
4415  */
4416 static zoneid_t
4417 zone_create(const char *zone_name, const char *zone_root,
4418     const priv_set_t *zone_privs, size_t zone_privssz,
4419     caddr_t rctlbuf, size_t rctlbufsz,
4420     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4421     int match, uint32_t doi, const bslabel_t *label,
4422     int flags)
4423 {
4424         struct zsched_arg zarg;
4425         nvlist_t *rctls = NULL;
4426         proc_t *pp = curproc;
4427         zone_t *zone, *ztmp;
4428         zoneid_t zoneid, start = GLOBAL_ZONEID;
4429         int error;
4430         int error2 = 0;
4431         char *str;
4432         cred_t *zkcr;
4433         boolean_t insert_label_hash;
4434 
4435         if (secpolicy_zone_config(CRED()) != 0)
4436                 return (set_errno(EPERM));
4437 
4438         /* can't boot zone from within chroot environment */
4439         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4440                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4441                     extended_error));
4442         /*
4443          * As the first step of zone creation, we want to allocate a zoneid.
4444          * This allocation is complicated by the fact that netstacks use the
4445          * zoneid to determine their stackid, but netstacks themselves are
4446          * freed asynchronously with respect to zone destruction.  This means
4447          * that a netstack reference leak (or in principle, an extraordinarily
4448          * long netstack reference hold) could result in a zoneid being
4449          * allocated that in fact corresponds to a stackid from an active
4450          * (referenced) netstack -- unleashing all sorts of havoc when that
4451          * netstack is actually (re)used.  (In the abstract, we might wish a
4452          * zoneid to not be deallocated until its last referencing netstack
4453          * has been released, but netstacks lack a backpointer into their
4454          * referencing zone -- and changing them to have such a pointer would
4455          * be substantial, to put it euphemistically.)  To avoid this, we
4456          * detect this condition on allocation: if we have allocated a zoneid
4457          * that corresponds to a netstack that's still in use, we warn about
4458          * it (as it is much more likely to be a reference leak than an actual
4459          * netstack reference), free it, and allocate another.  That these
4460          * identifiers are allocated out of an ID space assures that we won't
4461          * see the identifier we just allocated.
4462          */
4463         for (;;) {
4464                 zoneid = id_alloc(zoneid_space);
4465 
4466                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4467                         break;
4468 
4469                 id_free(zoneid_space, zoneid);
4470 
4471                 if (start == GLOBAL_ZONEID) {
4472                         start = zoneid;
4473                 } else if (zoneid == start) {
4474                         /*
4475                          * We have managed to iterate over the entire available
4476                          * zoneid space -- there are no identifiers available,
4477                          * presumably due to some number of leaked netstack
4478                          * references.  While it's in principle possible for us
4479                          * to continue to try, it seems wiser to give up at
4480                          * this point, warn, and fail explicitly with a
4481                          * distinctive error.
4482                          */
4483                         cmn_err(CE_WARN, "zone_create() failed: all available "
4484                             "zone IDs have netstacks still in use");
4485                         return (set_errno(ENFILE));
4486                 }
4487 
4488                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4489                     "netstack still in use", zoneid);
4490         }
4491 
4492         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4493         zone->zone_id = zoneid;
4494         zone->zone_status = ZONE_IS_UNINITIALIZED;
4495         zone->zone_pool = pool_default;
4496         zone->zone_pool_mod = gethrtime();
4497         zone->zone_psetid = ZONE_PS_INVAL;
4498         zone->zone_ncpus = 0;
4499         zone->zone_ncpus_online = 0;
4500         zone->zone_restart_init = B_TRUE;
4501         zone->zone_brand = &native_brand;
4502         zone->zone_initname = NULL;
4503         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4504         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4505         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4506         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4507         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4508             offsetof(zone_ref_t, zref_linkage));
4509         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4510             offsetof(struct zsd_entry, zsd_linkage));
4511         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4512             offsetof(zone_dataset_t, zd_linkage));
4513         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4514             offsetof(zone_dl_t, zdl_linkage));
4515         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4516         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4517 
4518         if (flags & ZCF_NET_EXCL) {
4519                 zone->zone_flags |= ZF_NET_EXCL;
4520         }
4521 
4522         if ((error = zone_set_name(zone, zone_name)) != 0) {
4523                 zone_free(zone);
4524                 return (zone_create_error(error, 0, extended_error));
4525         }
4526 
4527         if ((error = zone_set_root(zone, zone_root)) != 0) {
4528                 zone_free(zone);
4529                 return (zone_create_error(error, 0, extended_error));
4530         }
4531         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4532                 zone_free(zone);
4533                 return (zone_create_error(error, 0, extended_error));
4534         }
4535 
4536         /* initialize node name to be the same as zone name */
4537         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4538         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4539         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4540 
4541         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4542         zone->zone_domain[0] = '\0';
4543         zone->zone_hostid = HW_INVALID_HOSTID;
4544         zone->zone_shares = 1;
4545         zone->zone_shmmax = 0;
4546         zone->zone_ipc.ipcq_shmmni = 0;
4547         zone->zone_ipc.ipcq_semmni = 0;
4548         zone->zone_ipc.ipcq_msgmni = 0;
4549         zone->zone_bootargs = NULL;
4550         zone->zone_fs_allowed = NULL;
4551 
4552         psecflags_default(&zone->zone_secflags);
4553 
4554         zone->zone_initname =
4555             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4556         (void) strcpy(zone->zone_initname, zone_default_initname);
4557         zone->zone_nlwps = 0;
4558         zone->zone_nlwps_ctl = INT_MAX;
4559         zone->zone_nprocs = 0;
4560         zone->zone_nprocs_ctl = INT_MAX;
4561         zone->zone_locked_mem = 0;
4562         zone->zone_locked_mem_ctl = UINT64_MAX;
4563         zone->zone_max_swap = 0;
4564         zone->zone_max_swap_ctl = UINT64_MAX;
4565         zone->zone_max_lofi = 0;
4566         zone->zone_max_lofi_ctl = UINT64_MAX;
4567         zone->zone_lockedmem_kstat = NULL;
4568         zone->zone_swapresv_kstat = NULL;
4569 
4570         zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
4571 
4572         /*
4573          * Zsched initializes the rctls.
4574          */
4575         zone->zone_rctls = NULL;
4576 
4577         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4578                 zone_free(zone);
4579                 return (zone_create_error(error, 0, extended_error));
4580         }
4581 
4582         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4583                 zone_free(zone);
4584                 return (set_errno(error));
4585         }
4586 
4587         /*
4588          * Read in the trusted system parameters:
4589          * match flag and sensitivity label.
4590          */
4591         zone->zone_match = match;
4592         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4593                 /* Fail if requested to set doi to anything but system's doi */
4594                 if (doi != 0 && doi != default_doi) {
4595                         zone_free(zone);
4596                         return (set_errno(EINVAL));
4597                 }
4598                 /* Always apply system's doi to the zone */
4599                 error = zone_set_label(zone, label, default_doi);
4600                 if (error != 0) {
4601                         zone_free(zone);
4602                         return (set_errno(error));
4603                 }
4604                 insert_label_hash = B_TRUE;
4605         } else {
4606                 /* all zones get an admin_low label if system is not labeled */
4607                 zone->zone_slabel = l_admin_low;
4608                 label_hold(l_admin_low);
4609                 insert_label_hash = B_FALSE;
4610         }
4611 
4612         /*
4613          * Stop all lwps since that's what normally happens as part of fork().
4614          * This needs to happen before we grab any locks to avoid deadlock
4615          * (another lwp in the process could be waiting for the held lock).
4616          */
4617         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4618                 zone_free(zone);
4619                 nvlist_free(rctls);
4620                 return (zone_create_error(EINTR, 0, extended_error));
4621         }
4622 
4623         if (block_mounts(zone) == 0) {
4624                 mutex_enter(&pp->p_lock);
4625                 if (curthread != pp->p_agenttp)
4626                         continuelwps(pp);
4627                 mutex_exit(&pp->p_lock);
4628                 zone_free(zone);
4629                 nvlist_free(rctls);
4630                 return (zone_create_error(EINTR, 0, extended_error));
4631         }
4632 
4633         /*
4634          * Set up credential for kernel access.  After this, any errors
4635          * should go through the dance in errout rather than calling
4636          * zone_free directly.
4637          */
4638         zone->zone_kcred = crdup(kcred);
4639         crsetzone(zone->zone_kcred, zone);
4640         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4641         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4642         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4643         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4644 
4645         mutex_enter(&zonehash_lock);
4646         /*
4647          * Make sure zone doesn't already exist.
4648          *
4649          * If the system and zone are labeled,
4650          * make sure no other zone exists that has the same label.
4651          */
4652         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4653             (insert_label_hash &&
4654             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4655                 zone_status_t status;
4656 
4657                 status = zone_status_get(ztmp);
4658                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4659                         error = EEXIST;
4660                 else
4661                         error = EBUSY;
4662 
4663                 if (insert_label_hash)
4664                         error2 = ZE_LABELINUSE;
4665 
4666                 goto errout;
4667         }
4668 
4669         /*
4670          * Don't allow zone creations which would cause one zone's rootpath to
4671          * be accessible from that of another (non-global) zone.
4672          */
4673         if (zone_is_nested(zone->zone_rootpath)) {
4674                 error = EBUSY;
4675                 goto errout;
4676         }
4677 
4678         ASSERT(zonecount != 0);         /* check for leaks */
4679         if (zonecount + 1 > maxzones) {
4680                 error = ENOMEM;
4681                 goto errout;
4682         }
4683 
4684         if (zone_mount_count(zone->zone_rootpath) != 0) {
4685                 error = EBUSY;
4686                 error2 = ZE_AREMOUNTS;
4687                 goto errout;
4688         }
4689 
4690         /*
4691          * Zone is still incomplete, but we need to drop all locks while
4692          * zsched() initializes this zone's kernel process.  We
4693          * optimistically add the zone to the hashtable and associated
4694          * lists so a parallel zone_create() doesn't try to create the
4695          * same zone.
4696          */
4697         zonecount++;
4698         (void) mod_hash_insert(zonehashbyid,
4699             (mod_hash_key_t)(uintptr_t)zone->zone_id,
4700             (mod_hash_val_t)(uintptr_t)zone);
4701         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4702         (void) strcpy(str, zone->zone_name);
4703         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4704             (mod_hash_val_t)(uintptr_t)zone);
4705         if (insert_label_hash) {
4706                 (void) mod_hash_insert(zonehashbylabel,
4707                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4708                 zone->zone_flags |= ZF_HASHED_LABEL;
4709         }
4710 
4711         /*
4712          * Insert into active list.  At this point there are no 'hold's
4713          * on the zone, but everyone else knows not to use it, so we can
4714          * continue to use it.  zsched() will do a zone_hold() if the
4715          * newproc() is successful.
4716          */
4717         list_insert_tail(&zone_active, zone);
4718         mutex_exit(&zonehash_lock);
4719 
4720         zarg.zone = zone;
4721         zarg.nvlist = rctls;
4722         /*
4723          * The process, task, and project rctls are probably wrong;
4724          * we need an interface to get the default values of all rctls,
4725          * and initialize zsched appropriately.  I'm not sure that that
4726          * makes much of a difference, though.
4727          */
4728         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4729         if (error != 0) {
4730                 /*
4731                  * We need to undo all globally visible state.
4732                  */
4733                 mutex_enter(&zonehash_lock);
4734                 list_remove(&zone_active, zone);
4735                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4736                         ASSERT(zone->zone_slabel != NULL);
4737                         (void) mod_hash_destroy(zonehashbylabel,
4738                             (mod_hash_key_t)zone->zone_slabel);
4739                 }
4740                 (void) mod_hash_destroy(zonehashbyname,
4741                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4742                 (void) mod_hash_destroy(zonehashbyid,
4743                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4744                 ASSERT(zonecount > 1);
4745                 zonecount--;
4746                 goto errout;
4747         }
4748 
4749         /*
4750          * Zone creation can't fail from now on.
4751          */
4752 
4753         /*
4754          * Create zone kstats
4755          */
4756         zone_kstat_create(zone);
4757 
4758         /*
4759          * Let the other lwps continue.
4760          */
4761         mutex_enter(&pp->p_lock);
4762         if (curthread != pp->p_agenttp)
4763                 continuelwps(pp);
4764         mutex_exit(&pp->p_lock);
4765 
4766         /*
4767          * Wait for zsched to finish initializing the zone.
4768          */
4769         zone_status_wait(zone, ZONE_IS_READY);
4770         /*
4771          * The zone is fully visible, so we can let mounts progress.
4772          */
4773         resume_mounts(zone);
4774         nvlist_free(rctls);
4775 
4776         return (zoneid);
4777 
4778 errout:
4779         mutex_exit(&zonehash_lock);
4780         /*
4781          * Let the other lwps continue.
4782          */
4783         mutex_enter(&pp->p_lock);
4784         if (curthread != pp->p_agenttp)
4785                 continuelwps(pp);
4786         mutex_exit(&pp->p_lock);
4787 
4788         resume_mounts(zone);
4789         nvlist_free(rctls);
4790         /*
4791          * There is currently one reference to the zone, a cred_ref from
4792          * zone_kcred.  To free the zone, we call crfree, which will call
4793          * zone_cred_rele, which will call zone_free.
4794          */
4795         ASSERT(zone->zone_cred_ref == 1);
4796         ASSERT(zone->zone_kcred->cr_ref == 1);
4797         ASSERT(zone->zone_ref == 0);
4798         zkcr = zone->zone_kcred;
4799         zone->zone_kcred = NULL;
4800         crfree(zkcr);                           /* triggers call to zone_free */
4801         return (zone_create_error(error, error2, extended_error));
4802 }
4803 
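/*
 * Illustrative sketch (editorial, not compiled): a caller reaching
 * zone_create() above -- in practice the zone(2) syscall acting on
 * behalf of zoneadmd -- might create a minimal unlabeled zone as
 * follows.  The zone name, root path, and privilege set are
 * hypothetical and error handling is elided:
 *
 *	zoneid_t zid;
 *	int xerr;
 *
 *	zid = zone_create("myzone", "/zones/myzone/root",
 *	    privs, privssz,	(caller-assembled priv_set_t)
 *	    NULL, 0,		(no zone-wide rctls)
 *	    NULL, 0,		(no delegated ZFS datasets)
 *	    &xerr,		(extended error, cf. zone_create_error())
 *	    0, 0, NULL,		(unlabeled: match/doi/label unused)
 *	    0);			(no ZCF_* flags)
 *
 * On success the zone has reached ZONE_IS_READY and awaits zone_boot().
 */
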
4804 /*
4805  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4806  * the heavy lifting.  The path of the program launched at the "top" of
4807  * the zone comes from its ZONE_ATTR_INITNAME attribute; if that was never
4808  * set, the system default stored at zone_default_initname is used.
4809  */
4810 static int
4811 zone_boot(zoneid_t zoneid)
4812 {
4813         int err;
4814         zone_t *zone;
4815 
4816         if (secpolicy_zone_config(CRED()) != 0)
4817                 return (set_errno(EPERM));
4818         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4819                 return (set_errno(EINVAL));
4820 
4821         mutex_enter(&zonehash_lock);
4822         /*
4823          * Look for zone under hash lock to prevent races with calls to
4824          * zone_shutdown, zone_destroy, etc.
4825          */
4826         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4827                 mutex_exit(&zonehash_lock);
4828                 return (set_errno(EINVAL));
4829         }
4830 
4831         mutex_enter(&zone_status_lock);
4832         if (zone_status_get(zone) != ZONE_IS_READY) {
4833                 mutex_exit(&zone_status_lock);
4834                 mutex_exit(&zonehash_lock);
4835                 return (set_errno(EINVAL));
4836         }
4837         zone_status_set(zone, ZONE_IS_BOOTING);
4838         mutex_exit(&zone_status_lock);
4839 
4840         zone_hold(zone);        /* so we can use the zone_t later */
4841         mutex_exit(&zonehash_lock);
4842 
4843         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4844                 zone_rele(zone);
4845                 return (set_errno(EINTR));
4846         }
4847 
4848         /*
4849          * Boot (starting init) might have failed, in which case the zone
4850          * will go to the SHUTTING_DOWN state; an appropriate errno will
4851          * be placed in zone->zone_boot_err, and so we return that.
4852          */
4853         err = zone->zone_boot_err;
4854         zone_rele(zone);
4855         return (err ? set_errno(err) : 0);
4856 }
4857 
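/*
 * Illustrative sketch (editorial, not compiled): a hypothetical caller
 * pairing zone_create() with zone_boot().  zone_boot() blocks
 * interruptibly until zsched reports ZONE_IS_RUNNING:
 *
 *	if ((zid = zone_create(...)) == -1)
 *		return (-1);		(errno already set)
 *	if (zone_boot(zid) != 0 && errno != EINTR)
 *		return (-1);		(errno came from zone_boot_err)
 *
 * An EINTR return only means the wait was interrupted; the zone may
 * still finish booting (or fail) on its own.
 */
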
4858 /*
4859  * Kills all user processes in the zone, waiting for them all to exit
4860  * before returning.
4861  */
4862 static int
4863 zone_empty(zone_t *zone)
4864 {
4865         int waitstatus;
4866 
4867         /*
4868          * We need to drop zonehash_lock before killing all
4869          * processes, otherwise we'll deadlock with zone_find_*
4870          * which can be called from the exit path.
4871          */
4872         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4873         while ((waitstatus = zone_status_timedwait_sig(zone,
4874             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4875                 killall(zone->zone_id);
4876         }
4877         /*
4878          * return EINTR if we were signaled
4879          */
4880         if (waitstatus == 0)
4881                 return (EINTR);
4882         return (0);
4883 }
4884 
4885 /*
4886  * This function implements the policy for zone visibility.
4887  *
4888  * In standard Solaris, a non-global zone can only see itself.
4889  *
4890  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4891  * it dominates. For this test, the label of the global zone is treated as
4892  * admin_high so it is special-cased instead of being checked for dominance.
4893  *
4894  * Returns true if zone attributes are viewable, false otherwise.
4895  */
4896 static boolean_t
4897 zone_list_access(zone_t *zone)
4898 {
4900         if (curproc->p_zone == global_zone ||
4901             curproc->p_zone == zone) {
4902                 return (B_TRUE);
4903         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4904                 bslabel_t *curproc_label;
4905                 bslabel_t *zone_label;
4906 
4907                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4908                 zone_label = label2bslabel(zone->zone_slabel);
4909 
4910                 if (zone->zone_id != GLOBAL_ZONEID &&
4911                     bldominates(curproc_label, zone_label)) {
4912                         return (B_TRUE);
4913                 } else {
4914                         return (B_FALSE);
4915                 }
4916         } else {
4917                 return (B_FALSE);
4918         }
4919 }
4920 
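/*
 * Worked example with hypothetical labels: under Trusted Extensions, a
 * process in a zone labeled "CONFIDENTIAL INTERNAL" may look up a zone
 * labeled "CONFIDENTIAL", since bldominates() holds in that direction,
 * but the "CONFIDENTIAL" zone cannot see its more sensitive peer.
 * Neither gains access to the global zone via the dominance test (note
 * the GLOBAL_ZONEID exclusion above), while a process in the global
 * zone short-circuits to B_TRUE for every zone.
 */
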
4921 /*
4922  * System call to start the zone's halt sequence.  By the time this
4923  * function successfully returns, all user processes and kernel threads
4924  * executing in it will have exited, ZSD shutdown callbacks executed,
4925  * and the zone status set to ZONE_IS_DOWN.
4926  *
4927  * It is possible that the call will interrupt itself if the caller is the
4928  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4929  */
4930 static int
4931 zone_shutdown(zoneid_t zoneid)
4932 {
4933         int error;
4934         zone_t *zone;
4935         zone_status_t status;
4936 
4937         if (secpolicy_zone_config(CRED()) != 0)
4938                 return (set_errno(EPERM));
4939         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4940                 return (set_errno(EINVAL));
4941 
4942         mutex_enter(&zonehash_lock);
4943         /*
4944          * Look for zone under hash lock to prevent races with other
4945          * calls to zone_shutdown and zone_destroy.
4946          */
4947         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4948                 mutex_exit(&zonehash_lock);
4949                 return (set_errno(EINVAL));
4950         }
4951 
4952         /*
4953          * We have to drop zonehash_lock before calling block_mounts.
4954          * Hold the zone so we can continue to use the zone_t.
4955          */
4956         zone_hold(zone);
4957         mutex_exit(&zonehash_lock);
4958 
4959         /*
4960          * Block mounts so that VFS_MOUNT() can get an accurate view of
4961          * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4962          *
4963          * e.g. NFS can fail the mount if it determines that the zone
4964          * has already begun the shutdown sequence.
4965          */
4967         if (block_mounts(zone) == 0) {
4968                 zone_rele(zone);
4969                 return (set_errno(EINTR));
4970         }
4971 
4972         mutex_enter(&zonehash_lock);
4973         mutex_enter(&zone_status_lock);
4974         status = zone_status_get(zone);
4975         /*
4976          * Fail if the zone isn't fully initialized yet.
4977          */
4978         if (status < ZONE_IS_READY) {
4979                 mutex_exit(&zone_status_lock);
4980                 mutex_exit(&zonehash_lock);
4981                 resume_mounts(zone);
4982                 zone_rele(zone);
4983                 return (set_errno(EINVAL));
4984         }
4985         /*
4986          * If conditions required for zone_shutdown() to return have been met,
4987          * return success.
4988          */
4989         if (status >= ZONE_IS_DOWN) {
4990                 mutex_exit(&zone_status_lock);
4991                 mutex_exit(&zonehash_lock);
4992                 resume_mounts(zone);
4993                 zone_rele(zone);
4994                 return (0);
4995         }
4996         /*
4997          * If zone_shutdown() hasn't been called before, go through the motions.
4998          * If it has, there's nothing to do but wait for the kernel threads to
4999          * drain.
5000          */
5001         if (status < ZONE_IS_EMPTY) {
5002                 uint_t ntasks;
5003 
5004                 mutex_enter(&zone->zone_lock);
5005                 if ((ntasks = zone->zone_ntasks) != 1) {
5006                         /*
5007                          * There's still stuff running.
5008                          */
5009                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5010                 }
5011                 mutex_exit(&zone->zone_lock);
5012                 if (ntasks == 1) {
5013                         /*
5014                          * The only way to create another task is through
5015                          * zone_enter(), which will block until we drop
5016                          * zonehash_lock.  The zone is empty.
5017                          */
5018                         if (zone->zone_kthreads == NULL) {
5019                                 /*
5020                                  * Skip ahead to ZONE_IS_DOWN
5021                                  */
5022                                 zone_status_set(zone, ZONE_IS_DOWN);
5023                         } else {
5024                                 zone_status_set(zone, ZONE_IS_EMPTY);
5025                         }
5026                 }
5027         }
5028         mutex_exit(&zone_status_lock);
5029         mutex_exit(&zonehash_lock);
5030         resume_mounts(zone);
5031 
5032         if ((error = zone_empty(zone)) != 0) {
5033                 zone_rele(zone);
5034                 return (set_errno(error));
5035         }
5036         /*
5037          * After the zone status goes to ZONE_IS_DOWN this zone will no
5038          * longer be notified of changes to the pools configuration, so
5039          * in order to not end up with a stale pool pointer, we point
5040          * ourselves at the default pool and remove all resource
5041          * visibility.  This is especially important as the zone_t may
5042          * languish on the deathrow for a very long time waiting for
5043          * creds to drain out.
5044          *
5045          * This rebinding of the zone can happen multiple times
5046          * (presumably due to interrupted or parallel system calls)
5047          * without any adverse effects.
5048          */
5049         if (pool_lock_intr() != 0) {
5050                 zone_rele(zone);
5051                 return (set_errno(EINTR));
5052         }
5053         if (pool_state == POOL_ENABLED) {
5054                 mutex_enter(&cpu_lock);
5055                 zone_pool_set(zone, pool_default);
5056                 /*
5057                  * The zone no longer needs to be able to see any cpus.
5058                  */
5059                 zone_pset_set(zone, ZONE_PS_INVAL);
5060                 mutex_exit(&cpu_lock);
5061         }
5062         pool_unlock();
5063 
5064         /*
5065          * ZSD shutdown callbacks can be executed multiple times, hence
5066          * it is safe to not be holding any locks across this call.
5067          */
5068         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5069 
5070         mutex_enter(&zone_status_lock);
5071         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5072                 zone_status_set(zone, ZONE_IS_DOWN);
5073         mutex_exit(&zone_status_lock);
5074 
5075         /*
5076          * Wait for kernel threads to drain.
5077          */
5078         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5079                 zone_rele(zone);
5080                 return (set_errno(EINTR));
5081         }
5082 
5083         /*
5084          * The zone can become down/destroyable even if the above wait
5085          * returns EINTR, so any code added here may never execute.
5086          * (i.e. don't add code here)
5087          */
5088 
5089         zone_rele(zone);
5090         return (0);
5091 }
5092 
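/*
 * Illustrative sketch (editorial, not compiled): per the caveat above,
 * a caller that parents processes inside the zone should block SIGCHLD
 * (or be ready to retry on EINTR) around the halt sequence.  A
 * hypothetical userland wrapper:
 *
 *	sigset_t set, oset;
 *
 *	(void) sigemptyset(&set);
 *	(void) sigaddset(&set, SIGCHLD);
 *	(void) sigprocmask(SIG_BLOCK, &set, &oset);
 *	while (zone_shutdown(zid) != 0 && errno == EINTR)
 *		;		(dying children would otherwise interrupt us)
 *	(void) sigprocmask(SIG_SETMASK, &oset, NULL);
 */
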
5093 /*
5094  * Log the specified zone's reference counts.  The caller should not be
5095  * holding the zone's zone_lock.
5096  */
5097 static void
5098 zone_log_refcounts(zone_t *zone)
5099 {
5100         char *buffer;
5101         char *buffer_position;
5102         uint32_t buffer_size;
5103         uint32_t index;
5104         uint_t ref;
5105         uint_t cred_ref;
5106 
5107         /*
5108          * Construct a string representing the subsystem-specific reference
5109          * counts.  The counts are printed in ascending order by index into the
5110          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5111          * square brackets [] and will only contain nonzero reference counts.
5112          *
5113          * The buffer will hold two square bracket characters plus ten digits,
5114          * one colon, one space, one comma, and some characters for a
5115          * subsystem name per subsystem-specific reference count.  (Unsigned
5116          * 32-bit integers have at most ten decimal digits.)  The last
5117          * reference count's comma is replaced by the closing square
5118          * bracket and a NUL character to terminate the string.
5119          *
5120          * NOTE: We have to grab the zone's zone_lock to create a consistent
5121          * snapshot of the zone's reference counters.
5122          *
5123          * First, figure out how much space the string buffer will need.
5124          * The buffer's size is stored in buffer_size.
5125          */
5126         buffer_size = 2;                        /* for the square brackets */
5127         mutex_enter(&zone->zone_lock);
5128         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5129         ref = zone->zone_ref;
5130         cred_ref = zone->zone_cred_ref;
5131         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5132                 if (zone->zone_subsys_ref[index] != 0)
5133                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5134                             13;
5135         if (buffer_size == 2) {
5136                 /*
5137                  * No subsystems had nonzero reference counts.  Don't bother
5138                  * with allocating a buffer; just log the general-purpose and
5139                  * credential reference counts.
5140                  */
5141                 mutex_exit(&zone->zone_lock);
5142                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5143                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5144                     "references and %u credential references are still extant",
5145                     zone->zone_name, zone->zone_id, ref, cred_ref);
5146                 return;
5147         }
5148 
5149         /*
5150          * buffer_size contains the exact number of characters that the
5151          * buffer will need.  Allocate the buffer and fill it with nonzero
5152          * subsystem-specific reference counts.  Surround the results with
5153          * square brackets afterwards.
5154          */
5155         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5156         buffer_position = &buffer[1];
5157         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5158                 /*
5159                  * NOTE: The DDI's version of sprintf() returns a pointer to
5160                  * the modified buffer rather than the number of bytes written
5161                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5162                  * Therefore, we'll use snprintf() with INT_MAX to get the
5163                  * number of bytes written.  Using INT_MAX is safe because
5164                  * the buffer is perfectly sized for the data: we'll never
5165                  * overrun the buffer.
5166                  */
5167                 if (zone->zone_subsys_ref[index] != 0)
5168                         buffer_position += snprintf(buffer_position, INT_MAX,
5169                             "%s: %u,", zone_ref_subsys_names[index],
5170                             zone->zone_subsys_ref[index]);
5171         }
5172         mutex_exit(&zone->zone_lock);
5173         buffer[0] = '[';
5174         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5175         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5176         buffer_position[-1] = ']';
5177 
5178         /*
5179          * Log the reference counts and free the message buffer.
5180          */
5181         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5182             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5183             "%u credential references are still extant %s", zone->zone_name,
5184             zone->zone_id, ref, cred_ref, buffer);
5185         kmem_free(buffer, buffer_size);
5186 }
5187 
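/*
 * For illustration, the resulting message looks roughly like this (the
 * zone name, counts, and subsystem names below are hypothetical):
 *
 *	Zone 'myzone' (ID: 3) is shutting down, but 4 zone references
 *	and 1 credential references are still extant [NFS: 2, IPC: 1]
 *
 * Only subsystems with nonzero counts appear, comma-separated within
 * the brackets, in ascending zone_subsys_ref[] index order.
 */
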
5188 /*
5189  * System call entry point to finalize the zone halt process.  The caller
5190  * must have already successfully called zone_shutdown().
5191  *
5192  * Upon successful completion, the zone will have been fully destroyed:
5193  * zsched will have exited, destructor callbacks executed, and the zone
5194  * removed from the list of active zones.
5195  */
5196 static int
5197 zone_destroy(zoneid_t zoneid)
5198 {
5199         uint64_t uniqid;
5200         zone_t *zone;
5201         zone_status_t status;
5202         clock_t wait_time;
5203         boolean_t log_refcounts;
5204 
5205         if (secpolicy_zone_config(CRED()) != 0)
5206                 return (set_errno(EPERM));
5207         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5208                 return (set_errno(EINVAL));
5209 
5210         mutex_enter(&zonehash_lock);
5211         /*
5212          * Look for zone under hash lock to prevent races with other
5213          * calls to zone_destroy.
5214          */
5215         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5216                 mutex_exit(&zonehash_lock);
5217                 return (set_errno(EINVAL));
5218         }
5219 
5220         if (zone_mount_count(zone->zone_rootpath) != 0) {
5221                 mutex_exit(&zonehash_lock);
5222                 return (set_errno(EBUSY));
5223         }
5224         mutex_enter(&zone_status_lock);
5225         status = zone_status_get(zone);
5226         if (status < ZONE_IS_DOWN) {
5227                 mutex_exit(&zone_status_lock);
5228                 mutex_exit(&zonehash_lock);
5229                 return (set_errno(EBUSY));
5230         } else if (status == ZONE_IS_DOWN) {
5231                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5232         }
5233         mutex_exit(&zone_status_lock);
5234         zone_hold(zone);
5235         mutex_exit(&zonehash_lock);
5236 
5237         /*
5238          * wait for zsched to exit
5239          */
5240         zone_status_wait(zone, ZONE_IS_DEAD);
5241         zone_zsd_callbacks(zone, ZSD_DESTROY);
5242         zone->zone_netstack = NULL;
5243         uniqid = zone->zone_uniqid;
5244         zone_rele(zone);
5245         zone = NULL;    /* potentially free'd */
5246 
5247         log_refcounts = B_FALSE;
5248         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5249         mutex_enter(&zonehash_lock);
5250         for (; /* ever */; ) {
5251                 boolean_t unref;
5252                 boolean_t refs_have_been_logged;
5253 
5254                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5255                     zone->zone_uniqid != uniqid) {
5256                         /*
5257                          * The zone has gone away.  Necessary conditions
5258                          * are met, so we return success.
5259                          */
5260                         mutex_exit(&zonehash_lock);
5261                         return (0);
5262                 }
5263                 mutex_enter(&zone->zone_lock);
5264                 unref = ZONE_IS_UNREF(zone);
5265                 refs_have_been_logged = (zone->zone_flags &
5266                     ZF_REFCOUNTS_LOGGED);
5267                 mutex_exit(&zone->zone_lock);
5268                 if (unref) {
5269                         /*
5270                          * There is only one reference to the zone -- that
5271                          * added when the zone was added to the hashtables --
5272                          * and things will remain this way until we drop
5273                          * zonehash_lock... we can go ahead and cleanup the
5274                          * zone.
5275                          */
5276                         break;
5277                 }
5278 
5279                 /*
5280                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5281                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5282                  * some zone's general-purpose reference count reaches one.
5283                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5284                  * on zone_destroy_cv, then log the zone's reference counts and
5285                  * continue to wait for zone_rele() and zone_cred_rele().
5286                  */
5287                 if (!refs_have_been_logged) {
5288                         if (!log_refcounts) {
5289                                 /*
5290                                  * This thread hasn't timed out waiting on
5291                                  * zone_destroy_cv yet.  Wait wait_time clock
5292                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5293                                  * seconds) for the zone's references to clear.
5294                                  */
5295                                 ASSERT(wait_time > 0);
5296                                 wait_time = cv_reltimedwait_sig(
5297                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5298                                     TR_SEC);
5299                                 if (wait_time > 0) {
5300                                         /*
5301                                          * A thread in zone_rele() or
5302                                          * zone_cred_rele() signaled
5303                                          * zone_destroy_cv before this thread's
5304                                          * wait timed out.  The zone might have
5305                                          * only one reference left; find out!
5306                                          */
5307                                         continue;
5308                                 } else if (wait_time == 0) {
5309                                         /* The thread's process was signaled. */
5310                                         mutex_exit(&zonehash_lock);
5311                                         return (set_errno(EINTR));
5312                                 }
5313 
5314                                 /*
5315                                  * The thread timed out while waiting on
5316                                  * zone_destroy_cv.  Even though the thread
5317                                  * timed out, it has to check whether another
5318                                  * thread woke up from zone_destroy_cv and
5319                                  * destroyed the zone.
5320                                  *
5321                                  * If the zone still exists and has more than
5322                                  * one unreleased general-purpose reference,
5323                                  * then log the zone's reference counts.
5324                                  */
5325                                 log_refcounts = B_TRUE;
5326                                 continue;
5327                         }
5328 
5329                         /*
5330                          * The thread already timed out on zone_destroy_cv while
5331                          * waiting for subsystems to release the zone's last
5332                          * general-purpose references.  Log the zone's reference
5333                          * counts and wait indefinitely on zone_destroy_cv.
5334                          */
5335                         zone_log_refcounts(zone);
5336                 }
5337                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5338                         /* The thread's process was signaled. */
5339                         mutex_exit(&zonehash_lock);
5340                         return (set_errno(EINTR));
5341                 }
5342         }
5343 
5344         /*
5345          * Remove CPU cap for this zone now since we're not going to
5346          * fail below this point.
5347          */
5348         cpucaps_zone_remove(zone);
5349 
5350         /* Get rid of the zone's kstats */
5351         zone_kstat_delete(zone);
5352 
5353         /* remove the pfexecd doors */
5354         if (zone->zone_pfexecd != NULL) {
5355                 klpd_freelist(&zone->zone_pfexecd);
5356                 zone->zone_pfexecd = NULL;
5357         }
5358 
5359         /* free brand specific data */
5360         if (ZONE_IS_BRANDED(zone))
5361                 ZBROP(zone)->b_free_brand_data(zone);
5362 
5363         /* Say goodbye to brand framework. */
5364         brand_unregister_zone(zone->zone_brand);
5365 
5366         /*
5367          * It is now safe to let the zone be recreated; remove it from the
5368          * lists.  The memory will not be freed until the last cred
5369          * reference goes away.
5370          */
5371         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5372         zonecount--;
5373         /* remove from active list and hash tables */
5374         list_remove(&zone_active, zone);
5375         (void) mod_hash_destroy(zonehashbyname,
5376             (mod_hash_key_t)zone->zone_name);
5377         (void) mod_hash_destroy(zonehashbyid,
5378             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5379         if (zone->zone_flags & ZF_HASHED_LABEL)
5380                 (void) mod_hash_destroy(zonehashbylabel,
5381                     (mod_hash_key_t)zone->zone_slabel);
5382         mutex_exit(&zonehash_lock);
5383 
5384         /*
5385          * Release the root vnode; we're not using it anymore.  Nor should any
5386          * other thread that might access it exist.
5387          */
5388         if (zone->zone_rootvp != NULL) {
5389                 vnode_t *vp = zone->zone_rootvp;
5390 
5391                 mutex_enter(&vp->v_lock);
5392                 vp->v_flag &= ~VZONEROOT;
5393                 mutex_exit(&vp->v_lock);
5394                 VN_RELE(vp);
5395                 zone->zone_rootvp = NULL;
5396         }
5397 
5398         /* add to deathrow list */
5399         mutex_enter(&zone_deathrow_lock);
5400         list_insert_tail(&zone_deathrow, zone);
5401         mutex_exit(&zone_deathrow_lock);
5402 
5403         /*
5404          * Drop last reference (which was added by zsched()), this will
5405          * free the zone unless there are outstanding cred references.
5406          */
5407         zone_rele(zone);
5408         return (0);
5409 }
5410 
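/*
 * Illustrative sketch (editorial, not compiled): the complete teardown
 * as a hypothetical caller (in practice zoneadmd) would drive it,
 * retrying the interruptible steps:
 *
 *	while (zone_shutdown(zid) != 0)
 *		if (errno != EINTR)
 *			return (-1);
 *	while (zone_destroy(zid) != 0)
 *		if (errno != EINTR)
 *			return (-1);	(e.g. EBUSY: mounts remain)
 *
 * A zero return from zone_destroy() only guarantees the zone is gone
 * from the hash tables and active list; the zone_t itself may linger
 * on deathrow until the last cred reference drains.
 */
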
5411 /*
5412  * System call entry point for zone_getattr(2).
5413  */
5414 static ssize_t
5415 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5416 {
5417         size_t size;
5418         int error = 0, err;
5419         zone_t *zone;
5420         char *zonepath;
5421         char *outstr;
5422         zone_status_t zone_status;
5423         pid_t initpid;
5424         boolean_t global = (curzone == global_zone);
5425         boolean_t inzone = (curzone->zone_id == zoneid);
5426         ushort_t flags;
5427         zone_net_data_t *zbuf;
5428 
5429         mutex_enter(&zonehash_lock);
5430         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5431                 mutex_exit(&zonehash_lock);
5432                 return (set_errno(EINVAL));
5433         }
5434         zone_status = zone_status_get(zone);
5435         if (zone_status < ZONE_IS_INITIALIZED) {
5436                 mutex_exit(&zonehash_lock);
5437                 return (set_errno(EINVAL));
5438         }
5439         zone_hold(zone);
5440         mutex_exit(&zonehash_lock);
5441 
5442         /*
5443          * If not in the global zone, don't show information about other zones,
5444          * unless the system is labeled and the local zone's label dominates
5445          * the other zone.
5446          */
5447         if (!zone_list_access(zone)) {
5448                 zone_rele(zone);
5449                 return (set_errno(EINVAL));
5450         }
5451 
5452         switch (attr) {
5453         case ZONE_ATTR_ROOT:
5454                 if (global) {
5455                         /*
5456                          * Copy the path to trim the trailing "/" (except for
5457                          * the global zone).
5458                          */
5459                         if (zone != global_zone)
5460                                 size = zone->zone_rootpathlen - 1;
5461                         else
5462                                 size = zone->zone_rootpathlen;
5463                         zonepath = kmem_alloc(size, KM_SLEEP);
5464                         bcopy(zone->zone_rootpath, zonepath, size);
5465                         zonepath[size - 1] = '\0';
5466                 } else {
5467                         if (inzone || !is_system_labeled()) {
5468                                 /*
5469                                  * Caller is not in the global zone.
5470                                  * if the query is on the current zone
5471                                  * or the system is not labeled,
5472                                  * just return faked-up path for current zone.
5473                                  */
5474                                 zonepath = "/";
5475                                 size = 2;
5476                         } else {
5477                                 /*
5478                                  * Return related path for current zone.
5479                                  */
5480                                 int prefix_len = strlen(zone_prefix);
5481                                 int zname_len = strlen(zone->zone_name);
5482 
5483                                 size = prefix_len + zname_len + 1;
5484                                 zonepath = kmem_alloc(size, KM_SLEEP);
5485                                 bcopy(zone_prefix, zonepath, prefix_len);
5486                                 bcopy(zone->zone_name, zonepath +
5487                                     prefix_len, zname_len);
5488                                 zonepath[size - 1] = '\0';
5489                         }
5490                 }
5491                 if (bufsize > size)
5492                         bufsize = size;
5493                 if (buf != NULL) {
5494                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5495                         if (err != 0 && err != ENAMETOOLONG)
5496                                 error = EFAULT;
5497                 }
5498                 if (global || (is_system_labeled() && !inzone))
5499                         kmem_free(zonepath, size);
5500                 break;
5501 
5502         case ZONE_ATTR_NAME:
5503                 size = strlen(zone->zone_name) + 1;
5504                 if (bufsize > size)
5505                         bufsize = size;
5506                 if (buf != NULL) {
5507                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5508                         if (err != 0 && err != ENAMETOOLONG)
5509                                 error = EFAULT;
5510                 }
5511                 break;
5512 
5513         case ZONE_ATTR_STATUS:
5514                 /*
5515                  * Since we're not holding zonehash_lock, the zone status
5516                  * may be anything; leave it up to userland to sort it out.
5517                  */
5518                 size = sizeof (zone_status);
5519                 if (bufsize > size)
5520                         bufsize = size;
5521                 zone_status = zone_status_get(zone);
5522                 if (buf != NULL &&
5523                     copyout(&zone_status, buf, bufsize) != 0)
5524                         error = EFAULT;
5525                 break;
5526         case ZONE_ATTR_FLAGS:
5527                 size = sizeof (zone->zone_flags);
5528                 if (bufsize > size)
5529                         bufsize = size;
5530                 flags = zone->zone_flags;
5531                 if (buf != NULL &&
5532                     copyout(&flags, buf, bufsize) != 0)
5533                         error = EFAULT;
5534                 break;
5535         case ZONE_ATTR_PRIVSET:
5536                 size = sizeof (priv_set_t);
5537                 if (bufsize > size)
5538                         bufsize = size;
5539                 if (buf != NULL &&
5540                     copyout(zone->zone_privset, buf, bufsize) != 0)
5541                         error = EFAULT;
5542                 break;
5543         case ZONE_ATTR_UNIQID:
5544                 size = sizeof (zone->zone_uniqid);
5545                 if (bufsize > size)
5546                         bufsize = size;
5547                 if (buf != NULL &&
5548                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5549                         error = EFAULT;
5550                 break;
5551         case ZONE_ATTR_POOLID:
5552                 {
5553                         pool_t *pool;
5554                         poolid_t poolid;
5555 
5556                         if (pool_lock_intr() != 0) {
5557                                 error = EINTR;
5558                                 break;
5559                         }
5560                         pool = zone_pool_get(zone);
5561                         poolid = pool->pool_id;
5562                         pool_unlock();
5563                         size = sizeof (poolid);
5564                         if (bufsize > size)
5565                                 bufsize = size;
5566                         if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
5567                                 error = EFAULT;
5568                 }
5569                 break;
5570         case ZONE_ATTR_SLBL:
5571                 size = sizeof (bslabel_t);
5572                 if (bufsize > size)
5573                         bufsize = size;
5574                 if (zone->zone_slabel == NULL)
5575                         error = EINVAL;
5576                 else if (buf != NULL &&
5577                     copyout(label2bslabel(zone->zone_slabel), buf,
5578                     bufsize) != 0)
5579                         error = EFAULT;
5580                 break;
5581         case ZONE_ATTR_INITPID:
5582                 size = sizeof (initpid);
5583                 if (bufsize > size)
5584                         bufsize = size;
5585                 initpid = zone->zone_proc_initpid;
5586                 if (initpid == -1) {
5587                         error = ESRCH;
5588                         break;
5589                 }
5590                 if (buf != NULL &&
5591                     copyout(&initpid, buf, bufsize) != 0)
5592                         error = EFAULT;
5593                 break;
5594         case ZONE_ATTR_BRAND:
5595                 size = strlen(zone->zone_brand->b_name) + 1;
5596 
5597                 if (bufsize > size)
5598                         bufsize = size;
5599                 if (buf != NULL) {
5600                         err = copyoutstr(zone->zone_brand->b_name, buf,
5601                             bufsize, NULL);
5602                         if (err != 0 && err != ENAMETOOLONG)
5603                                 error = EFAULT;
5604                 }
5605                 break;
5606         case ZONE_ATTR_INITNAME:
5607                 size = strlen(zone->zone_initname) + 1;
5608                 if (bufsize > size)
5609                         bufsize = size;
5610                 if (buf != NULL) {
5611                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5612                             NULL);
5613                         if (err != 0 && err != ENAMETOOLONG)
5614                                 error = EFAULT;
5615                 }
5616                 break;
5617         case ZONE_ATTR_BOOTARGS:
5618                 if (zone->zone_bootargs == NULL)
5619                         outstr = "";
5620                 else
5621                         outstr = zone->zone_bootargs;
5622                 size = strlen(outstr) + 1;
5623                 if (bufsize > size)
5624                         bufsize = size;
5625                 if (buf != NULL) {
5626                         err = copyoutstr(outstr, buf, bufsize, NULL);
5627                         if (err != 0 && err != ENAMETOOLONG)
5628                                 error = EFAULT;
5629                 }
5630                 break;
5631         case ZONE_ATTR_PHYS_MCAP:
5632                 size = sizeof (zone->zone_phys_mcap);
5633                 if (bufsize > size)
5634                         bufsize = size;
5635                 if (buf != NULL &&
5636                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5637                         error = EFAULT;
5638                 break;
5639         case ZONE_ATTR_SCHED_CLASS:
5640                 mutex_enter(&class_lock);
5641 
5642                 if (zone->zone_defaultcid >= loaded_classes)
5643                         outstr = "";
5644                 else
5645                         outstr = sclass[zone->zone_defaultcid].cl_name;
5646                 size = strlen(outstr) + 1;
5647                 if (bufsize > size)
5648                         bufsize = size;
5649                 if (buf != NULL) {
5650                         err = copyoutstr(outstr, buf, bufsize, NULL);
5651                         if (err != 0 && err != ENAMETOOLONG)
5652                                 error = EFAULT;
5653                 }
5654 
5655                 mutex_exit(&class_lock);
5656                 break;
5657         case ZONE_ATTR_HOSTID:
5658                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5659                     bufsize == sizeof (zone->zone_hostid)) {
5660                         size = sizeof (zone->zone_hostid);
5661                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5662                             bufsize) != 0)
5663                                 error = EFAULT;
5664                 } else {
5665                         error = EINVAL;
5666                 }
5667                 break;
5668         case ZONE_ATTR_FS_ALLOWED:
5669                 if (zone->zone_fs_allowed == NULL)
5670                         outstr = "";
5671                 else
5672                         outstr = zone->zone_fs_allowed;
5673                 size = strlen(outstr) + 1;
5674                 if (bufsize > size)
5675                         bufsize = size;
5676                 if (buf != NULL) {
5677                         err = copyoutstr(outstr, buf, bufsize, NULL);
5678                         if (err != 0 && err != ENAMETOOLONG)
5679                                 error = EFAULT;
5680                 }
5681                 break;
5682         case ZONE_ATTR_SECFLAGS:
5683                 size = sizeof (zone->zone_secflags);
5684                 if (bufsize > size)
5685                         bufsize = size;
5686                 if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5687                         error = EFAULT;
5688                 break;
5689         case ZONE_ATTR_NETWORK:
5690                 bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5691                 size = bufsize;
5692                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5693                 if (copyin(buf, zbuf, bufsize) != 0) {
5694                         error = EFAULT;
5695                 } else {
5696                         error = zone_get_network(zoneid, zbuf);
5697                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5698                                 error = EFAULT;
5699                 }
5700                 kmem_free(zbuf, bufsize);
5701                 break;
5702         default:
5703                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5704                         size = bufsize;
5705                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5706                 } else {
5707                         error = EINVAL;
5708                 }
5709         }
5710         zone_rele(zone);
5711 
5712         if (error)
5713                 return (set_errno(error));
5714         return ((ssize_t)size);
5715 }
5716 
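/*
 * Illustrative sketch (editorial, not compiled): the matching userland
 * call, fetching a zone's name through zone_getattr(2); error handling
 * elided:
 *
 *	char name[ZONENAME_MAX];
 *	ssize_t n;
 *
 *	n = zone_getattr(zid, ZONE_ATTR_NAME, name, sizeof (name));
 *
 * On success n is the full attribute size (which can exceed the bytes
 * actually copied out if the buffer was small); on failure it is -1
 * with errno set, e.g. EINVAL when zone_list_access() denies the
 * lookup.
 */
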
5717 /*
5718  * System call entry point for zone_setattr(2).
5719  */
5720 /*ARGSUSED*/
5721 static int
5722 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5723 {
5724         zone_t *zone;
5725         zone_status_t zone_status;
5726         int err = -1;
5727         zone_net_data_t *zbuf;
5728 
5729         if (secpolicy_zone_config(CRED()) != 0)
5730                 return (set_errno(EPERM));
5731 
5732         /*
5733          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5734          * global zone.
5735          */
5736         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5737                 return (set_errno(EINVAL));
5738         }
5739 
5740         mutex_enter(&zonehash_lock);
5741         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5742                 mutex_exit(&zonehash_lock);
5743                 return (set_errno(EINVAL));
5744         }
5745         zone_hold(zone);
5746         mutex_exit(&zonehash_lock);
5747 
5748         /*
5749          * At present most attributes can only be set on non-running,
5750          * non-global zones.
5751          */
5752         zone_status = zone_status_get(zone);
5753         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5754                 err = EINVAL;
5755                 goto done;
5756         }
5757 
5758         switch (attr) {
5759         case ZONE_ATTR_INITNAME:
5760                 err = zone_set_initname(zone, (const char *)buf);
5761                 break;
5762         case ZONE_ATTR_INITNORESTART:
5763                 zone->zone_restart_init = B_FALSE;
5764                 err = 0;
5765                 break;
5766         case ZONE_ATTR_BOOTARGS:
5767                 err = zone_set_bootargs(zone, (const char *)buf);
5768                 break;
5769         case ZONE_ATTR_BRAND:
5770                 err = zone_set_brand(zone, (const char *)buf);
5771                 break;
5772         case ZONE_ATTR_FS_ALLOWED:
5773                 err = zone_set_fs_allowed(zone, (const char *)buf);
5774                 break;
5775         case ZONE_ATTR_SECFLAGS:
5776                 err = zone_set_secflags(zone, (psecflags_t *)buf);
5777                 break;
5778         case ZONE_ATTR_PHYS_MCAP:
5779                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5780                 break;
5781         case ZONE_ATTR_SCHED_CLASS:
5782                 err = zone_set_sched_class(zone, (const char *)buf);
5783                 break;
5784         case ZONE_ATTR_HOSTID:
5785                 if (bufsize == sizeof (zone->zone_hostid)) {
5786                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5787                                 err = 0;
5788                         else
5789                                 err = EFAULT;
5790                 } else {
5791                         err = EINVAL;
5792                 }
5793                 break;
5794         case ZONE_ATTR_NETWORK:
5795                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5796                         err = EINVAL;
5797                         break;
5798                 }
5799                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5800                 if (copyin(buf, zbuf, bufsize) != 0) {
5801                         kmem_free(zbuf, bufsize);
5802                         err = EFAULT;
5803                         break;
5804                 }
5805                 err = zone_set_network(zoneid, zbuf);
5806                 kmem_free(zbuf, bufsize);
5807                 break;
5808         default:
5809                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5810                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5811                 else
5812                         err = EINVAL;
5813         }
5814 
5815 done:
5816         zone_rele(zone);
5817         ASSERT(err != -1);
5818         return (err != 0 ? set_errno(err) : 0);
5819 }
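
     /*
      * An illustrative userland sketch of the setattr protocol; this
      * assumes an installed zone that has not yet booted and a caller
      * with the required privilege:
      *
      *         uint32_t hostid = 0x00beef00;
      *         if (zone_setattr(zoneid, ZONE_ATTR_HOSTID, &hostid,
      *             sizeof (hostid)) != 0)
      *                 perror("zone_setattr");
      *
      * Per the checks above, EINVAL results if bufsize is not exactly
      * sizeof (zone->zone_hostid) or if the zone is already past the
      * ZONE_IS_READY state.
      */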
5820 
5821 /*
5822  * Return zero if the process has at least one vnode mapped into its
5823  * address space which shouldn't be allowed to change zones.
5824  *
5825  * Also return zero if the process has any shared mappings which reserve
5826  * swap.  This is because the counting for zone.max-swap does not allow swap
5827  * reservation to be shared between zones.  Zone swap reservation is counted
5828  * against zone->zone_max_swap.
5829  */
5830 static int
5831 as_can_change_zones(void)
5832 {
5833         proc_t *pp = curproc;
5834         struct seg *seg;
5835         struct as *as = pp->p_as;
5836         vnode_t *vp;
5837         int allow = 1;
5838 
5839         ASSERT(pp->p_as != &kas);
5840         AS_LOCK_ENTER(as, RW_READER);
5841         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5842 
5843                 /*
5844                  * Cannot enter zone with shared anon memory which
5845                  * reserves swap.  See comment above.
5846                  */
5847                 if (seg_can_change_zones(seg) == B_FALSE) {
5848                         allow = 0;
5849                         break;
5850                 }
5851                 /*
5852                  * if we can't get a backing vnode for this segment then skip
5853                  * it.
5854                  */
5855                 vp = NULL;
5856                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5857                         continue;
5858                 if (!vn_can_change_zones(vp)) { /* bail on first match */
5859                         allow = 0;
5860                         break;
5861                 }
5862         }
5863         AS_LOCK_EXIT(as);
5864         return (allow);
5865 }
5866 
5867 /*
5868  * Count swap reserved by curproc's address space
5869  */
5870 static size_t
5871 as_swresv(void)
5872 {
5873         proc_t *pp = curproc;
5874         struct seg *seg;
5875         struct as *as = pp->p_as;
5876         size_t swap = 0;
5877 
5878         ASSERT(pp->p_as != &kas);
5879         ASSERT(AS_WRITE_HELD(as));
5880         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5881                 swap += seg_swresv(seg);
5882 
5883         return (swap);
5884 }
5885 
5886 /*
5887  * Systemcall entry point for zone_enter().
5888  *
5889  * The current process is injected into said zone.  In the process
5890  * it will change its project membership, privileges, rootdir/cwd,
5891  * zone-wide rctls, and pool association to match those of the zone.
5892  *
5893  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5894  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5895  * enter a zone that is "ready" or "running".
5896  */
5897 static int
5898 zone_enter(zoneid_t zoneid)
5899 {
5900         zone_t *zone;
5901         vnode_t *vp;
5902         proc_t *pp = curproc;
5903         contract_t *ct;
5904         cont_process_t *ctp;
5905         task_t *tk, *oldtk;
5906         kproject_t *zone_proj0;
5907         cred_t *cr, *newcr;
5908         pool_t *oldpool, *newpool;
5909         sess_t *sp;
5910         uid_t uid;
5911         zone_status_t status;
5912         int err = 0;
5913         rctl_entity_p_t e;
5914         size_t swap;
5915         kthread_id_t t;
5916 
5917         if (secpolicy_zone_config(CRED()) != 0)
5918                 return (set_errno(EPERM));
5919         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5920                 return (set_errno(EINVAL));
5921 
5922         /*
5923          * Stop all lwps so we don't need to hold a lock to look at
5924          * curproc->p_zone.  This needs to happen before we grab any
5925          * locks to avoid deadlock (another lwp in the process could
5926          * be waiting for the held lock).
5927          */
5928         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5929                 return (set_errno(EINTR));
5930 
5931         /*
5932          * Make sure we're not changing zones with files open or mapped
5933          * into our address space which shouldn't be allowed to change zones.
5934          */
5935         if (!files_can_change_zones()) {
5936                 err = EBADF;
5937                 goto out;
5938         }
5939         if (!as_can_change_zones()) {
5940                 err = EFAULT;
5941                 goto out;
5942         }
5943 
5944         mutex_enter(&zonehash_lock);
5945         if (pp->p_zone != global_zone) {
5946                 mutex_exit(&zonehash_lock);
5947                 err = EINVAL;
5948                 goto out;
5949         }
5950 
5951         zone = zone_find_all_by_id(zoneid);
5952         if (zone == NULL) {
5953                 mutex_exit(&zonehash_lock);
5954                 err = EINVAL;
5955                 goto out;
5956         }
5957 
5958         /*
5959          * To prevent processes in a zone from holding contracts on
5960          * extrazonal resources, and to avoid process contract
5961          * memberships which span zones, contract holders and processes
5962          * which aren't the sole members of their encapsulating process
5963          * contracts are not allowed to zone_enter.
5964          */
5965         ctp = pp->p_ct_process;
5966         ct = &ctp->conp_contract;
5967         mutex_enter(&ct->ct_lock);
5968         mutex_enter(&pp->p_lock);
5969         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5970                 mutex_exit(&pp->p_lock);
5971                 mutex_exit(&ct->ct_lock);
5972                 mutex_exit(&zonehash_lock);
5973                 err = EINVAL;
5974                 goto out;
5975         }
5976 
5977         /*
5978          * Moreover, we don't allow processes whose encapsulating
5979          * process contracts have inherited extrazonal contracts.
5980          * While it would be easier to eliminate all process contracts
5981          * with inherited contracts, we need to be able to give a
5982          * restarted init (or other zone-penetrating process) its
5983          * predecessor's contracts.
5984          */
5985         if (ctp->conp_ninherited != 0) {
5986                 contract_t *next;
5987                 for (next = list_head(&ctp->conp_inherited); next;
5988                     next = list_next(&ctp->conp_inherited, next)) {
5989                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
5990                                 mutex_exit(&pp->p_lock);
5991                                 mutex_exit(&ct->ct_lock);
5992                                 mutex_exit(&zonehash_lock);
5993                                 err = EINVAL;
5994                                 goto out;
5995                         }
5996                 }
5997         }
5998 
5999         mutex_exit(&pp->p_lock);
6000         mutex_exit(&ct->ct_lock);
6001 
6002         status = zone_status_get(zone);
6003         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6004                 /*
6005                  * Can't join
6006                  */
6007                 mutex_exit(&zonehash_lock);
6008                 err = EINVAL;
6009                 goto out;
6010         }
6011 
6012         /*
6013          * Make sure new priv set is within the permitted set for caller
6014          */
6015         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6016                 mutex_exit(&zonehash_lock);
6017                 err = EPERM;
6018                 goto out;
6019         }
6020         /*
6021          * We want to momentarily drop zonehash_lock while we optimistically
6022          * bind curproc to the pool it should be running in.  This is safe
6023          * since the zone can't disappear (we have a hold on it).
6024          */
6025         zone_hold(zone);
6026         mutex_exit(&zonehash_lock);
6027 
6028         /*
6029          * Grab pool_lock to keep the pools configuration from changing
6030          * and to stop ourselves from getting rebound to another pool
6031          * until we join the zone.
6032          */
6033         if (pool_lock_intr() != 0) {
6034                 zone_rele(zone);
6035                 err = EINTR;
6036                 goto out;
6037         }
6038         ASSERT(secpolicy_pool(CRED()) == 0);
6039         /*
6040          * Bind ourselves to the pool currently associated with the zone.
6041          */
6042         oldpool = curproc->p_pool;
6043         newpool = zone_pool_get(zone);
6044         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6045             (err = pool_do_bind(newpool, P_PID, P_MYID,
6046             POOL_BIND_ALL)) != 0) {
6047                 pool_unlock();
6048                 zone_rele(zone);
6049                 goto out;
6050         }
6051 
6052         /*
6053          * Grab cpu_lock now; we'll need it later when we call
6054          * task_join().
6055          */
6056         mutex_enter(&cpu_lock);
6057         mutex_enter(&zonehash_lock);
6058         /*
6059          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6060          */
6061         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6062                 /*
6063                  * Can't join anymore.
6064                  */
6065                 mutex_exit(&zonehash_lock);
6066                 mutex_exit(&cpu_lock);
6067                 if (pool_state == POOL_ENABLED &&
6068                     newpool != oldpool)
6069                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6070                             POOL_BIND_ALL);
6071                 pool_unlock();
6072                 zone_rele(zone);
6073                 err = EINVAL;
6074                 goto out;
6075         }
6076 
6077         /*
6078          * a_lock must be held while transferring locked memory and swap
6079          * reservation from the global zone to the non-global zone because
6080          * asynchronous faults on the processes' address space can lock
6081          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6082          * segments respectively.
6083          */
6084         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6085         swap = as_swresv();
6086         mutex_enter(&pp->p_lock);
6087         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6088         /* verify that we do not exceed any task or lwp limits */
6089         mutex_enter(&zone->zone_nlwps_lock);
6090         /* add new lwps to zone and zone's proj0 */
6091         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6092         zone->zone_nlwps += pp->p_lwpcnt;
6093         /* add 1 task to zone's proj0 */
6094         zone_proj0->kpj_ntasks += 1;
6095 
6096         zone_proj0->kpj_nprocs++;
6097         zone->zone_nprocs++;
6098         mutex_exit(&zone->zone_nlwps_lock);
6099 
6100         mutex_enter(&zone->zone_mem_lock);
6101         zone->zone_locked_mem += pp->p_locked_mem;
6102         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6103         zone->zone_max_swap += swap;
6104         mutex_exit(&zone->zone_mem_lock);
6105 
6106         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6107         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6108         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6109 
6110         /* remove lwps and process from proc's old zone and old project */
6111         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6112         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6113         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6114         pp->p_task->tk_proj->kpj_nprocs--;
6115         pp->p_zone->zone_nprocs--;
6116         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6117 
6118         mutex_enter(&pp->p_zone->zone_mem_lock);
6119         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6120         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6121         pp->p_zone->zone_max_swap -= swap;
6122         mutex_exit(&pp->p_zone->zone_mem_lock);
6123 
6124         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6125         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6126         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6127 
6128         pp->p_flag |= SZONETOP;
6129         pp->p_zone = zone;
6130         mutex_exit(&pp->p_lock);
6131         AS_LOCK_EXIT(pp->p_as);
6132 
6133         /*
6134          * Joining the zone cannot fail from now on.
6135          *
6136          * This means that a lot of the following code can be commonized and
6137          * shared with zsched().
6138          */
6139 
6140         /*
6141          * If the process contract fmri was inherited, we need to
6142          * flag this so that any contract status will not leak
6143          * extra zone information (svc_fmri in this case).
6144          */
6145         if (ctp->conp_svc_ctid != ct->ct_id) {
6146                 mutex_enter(&ct->ct_lock);
6147                 ctp->conp_svc_zone_enter = ct->ct_id;
6148                 mutex_exit(&ct->ct_lock);
6149         }
6150 
6151         /*
6152          * Reset the encapsulating process contract's zone.
6153          */
6154         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6155         contract_setzuniqid(ct, zone->zone_uniqid);
6156 
6157         /*
6158          * Create a new task and associate the process with the project keyed
6159          * by (projid,zoneid).
6160          *
6161          * We might as well be in project 0; the global zone's projid doesn't
6162          * make much sense in a zone anyhow.
6163          *
6164          * This also increments zone_ntasks, and returns with p_lock held.
6165          */
6166         tk = task_create(0, zone);
6167         oldtk = task_join(tk, 0);
6168         mutex_exit(&cpu_lock);
6169 
6170         /*
6171          * call RCTLOP_SET functions on this proc
6172          */
6173         e.rcep_p.zone = zone;
6174         e.rcep_t = RCENTITY_ZONE;
6175         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6176             RCD_CALLBACK);
6177         mutex_exit(&pp->p_lock);
6178 
6179         /*
6180          * We don't need to hold any of zsched's locks here; not only do we know
6181          * the process and zone aren't going away, we know its session isn't
6182          * changing either.
6183          *
6184          * By joining zsched's session here, we mimic the behavior in the
6185          * global zone of init's sid being the pid of sched.  We extend this
6186          * to all zlogin-like zone_enter()'ing processes as well.
6187          */
6188         mutex_enter(&pidlock);
6189         sp = zone->zone_zsched->p_sessp;
6190         sess_hold(zone->zone_zsched);
6191         mutex_enter(&pp->p_lock);
6192         pgexit(pp);
6193         sess_rele(pp->p_sessp, B_TRUE);
6194         pp->p_sessp = sp;
6195         pgjoin(pp, zone->zone_zsched->p_pidp);
6196 
6197         /*
6198          * If any threads are scheduled to be placed on the zone's wait
6199          * queue, they should abandon the idea since the wait queue is changing.
6200          * We need to be holding pidlock & p_lock to do this.
6201          */
6202         if ((t = pp->p_tlist) != NULL) {
6203                 do {
6204                         thread_lock(t);
6205                         /*
6206                          * Kick this thread so that it doesn't sit
6207                          * on the wrong wait queue.
6208                          */
6209                         if (ISWAITING(t))
6210                                 setrun_locked(t);
6211 
6212                         if (t->t_schedflag & TS_ANYWAITQ)
6213                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6214 
6215                         thread_unlock(t);
6216                 } while ((t = t->t_forw) != pp->p_tlist);
6217         }
6218 
6219         /*
6220          * If there is a default scheduling class for the zone and it is not
6221          * the class we are currently in, change all of the threads in the
6222          * process to the new class.  We need to be holding pidlock & p_lock
6223          * when we call parmsset so this is a good place to do it.
6224          */
6225         if (zone->zone_defaultcid > 0 &&
6226             zone->zone_defaultcid != curthread->t_cid) {
6227                 pcparms_t pcparms;
6228 
6229                 pcparms.pc_cid = zone->zone_defaultcid;
6230                 pcparms.pc_clparms[0] = 0;
6231 
6232                 /*
6233                  * If setting the class fails, we still want to enter the zone.
6234                  */
6235                 if ((t = pp->p_tlist) != NULL) {
6236                         do {
6237                                 (void) parmsset(&pcparms, t);
6238                         } while ((t = t->t_forw) != pp->p_tlist);
6239                 }
6240         }
6241 
6242         mutex_exit(&pp->p_lock);
6243         mutex_exit(&pidlock);
6244 
6245         mutex_exit(&zonehash_lock);
6246         /*
6247          * We're firmly in the zone; let pools progress.
6248          */
6249         pool_unlock();
6250         task_rele(oldtk);
6251         /*
6252          * We don't need to retain a hold on the zone since we already
6253          * incremented zone_ntasks, so the zone isn't going anywhere.
6254          */
6255         zone_rele(zone);
6256 
6257         /*
6258          * Chroot
6259          */
6260         vp = zone->zone_rootvp;
6261         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6262         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6263 
6264         /*
6265          * Change process security flags.  Note that the _effective_ flags
6266          * cannot change.
6267          */
6268         secflags_copy(&pp->p_secflags.psf_lower,
6269             &zone->zone_secflags.psf_lower);
6270         secflags_copy(&pp->p_secflags.psf_upper,
6271             &zone->zone_secflags.psf_upper);
6272         secflags_copy(&pp->p_secflags.psf_inherit,
6273             &zone->zone_secflags.psf_inherit);
6274 
6275         /*
6276          * Change process credentials
6277          */
6278         newcr = cralloc();
6279         mutex_enter(&pp->p_crlock);
6280         cr = pp->p_cred;
6281         crcopy_to(cr, newcr);
6282         crsetzone(newcr, zone);
6283         pp->p_cred = newcr;
6284 
6285         /*
6286          * Restrict all process privilege sets to zone limit
6287          */
6288         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6289         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6290         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6291         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6292         mutex_exit(&pp->p_crlock);
6293         crset(pp, newcr);
6294 
6295         /*
6296          * Adjust upcount to reflect zone entry.
6297          */
6298         uid = crgetruid(newcr);
6299         mutex_enter(&pidlock);
6300         upcount_dec(uid, GLOBAL_ZONEID);
6301         upcount_inc(uid, zoneid);
6302         mutex_exit(&pidlock);
6303 
6304         /*
6305          * Set up core file path and content.
6306          */
6307         set_core_defaults();
6308 
6309 out:
6310         /*
6311          * Let the other lwps continue.
6312          */
6313         mutex_enter(&pp->p_lock);
6314         if (curthread != pp->p_agenttp)
6315                 continuelwps(pp);
6316         mutex_exit(&pp->p_lock);
6317 
6318         return (err != 0 ? set_errno(err) : 0);
6319 }
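
     /*
      * An illustrative (hypothetical) userland sequence for the above; a
      * zlogin-like consumer running in the global zone might do roughly:
      *
      *         zoneid_t zid = getzoneidbyname("myzone");
      *         if (zid == -1 || zone_enter(zid) != 0)
      *                 err(1, "zone_enter");
      *         (void) execl("/sbin/sh", "sh", (char *)NULL);
      *
      * Once zone_enter() returns successfully the process has been
      * chrooted to the zone's root, so the exec resolves inside the zone.
      */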
6320 
6321 /*
6322  * Systemcall entry point for zone_list(2).
6323  *
6324  * Processes running in a (non-global) zone only see themselves.
6325  * On labeled systems, they see all zones whose label they dominate.
6326  */
6327 static int
6328 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6329 {
6330         zoneid_t *zoneids;
6331         zone_t *zone, *myzone;
6332         uint_t user_nzones, real_nzones;
6333         uint_t domi_nzones;
6334         int error;
6335 
6336         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6337                 return (set_errno(EFAULT));
6338 
6339         myzone = curproc->p_zone;
6340         if (myzone != global_zone) {
6341                 bslabel_t *mybslab;
6342 
6343                 if (!is_system_labeled()) {
6344                         /* just return current zone */
6345                         real_nzones = domi_nzones = 1;
6346                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6347                         zoneids[0] = myzone->zone_id;
6348                 } else {
6349                         /* return all zones that are dominated */
6350                         mutex_enter(&zonehash_lock);
6351                         real_nzones = zonecount;
6352                         domi_nzones = 0;
6353                         if (real_nzones > 0) {
6354                                 zoneids = kmem_alloc(real_nzones *
6355                                     sizeof (zoneid_t), KM_SLEEP);
6356                                 mybslab = label2bslabel(myzone->zone_slabel);
6357                                 for (zone = list_head(&zone_active);
6358                                     zone != NULL;
6359                                     zone = list_next(&zone_active, zone)) {
6360                                         if (zone->zone_id == GLOBAL_ZONEID)
6361                                                 continue;
6362                                         if (zone != myzone &&
6363                                             (zone->zone_flags & ZF_IS_SCRATCH))
6364                                                 continue;
6365                                         /*
6366                                          * Note that a label always dominates
6367                                          * itself, so myzone is always included
6368                                          * in the list.
6369                                          */
6370                                         if (bldominates(mybslab,
6371                                             label2bslabel(zone->zone_slabel))) {
6372                                                 zoneids[domi_nzones++] =
6373                                                     zone->zone_id;
6374                                         }
6375                                 }
6376                         }
6377                         mutex_exit(&zonehash_lock);
6378                 }
6379         } else {
6380                 mutex_enter(&zonehash_lock);
6381                 real_nzones = zonecount;
6382                 domi_nzones = 0;
6383                 if (real_nzones > 0) {
6384                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6385                             KM_SLEEP);
6386                         for (zone = list_head(&zone_active); zone != NULL;
6387                             zone = list_next(&zone_active, zone))
6388                                 zoneids[domi_nzones++] = zone->zone_id;
6389                         ASSERT(domi_nzones == real_nzones);
6390                 }
6391                 mutex_exit(&zonehash_lock);
6392         }
6393 
6394         /*
6395          * If user has allocated space for fewer entries than we found, then
6396          * return only up to their limit.  Either way, tell them exactly how
6397          * many we found.
6398          */
6399         if (domi_nzones < user_nzones)
6400                 user_nzones = domi_nzones;
6401         error = 0;
6402         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6403                 error = EFAULT;
6404         } else if (zoneidlist != NULL && user_nzones != 0) {
6405                 if (copyout(zoneids, zoneidlist,
6406                     user_nzones * sizeof (zoneid_t)) != 0)
6407                         error = EFAULT;
6408         }
6409 
6410         if (real_nzones > 0)
6411                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6412 
6413         if (error != 0)
6414                 return (set_errno(error));
6415         else
6416                 return (0);
6417 }
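
     /*
      * The in/out numzones protocol above implies the usual two-call
      * pattern in userland (an illustrative sketch; error handling and
      * allocation checks are elided):
      *
      *         uint_t nzones = 0;
      *         (void) zone_list(NULL, &nzones);           learn the count
      *         zoneid_t *ids = malloc(nzones * sizeof (zoneid_t));
      *         (void) zone_list(ids, &nzones);            fetch the IDs
      *
      * The count can change between the two calls, so a careful caller
      * re-checks the value written back by the second call.
      */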
6418 
6419 /*
6420  * Systemcall entry point for zone_lookup(2).
6421  *
6422  * Non-global zones are only able to see themselves and (on labeled systems)
6423  * the zones they dominate.
6424  */
6425 static zoneid_t
6426 zone_lookup(const char *zone_name)
6427 {
6428         char *kname;
6429         zone_t *zone;
6430         zoneid_t zoneid;
6431         int err;
6432 
6433         if (zone_name == NULL) {
6434                 /* return caller's zone id */
6435                 return (getzoneid());
6436         }
6437 
6438         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6439         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6440                 kmem_free(kname, ZONENAME_MAX);
6441                 return (set_errno(err));
6442         }
6443 
6444         mutex_enter(&zonehash_lock);
6445         zone = zone_find_all_by_name(kname);
6446         kmem_free(kname, ZONENAME_MAX);
6447         /*
6448          * In a non-global zone, only our own name and the global zone's
6449          * can be looked up.  In Trusted Extensions, zone label dominance rules apply.
6450          */
6451         if (zone == NULL ||
6452             zone_status_get(zone) < ZONE_IS_READY ||
6453             !zone_list_access(zone)) {
6454                 mutex_exit(&zonehash_lock);
6455                 return (set_errno(EINVAL));
6456         } else {
6457                 zoneid = zone->zone_id;
6458                 mutex_exit(&zonehash_lock);
6459                 return (zoneid);
6460         }
6461 }
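
     /*
      * This is, in effect, the kernel side of getzoneidbyname(3C); an
      * illustrative use from userland:
      *
      *         zoneid_t zid = getzoneidbyname("myzone");
      *         if (zid == (zoneid_t)-1)
      *                 ... no such zone, not yet ready, or not visible ...
      */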
6462 
6463 static int
6464 zone_version(int *version_arg)
6465 {
6466         int version = ZONE_SYSCALL_API_VERSION;
6467 
6468         if (copyout(&version, version_arg, sizeof (int)) != 0)
6469                 return (set_errno(EFAULT));
6470         return (0);
6471 }
6472 
6473 /* ARGSUSED */
6474 long
6475 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6476 {
6477         zone_def zs;
6478         int err;
6479 
6480         switch (cmd) {
6481         case ZONE_CREATE:
6482                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6483                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6484                                 return (set_errno(EFAULT));
6485                         }
6486                 } else {
6487 #ifdef _SYSCALL32_IMPL
6488                         zone_def32 zs32;
6489 
6490                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6491                                 return (set_errno(EFAULT));
6492                         }
6493                         zs.zone_name =
6494                             (const char *)(unsigned long)zs32.zone_name;
6495                         zs.zone_root =
6496                             (const char *)(unsigned long)zs32.zone_root;
6497                         zs.zone_privs =
6498                             (const struct priv_set *)
6499                             (unsigned long)zs32.zone_privs;
6500                         zs.zone_privssz = zs32.zone_privssz;
6501                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6502                         zs.rctlbufsz = zs32.rctlbufsz;
6503                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6504                         zs.zfsbufsz = zs32.zfsbufsz;
6505                         zs.extended_error =
6506                             (int *)(unsigned long)zs32.extended_error;
6507                         zs.match = zs32.match;
6508                         zs.doi = zs32.doi;
6509                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6510                         zs.flags = zs32.flags;
6511 #else
6512                         panic("get_udatamodel() returned bogus result\n");
6513 #endif
6514                 }
6515 
6516                 return (zone_create(zs.zone_name, zs.zone_root,
6517                     zs.zone_privs, zs.zone_privssz,
6518                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6519                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6520                     zs.extended_error, zs.match, zs.doi,
6521                     zs.label, zs.flags));
6522         case ZONE_BOOT:
6523                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6524         case ZONE_DESTROY:
6525                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6526         case ZONE_GETATTR:
6527                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6528                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6529         case ZONE_SETATTR:
6530                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6531                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6532         case ZONE_ENTER:
6533                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6534         case ZONE_LIST:
6535                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6536         case ZONE_SHUTDOWN:
6537                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6538         case ZONE_LOOKUP:
6539                 return (zone_lookup((const char *)arg1));
6540         case ZONE_VERSION:
6541                 return (zone_version((int *)arg1));
6542         case ZONE_ADD_DATALINK:
6543                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6544                     (datalink_id_t)(uintptr_t)arg2));
6545         case ZONE_DEL_DATALINK:
6546                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6547                     (datalink_id_t)(uintptr_t)arg2));
6548         case ZONE_CHECK_DATALINK: {
6549                 zoneid_t        zoneid;
6550                 boolean_t       need_copyout;
6551 
6552                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6553                         return (set_errno(EFAULT));
6554                 need_copyout = (zoneid == ALL_ZONES);
6555                 err = zone_check_datalink(&zoneid,
6556                     (datalink_id_t)(uintptr_t)arg2);
6557                 if (err == 0 && need_copyout) {
6558                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6559                                 err = EFAULT;
6560                 }
6561                 return (err == 0 ? 0 : set_errno(err));
6562         }
6563         case ZONE_LIST_DATALINK:
6564                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6565                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6566         default:
6567                 return (set_errno(EINVAL));
6568         }
6569 }
6570 
6571 struct zarg {
6572         zone_t *zone;
6573         zone_cmd_arg_t arg;
6574 };
6575 
6576 static int
6577 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6578 {
6579         char *buf;
6580         size_t buflen;
6581         int error;
6582 
6583         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6584         buf = kmem_alloc(buflen, KM_SLEEP);
6585         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6586         error = door_ki_open(buf, doorp);
6587         kmem_free(buf, buflen);
6588         return (error);
6589 }
6590 
6591 static void
6592 zone_release_door(door_handle_t *doorp)
6593 {
6594         door_ki_rele(*doorp);
6595         *doorp = NULL;
6596 }
6597 
6598 static void
6599 zone_ki_call_zoneadmd(struct zarg *zargp)
6600 {
6601         door_handle_t door = NULL;
6602         door_arg_t darg, save_arg;
6603         char *zone_name;
6604         size_t zone_namelen;
6605         zoneid_t zoneid;
6606         zone_t *zone;
6607         zone_cmd_arg_t arg;
6608         uint64_t uniqid;
6609         size_t size;
6610         int error;
6611         int retry;
6612 
6613         zone = zargp->zone;
6614         arg = zargp->arg;
6615         kmem_free(zargp, sizeof (*zargp));
6616 
6617         zone_namelen = strlen(zone->zone_name) + 1;
6618         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6619         bcopy(zone->zone_name, zone_name, zone_namelen);
6620         zoneid = zone->zone_id;
6621         uniqid = zone->zone_uniqid;
6622         /*
6623          * zoneadmd may be down, but at least we can empty out the zone.
6624          * We can ignore the return value of zone_empty() since we're called
6625          * from a kernel thread and know we won't be delivered any signals.
6626          */
6627         ASSERT(curproc == &p0);
6628         (void) zone_empty(zone);
6629         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6630         zone_rele(zone);
6631 
6632         size = sizeof (arg);
6633         darg.rbuf = (char *)&arg;
6634         darg.data_ptr = (char *)&arg;
6635         darg.rsize = size;
6636         darg.data_size = size;
6637         darg.desc_ptr = NULL;
6638         darg.desc_num = 0;
6639 
6640         save_arg = darg;
6641         /*
6642          * Since we're not holding a reference to the zone, any number of
6643          * things can go wrong, including the zone disappearing before we get a
6644          * chance to talk to zoneadmd.
6645          */
6646         for (retry = 0; /* forever */; retry++) {
6647                 if (door == NULL &&
6648                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6649                         goto next;
6650                 }
6651                 ASSERT(door != NULL);
6652 
6653                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6654                     SIZE_MAX, 0)) == 0) {
6655                         break;
6656                 }
6657                 switch (error) {
6658                 case EINTR:
6659                         /* FALLTHROUGH */
6660                 case EAGAIN:    /* process may be forking */
6661                         /*
6662                          * Back off for a bit
6663                          */
6664                         break;
6665                 case EBADF:
6666                         zone_release_door(&door);
6667                         if (zone_lookup_door(zone_name, &door) != 0) {
6668                                 /*
6669                                  * zoneadmd may be dead, but it may come back to
6670                                  * life later.
6671                                  */
6672                                 break;
6673                         }
6674                         break;
6675                 default:
6676                         cmn_err(CE_WARN,
6677                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6678                             error);
6679                         goto out;
6680                 }
6681 next:
6682                 /*
6683                  * If this isn't the same zone_t that we originally had in mind,
6684                  * then this is the same as if two kadmin requests come in at
6685                  * the same time: the first one wins.  This means we lose, so we
6686                  * bail.
6687                  */
6688                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
6689                         /*
6690                          * Problem is solved.
6691                          */
6692                         break;
6693                 }
6694                 if (zone->zone_uniqid != uniqid) {
6695                         /*
6696                          * zoneid recycled
6697                          */
6698                         zone_rele(zone);
6699                         break;
6700                 }
6701                 /*
6702                  * We could zone_status_timedwait(), but there doesn't seem to
6703                  * be much point in doing that (plus, it would mean that
6704                  * zone_free() isn't called until this thread exits).
6705                  */
6706                 zone_rele(zone);
6707                 delay(hz);
6708                 darg = save_arg;
6709         }
6710 out:
6711         if (door != NULL) {
6712                 zone_release_door(&door);
6713         }
6714         kmem_free(zone_name, zone_namelen);
6715         thread_exit();
6716 }
6717 
6718 /*
6719  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6720  * kadmin().  The caller is a process in the zone.
6721  *
6722  * In order to shut down the zone, we will hand off control to zoneadmd
6723  * (running in the global zone) via a door.  We do a half-hearted job of
6724  * killing all processes in the zone, create a kernel thread to contact
6725  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6726  * a form of generation number used to let zoneadmd (as well as
6727  * zone_destroy()) know exactly which zone they're talking about.
6728  */
6729 int
6730 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6731 {
6732         struct zarg *zargp;
6733         zone_cmd_t zcmd;
6734         zone_t *zone;
6735 
6736         zone = curproc->p_zone;
6737         ASSERT(getzoneid() != GLOBAL_ZONEID);
6738 
6739         switch (cmd) {
6740         case A_SHUTDOWN:
6741                 switch (fcn) {
6742                 case AD_HALT:
6743                 case AD_POWEROFF:
6744                         zcmd = Z_HALT;
6745                         break;
6746                 case AD_BOOT:
6747                         zcmd = Z_REBOOT;
6748                         break;
6749                 case AD_IBOOT:
6750                 case AD_SBOOT:
6751                 case AD_SIBOOT:
6752                 case AD_NOSYNC:
6753                         return (ENOTSUP);
6754                 default:
6755                         return (EINVAL);
6756                 }
6757                 break;
6758         case A_REBOOT:
6759                 zcmd = Z_REBOOT;
6760                 break;
6761         case A_FTRACE:
6762         case A_REMOUNT:
6763         case A_FREEZE:
6764         case A_DUMP:
6765         case A_CONFIG:
6766                 return (ENOTSUP);
6767         default:
6768                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6769                 return (EINVAL);
6770         }
6771 
6772         if (secpolicy_zone_admin(credp, B_FALSE))
6773                 return (EPERM);
6774         mutex_enter(&zone_status_lock);
6775 
6776         /*
6777          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6778          * is in the zone.
6779          */
6780         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6781         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6782                 /*
6783                  * This zone is already on its way down.
6784                  */
6785                 mutex_exit(&zone_status_lock);
6786                 return (0);
6787         }
6788         /*
6789          * Prevent future zone_enter()s
6790          */
6791         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6792         mutex_exit(&zone_status_lock);
6793 
6794         /*
6795          * Kill everyone now and call zoneadmd later.
6796          * zone_ki_call_zoneadmd() will do a more thorough job of this
6797          * later.
6798          */
6799         killall(zone->zone_id);
6800         /*
6801          * Now, create the thread to contact zoneadmd and do the rest of the
6802          * work.  This thread can't be created in our zone, otherwise
6803          * zone_destroy() would deadlock.
6804          */
6805         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6806         zargp->arg.cmd = zcmd;
6807         zargp->arg.uniqid = zone->zone_uniqid;
6808         zargp->zone = zone;
6809         (void) strcpy(zargp->arg.locale, "C");
6810         /* mdep was already copied in for us by uadmin */
6811         if (mdep != NULL)
6812                 (void) strlcpy(zargp->arg.bootbuf, mdep,
6813                     sizeof (zargp->arg.bootbuf));
6814         zone_hold(zone);
6815 
6816         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6817             TS_RUN, minclsyspri);
6818         exit(CLD_EXITED, 0);
6819 
6820         return (EINVAL);
6821 }
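
     /*
      * For example (illustrative), "reboot" run inside a zone arrives
      * here as uadmin(A_SHUTDOWN, AD_BOOT, ...) and is translated into a
      * Z_REBOOT request that the kernel thread hands to zoneadmd:
      *
      *         (void) uadmin(A_SHUTDOWN, AD_BOOT, 0);     from in a zone
      */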
6822 
6823 /*
6824  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6825  * status to ZONE_IS_SHUTTING_DOWN.
6826  *
6827  * This function also shuts down all running zones to ensure that they won't
6828  * fork new processes.
6829  */
6830 void
6831 zone_shutdown_global(void)
6832 {
6833         zone_t *current_zonep;
6834 
6835         ASSERT(INGLOBALZONE(curproc));
6836         mutex_enter(&zonehash_lock);
6837         mutex_enter(&zone_status_lock);
6838 
6839         /* Modify the global zone's status first. */
6840         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6841         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6842 
6843         /*
6844          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6845          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6846          * could cause assertions to fail (e.g., assertions about a zone's
6847          * state during initialization, readying, or booting) or produce races.
6848          * We'll let threads continue to initialize and ready new zones: they'll
6849          * fail to boot the new zones when they see that the global zone is
6850          * shutting down.
6851          */
6852         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6853             current_zonep = list_next(&zone_active, current_zonep)) {
6854                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6855                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6856         }
6857         mutex_exit(&zone_status_lock);
6858         mutex_exit(&zonehash_lock);
6859 }
6860 
6861 /*
6862  * Returns true if the named dataset is visible in the current zone.
6863  * The 'write' parameter is set to 1 if the dataset is also writable.
6864  */
6865 int
6866 zone_dataset_visible(const char *dataset, int *write)
6867 {
6868         static int zfstype = -1;
6869         zone_dataset_t *zd;
6870         size_t len;
6871         zone_t *zone = curproc->p_zone;
6872         const char *name = NULL;
6873         vfs_t *vfsp = NULL;
6874 
6875         if (dataset[0] == '\0')
6876                 return (0);
6877 
6878         /*
6879          * Walk the list once, looking for datasets which match exactly, or
6880          * specify a dataset underneath an exported dataset.  If found, return
6881          * true and note that it is writable.
6882          */
6883         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6884             zd = list_next(&zone->zone_datasets, zd)) {
6885 
6886                 len = strlen(zd->zd_dataset);
6887                 if (strlen(dataset) >= len &&
6888                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6889                     (dataset[len] == '\0' || dataset[len] == '/' ||
6890                     dataset[len] == '@')) {
6891                         if (write)
6892                                 *write = 1;
6893                         return (1);
6894                 }
6895         }
6896 
6897         /*
6898          * Walk the list a second time, searching for datasets which are parents
6899          * of exported datasets.  These should be visible, but read-only.
6900          *
6901          * Note that we also have to support forms such as 'pool/dataset/', with
6902          * a trailing slash.
6903          */
6904         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6905             zd = list_next(&zone->zone_datasets, zd)) {
6906 
6907                 len = strlen(dataset);
6908                 if (dataset[len - 1] == '/')
6909                         len--;  /* Ignore trailing slash */
6910                 if (len < strlen(zd->zd_dataset) &&
6911                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6912                     zd->zd_dataset[len] == '/') {
6913                         if (write)
6914                                 *write = 0;
6915                         return (1);
6916                 }
6917         }
6918 
6919         /*
6920          * We reach here if the given dataset is not found in the zone_dataset
6921  * list. Check if this dataset was added as a filesystem (i.e., "add fs")
6922  * instead of being delegated. For this we search for the dataset in the
6923          * zone_vfslist of this zone. If found, return true and note that it is
6924          * not writable.
6925          */
6926 
6927         /*
6928          * Initialize zfstype if it is not initialized yet.
6929          */
6930         if (zfstype == -1) {
6931                 struct vfssw *vswp = vfs_getvfssw("zfs");
6932                 zfstype = vswp - vfssw;
6933                 vfs_unrefvfssw(vswp);
6934         }
6935 
6936         vfs_list_read_lock();
6937         vfsp = zone->zone_vfslist;
6938         do {
6939                 ASSERT(vfsp);
6940                 if (vfsp->vfs_fstype == zfstype) {
6941                         name = refstr_value(vfsp->vfs_resource);
6942 
6943                         /*
6944                          * Check if we have an exact match.
6945                          */
6946                         if (strcmp(dataset, name) == 0) {
6947                                 vfs_list_unlock();
6948                                 if (write)
6949                                         *write = 0;
6950                                 return (1);
6951                         }
6952                         /*
6953                          * We need to check if we are looking for parents of
6954                          * a dataset. These should be visible, but read-only.
6955                          */
6956                         len = strlen(dataset);
6957                         if (dataset[len - 1] == '/')
6958                                 len--;
6959 
6960                         if (len < strlen(name) &&
6961                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6962                                 vfs_list_unlock();
6963                                 if (write)
6964                                         *write = 0;
6965                                 return (1);
6966                         }
6967                 }
6968                 vfsp = vfsp->vfs_zone_next;
6969         } while (vfsp != zone->zone_vfslist);
6970 
6971         vfs_list_unlock();
6972         return (0);
6973 }
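
     /*
      * To make the matching rules above concrete, suppose (illustrative
      * names) that "tank/zones/z1" has been delegated to this zone:
      *
      *         tank/zones/z1           visible, writable (exact match)
      *         tank/zones/z1/fs        visible, writable (child)
      *         tank/zones/z1@snap      visible, writable (snapshot)
      *         tank/zones              visible, read-only (parent)
      *         tank/zones/             visible, read-only (trailing slash)
      *         tank/other              not visible
      */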
6974 
6975 /*
6976  * zone_find_by_any_path() -
6977  *
6978  * kernel-private routine similar to zone_find_by_path(), but which
6979  * effectively compares against zone paths rather than zonerootpath
6980  * (i.e., the last component of zonerootpaths, which should be "root/",
6981  * is not compared.)  This is done in order to accurately identify all
6982  * paths, whether zone-visible or not, including those which are parallel
6983  * to /root/, such as /dev/, /home/, etc...
6984  *
6985  * If the specified path does not fall under any zone path then global
6986  * zone is returned.
6987  *
6988  * The treat_abs parameter indicates whether the path should be treated as
6989  * an absolute path although it does not begin with "/".  (This supports
6990  * nfs mount syntax such as host:any/path.)
6991  *
6992  * The caller is responsible for zone_rele of the returned zone.
6993  */
6994 zone_t *
6995 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6996 {
6997         zone_t *zone;
6998         int path_offset = 0;
6999 
7000         if (path == NULL) {
7001                 zone_hold(global_zone);
7002                 return (global_zone);
7003         }
7004 
7005         if (*path != '/') {
7006                 ASSERT(treat_abs);
7007                 path_offset = 1;
7008         }
7009 
7010         mutex_enter(&zonehash_lock);
7011         for (zone = list_head(&zone_active); zone != NULL;
7012             zone = list_next(&zone_active, zone)) {
7013                 char    *c;
7014                 size_t  pathlen;
7015                 char *rootpath_start;
7016 
7017                 if (zone == global_zone)        /* skip global zone */
7018                         continue;
7019 
7020                 /* scan backwards to find start of last component */
7021                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7022                 do {
7023                         c--;
7024                 } while (*c != '/');
7025 
7026                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
7027                 rootpath_start = (zone->zone_rootpath + path_offset);
7028                 if (strncmp(path, rootpath_start, pathlen) == 0)
7029                         break;
7030         }
7031         if (zone == NULL)
7032                 zone = global_zone;
7033         zone_hold(zone);
7034         mutex_exit(&zonehash_lock);
7035         return (zone);
7036 }
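
     /*
      * For example (illustrative), for a zone whose zone_rootpath is
      * "/zones/z1/root/", only the "/zones/z1/" prefix is compared, so
      * this routine returns that zone for "/zones/z1/root/etc/passwd" as
      * well as for parallel paths such as "/zones/z1/dev/null" and
      * "/zones/z1/home/user".
      */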
7037 
7038 /*
7039  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7040  * zone_dl_t pointer if found, and NULL otherwise.
7041  */
7042 static zone_dl_t *
7043 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7044 {
7045         zone_dl_t *zdl;
7046 
7047         ASSERT(mutex_owned(&zone->zone_lock));
7048         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7049             zdl = list_next(&zone->zone_dl_list, zdl)) {
7050                 if (zdl->zdl_id == linkid)
7051                         break;
7052         }
7053         return (zdl);
7054 }
7055 
7056 static boolean_t
7057 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7058 {
7059         boolean_t exists;
7060 
7061         mutex_enter(&zone->zone_lock);
7062         exists = (zone_find_dl(zone, linkid) != NULL);
7063         mutex_exit(&zone->zone_lock);
7064         return (exists);
7065 }
7066 
7067 /*
7068  * Add a datalink ID to the zone.
7069  */
7070 static int
7071 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7072 {
7073         zone_dl_t *zdl;
7074         zone_t *zone;
7075         zone_t *thiszone;
7076 
7077         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7078                 return (set_errno(ENXIO));
7079 
7080         /* Verify that the datalink ID doesn't already belong to a zone. */
7081         mutex_enter(&zonehash_lock);
7082         for (zone = list_head(&zone_active); zone != NULL;
7083             zone = list_next(&zone_active, zone)) {
7084                 if (zone_dl_exists(zone, linkid)) {
7085                         mutex_exit(&zonehash_lock);
7086                         zone_rele(thiszone);
7087                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7088                 }
7089         }
7090 
7091         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7092         zdl->zdl_id = linkid;
7093         zdl->zdl_net = NULL;
7094         mutex_enter(&thiszone->zone_lock);
7095         list_insert_head(&thiszone->zone_dl_list, zdl);
7096         mutex_exit(&thiszone->zone_lock);
7097         mutex_exit(&zonehash_lock);
7098         zone_rele(thiszone);
7099         return (0);
7100 }
7101 
7102 static int
7103 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7104 {
7105         zone_dl_t *zdl;
7106         zone_t *zone;
7107         int err = 0;
7108 
7109         if ((zone = zone_find_by_id(zoneid)) == NULL)
7110                 return (set_errno(EINVAL));
7111 
7112         mutex_enter(&zone->zone_lock);
7113         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7114                 err = ENXIO;
7115         } else {
7116                 list_remove(&zone->zone_dl_list, zdl);
7117                 nvlist_free(zdl->zdl_net);
7118                 kmem_free(zdl, sizeof (zone_dl_t));
7119         }
7120         mutex_exit(&zone->zone_lock);
7121         zone_rele(zone);
7122         return (err == 0 ? 0 : set_errno(err));
7123 }
7124 
7125 /*
7126  * If *zoneidp is ALL_ZONES, we can look up which zone has been assigned
7127  * the linkid.  Otherwise we just check if the specified zoneidp has been
7128  * assigned the supplied linkid.
7129  */
7130 int
7131 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7132 {
7133         zone_t *zone;
7134         int err = ENXIO;
7135 
7136         if (*zoneidp != ALL_ZONES) {
7137                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7138                         if (zone_dl_exists(zone, linkid))
7139                                 err = 0;
7140                         zone_rele(zone);
7141                 }
7142                 return (err);
7143         }
7144 
7145         mutex_enter(&zonehash_lock);
7146         for (zone = list_head(&zone_active); zone != NULL;
7147             zone = list_next(&zone_active, zone)) {
7148                 if (zone_dl_exists(zone, linkid)) {
7149                         *zoneidp = zone->zone_id;
7150                         err = 0;
7151                         break;
7152                 }
7153         }
7154         mutex_exit(&zonehash_lock);
7155         return (err);
7156 }
7157 
7158 /*
7159  * Get the list of datalink IDs assigned to a zone.
7160  *
7161  * On input, *nump is the number of datalink IDs that can fit in the supplied
7162  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7163  * that were placed in the array if the array was large enough, or to the
7164  * number of datalink IDs that the function needs to place in the array if the
7165  * array is too small.
7166  */
7167 static int
7168 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7169 {
7170         uint_t num, dlcount;
7171         zone_t *zone;
7172         zone_dl_t *zdl;
7173         datalink_id_t *idptr = idarray;
7174 
7175         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7176                 return (set_errno(EFAULT));
7177         if ((zone = zone_find_by_id(zoneid)) == NULL)
7178                 return (set_errno(ENXIO));
7179 
7180         num = 0;
7181         mutex_enter(&zone->zone_lock);
7182         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7183             zdl = list_next(&zone->zone_dl_list, zdl)) {
7184                 /*
7185                  * If the list is bigger than what the caller supplied, just
7186                  * count; don't do the copyout.
7187                  */
7188                 if (++num > dlcount)
7189                         continue;
7190                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7191                         mutex_exit(&zone->zone_lock);
7192                         zone_rele(zone);
7193                         return (set_errno(EFAULT));
7194                 }
7195                 idptr++;
7196         }
7197         mutex_exit(&zone->zone_lock);
7198         zone_rele(zone);
7199 
7200         /* Whether the count increased or decreased, notify the caller. */
        if (num != dlcount) {
                if (copyout(&num, nump, sizeof (num)) != 0)
                        return (set_errno(EFAULT));
        }
        return (0);
}

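/*
 * The *nump handshake gives callers a two-pass protocol: probe with a
 * count of zero to learn the required array size, then call again with a
 * buffer of that size.  A hedged userland-side sketch follows, where
 * zone_list_dls() stands in for whatever wrapper reaches this code via
 * the zone syscall (the wrapper name is hypothetical, not an interface
 * defined here).  Note the count may change between the two calls:
 *
 *	int num = 0;
 *	datalink_id_t *ids;
 *
 *	(void) zone_list_dls(zoneid, &num, NULL);	// learn the count
 *	ids = malloc(num * sizeof (datalink_id_t));
 *	if (zone_list_dls(zoneid, &num, ids) == 0) {
 *		// ids[0 .. num - 1] hold the assigned datalink IDs
 *	}
 */
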
/*
 * Public interface for looking up a zone by zoneid.  This is a version
 * customized for netstack_zone_create(); it may only be called from the
 * ZSD create callbacks, because it does not take a reference on the zone
 * structure: if it were called from anywhere else, the zone could
 * disappear as soon as the zonehash_lock is dropped.
 *
 * Furthermore it:
 * 1. Doesn't check the status of the zone.
 * 2. May be called even before zone_init(); in that case the address of
 *    zone0 is returned directly, and netstack_zone_create() will only
 *    assign a value to zone0.zone_netstack, which won't break anything.
 * 3. Returns without the zone being held.
 */
zone_t *
zone_find_by_id_nolock(zoneid_t zoneid)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        if (zonehashbyid == NULL)
                zone = &zone0;
        else
                zone = zone_find_all_by_id(zoneid);
        mutex_exit(&zonehash_lock);
        return (zone);
}

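/*
 * A minimal sketch (not part of the original source) of the one context in
 * which zone_find_by_id_nolock() is safe to use: a ZSD create callback,
 * registered via zone_key_create().  The callback below is hypothetical
 * and exists purely for illustration.
 */
static void *
example_zsd_create(zoneid_t zoneid)
{
        zone_t *zone = zone_find_by_id_nolock(zoneid);

        /*
         * The zone pointer may be used for the duration of the callback,
         * but must not be cached without taking a hold via zone_hold(),
         * since no reference is returned.
         */
        ASSERT(zone != NULL);
        return (NULL);          /* this sketch keeps no per-zone state */
}
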
/*
 * Walk the datalinks for a given zone, invoking cb() on each datalink ID;
 * a non-zero return value from cb() terminates the walk early and is
 * passed back to the caller.
 */
int
zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
    void *data)
{
        zone_t          *zone;
        zone_dl_t       *zdl;
        datalink_id_t   *idarray;
        uint_t          idcount = 0;
        int             i, ret = 0;

        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (ENOENT);

        /*
         * We first build an array of linkids so that we can walk them and
         * invoke the callback with the zone_lock dropped.
         */
        mutex_enter(&zone->zone_lock);
        for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
            zdl = list_next(&zone->zone_dl_list, zdl)) {
                idcount++;
        }

        if (idcount == 0) {
                mutex_exit(&zone->zone_lock);
                zone_rele(zone);
                return (0);
        }

        idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
        if (idarray == NULL) {
                mutex_exit(&zone->zone_lock);
                zone_rele(zone);
                return (ENOMEM);
        }

        for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
            i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
                idarray[i] = zdl->zdl_id;
        }

        mutex_exit(&zone->zone_lock);

        for (i = 0; i < idcount && ret == 0; i++) {
                if ((ret = (*cb)(idarray[i], data)) != 0)
                        break;
        }

        zone_rele(zone);
        kmem_free(idarray, sizeof (datalink_id_t) * idcount);
        return (ret);
}

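/*
 * A minimal sketch (not part of the original source) of a
 * zone_datalink_walk() consumer.  count_datalink_cb() and
 * zone_count_datalinks() are hypothetical helpers shown purely for
 * illustration: the callback tallies each datalink ID it is handed, and
 * returning 0 lets the walk run to completion.
 */
static int
count_datalink_cb(datalink_id_t linkid, void *arg)
{
        uint_t *countp = arg;

        (*countp)++;
        return (0);
}

static int
zone_count_datalinks(zoneid_t zoneid, uint_t *countp)
{
        *countp = 0;
        return (zone_datalink_walk(zoneid, count_datalink_cb, countp));
}
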
static char *
zone_net_type2name(int type)
{
        switch (type) {
        case ZONE_NETWORK_ADDRESS:
                return (ZONE_NET_ADDRNAME);
        case ZONE_NETWORK_DEFROUTER:
                return (ZONE_NET_RTRNAME);
        default:
                return (NULL);
        }
}

static int
zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
{
        zone_t *zone;
        zone_dl_t *zdl;
        nvlist_t *nvl;
        int err = 0;
        uint8_t *new = NULL;
        char *nvname;
        int bufsize;
        datalink_id_t linkid = znbuf->zn_linkid;

        if (secpolicy_zone_config(CRED()) != 0)
                return (set_errno(EPERM));

        if (zoneid == GLOBAL_ZONEID)
                return (set_errno(EINVAL));

        nvname = zone_net_type2name(znbuf->zn_type);
        bufsize = znbuf->zn_len;
        new = znbuf->zn_val;
        if (nvname == NULL)
                return (set_errno(EINVAL));

        if ((zone = zone_find_by_id(zoneid)) == NULL) {
                return (set_errno(EINVAL));
        }

        mutex_enter(&zone->zone_lock);
        if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
                err = ENXIO;
                goto done;
        }
        if ((nvl = zdl->zdl_net) == NULL) {
                if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
                        err = ENOMEM;
                        goto done;
                } else {
                        zdl->zdl_net = nvl;
                }
        }
        if (nvlist_exists(nvl, nvname)) {
                err = EINVAL;
                goto done;
        }
        err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
        ASSERT(err == 0);
done:
        mutex_exit(&zone->zone_lock);
        zone_rele(zone);
        if (err != 0)
                return (set_errno(err));
        else
                return (0);
}

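/*
 * A hedged sketch of the set-side protocol.  The caller (reached via the
 * zone syscall; the dispatch path is elided here) supplies a
 * zone_net_data_t whose zn_val carries zn_len bytes to be stored under
 * the nvlist name selected by zn_type; addr and addrlen below are
 * illustrative variables, e.g.:
 *
 *	znbuf->zn_type = ZONE_NETWORK_ADDRESS;
 *	znbuf->zn_linkid = linkid;
 *	znbuf->zn_len = addrlen;
 *	bcopy(addr, znbuf->zn_val, addrlen);
 *
 * Note that each (linkid, type) pair may be set only once: a second
 * attempt fails with EINVAL because the name already exists in the
 * per-link nvlist.
 */
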
static int
zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
{
        zone_t *zone;
        zone_dl_t *zdl;
        nvlist_t *nvl;
        uint8_t *ptr;
        uint_t psize;
        int err = 0;
        char *nvname;
        int bufsize;
        void *buf;
        datalink_id_t linkid = znbuf->zn_linkid;

        if (zoneid == GLOBAL_ZONEID)
                return (set_errno(EINVAL));

        nvname = zone_net_type2name(znbuf->zn_type);
        bufsize = znbuf->zn_len;
        buf = znbuf->zn_val;

        if (nvname == NULL)
                return (set_errno(EINVAL));
        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (set_errno(EINVAL));

        mutex_enter(&zone->zone_lock);
        if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
                err = ENXIO;
                goto done;
        }
        if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
                err = ENOENT;
                goto done;
        }
        err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
        ASSERT(err == 0);

        if (psize > bufsize) {
                err = ENOBUFS;
                goto done;
        }
        znbuf->zn_len = psize;
        bcopy(ptr, buf, psize);
done:
        mutex_exit(&zone->zone_lock);
        zone_rele(zone);
        if (err != 0)
                return (set_errno(err));
        else
                return (0);
}
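
/*
 * A hedged sketch of the matching get-side protocol: the caller sets
 * zn_type and zn_linkid to identify the stored value, and zn_len to the
 * capacity of zn_val.  On success zn_len is rewritten to the number of
 * bytes copied out; if the stored value is larger than the supplied
 * buffer, the call fails with ENOBUFS and nothing is copied, e.g.:
 *
 *	znbuf->zn_type = ZONE_NETWORK_DEFROUTER;
 *	znbuf->zn_linkid = linkid;
 *	znbuf->zn_len = bufsize;	// capacity of zn_val on input
 *	// after a successful get, zn_val holds zn_len bytes
 */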