/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016, Joyent Inc.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 *   callbacks have not yet run. It is not possible to enter the zone, but
 *   attributes can be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.   A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, and the system
 *   is killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; the zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, the ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
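 *
 *   For example (a minimal sketch, assuming the zone_status_wait()
 *   prototype used elsewhere in this file), a thread that must not
 *   proceed until the zone is at least running could block with:
 *
 *      zone_status_wait(zone, ZONE_IS_RUNNING);
 *
 *   which returns once the zone's state is ZONE_IS_RUNNING or any
 *   later state.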
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
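 *
 *   For example (a minimal sketch; the "foo" names are hypothetical), a
 *   subsystem would typically allocate its per-zone state in its create
 *   callback and free it in its destroy callback:
 *
 *      static zone_key_t foo_zone_key;
 *
 *      static void *
 *      foo_zone_create(zoneid_t zoneid)
 *      {
 *              return (kmem_zalloc(sizeof (struct foo_state), KM_SLEEP));
 *      }
 *
 *      static void
 *      foo_zone_destroy(zoneid_t zoneid, void *data)
 *      {
 *              kmem_free(data, sizeof (struct foo_state));
 *      }
 *
 *      void
 *      foo_init(void)
 *      {
 *              zone_key_create(&foo_zone_key, foo_zone_create, NULL,
 *                  foo_zone_destroy);
 *      }
 *
 *   Thereafter, zone_getspecific(foo_zone_key, zone) returns that zone's
 *   private "struct foo_state".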
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (the global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
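 *
 *   For example (a minimal sketch, assuming the zone_find_by_id()
 *   prototype used elsewhere in the kernel):
 *
 *      zone_t *zone = zone_find_by_id(zoneid);
 *
 *      if (zone != NULL) {
 *              ... use zone ...
 *              zone_rele(zone);
 *      }
 *
 *   The NULL check is needed because the zone may have begun dying (or
 *   never existed) by the time of the lookup.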
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just max_lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
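 *
 *   For example (a sketch illustrating only the ordering rules above),
 *   code that needs both the zone list and zone state must take the
 *   locks in this order, never the reverse:
 *
 *      mutex_enter(&zonehash_lock);
 *      mutex_enter(&zone_status_lock);
 *      ... examine or change zone state ...
 *      mutex_exit(&zone_status_lock);
 *      mutex_exit(&zonehash_lock);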
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up a zone id based on name
 *   - zone_shutdown: initiates the shutdown process (see states above)
 *   - zone_destroy: completes the shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
#include <sys/rt.h>
#include <sys/fx.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_cpu_baseline;
rctl_hndl_t rc_zone_cpu_burst_time;
rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 * Version 7 adds the requested zoneid to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 7;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created/destroyed such
 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong zone's
 * mount list.  Since a zone can't reside on an NFS file system, we don't
 * have to worry about the zonepath itself.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone.  This synchronization is on a per-zone basis, so
 * activity for one zone will not interfere with activity for another zone.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone state transitions, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is an unfair bias towards the
 * "current" operation.  This means that zone halt may starve if
 * there is a rapid succession of new mounts coming in to the zone.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(zone_t *zp)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress > 0) {
                if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers
         * (remotely possible if two threads enter zone_shutdown at the same
         * time).
         */
        zp->zone_mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&zp->zone_mount_lock);
        return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (++zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * The VFS layer is busy with a mount; this zone should wait until all
 * of its mounts are completed to progress.
 */
void
mount_in_progress(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress < 0)
                cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
        zp->zone_mounts_in_progress++;
        mutex_exit(&zp->zone_mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (--zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}
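
/*
 * For example (a sketch of the intended pairing; the VFS layer's actual
 * call sites live elsewhere), a mount would be bracketed with:
 *
 *      mount_in_progress(zp);
 *      error = VFS_MOUNT(vfsp, mvp, uap, cr);
 *      mount_completed(zp);
 *
 * while zone_shutdown() brackets its state transition with block_mounts()
 * and resume_mounts().
 */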

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock).  The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys.  Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock.  And zsd_flags are used to ensure that the operations
 * completed so that when zone_key_create (and zone_create) is done, as well as
 * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 * are completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.  That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
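
/*
 * For example (a minimal sketch; the "foo" names are hypothetical), a
 * caller needing "set it only if not already set" semantics must wrap
 * the get/set pair in its own lock:
 *
 *      mutex_enter(&foo_lock);
 *      if (zone_getspecific(foo_zone_key, zone) == NULL)
 *              (void) zone_setspecific(foo_zone_key, zone, foo_data);
 *      mutex_exit(&foo_lock);
 */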

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.  Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t  key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * A zsd_configure already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now; make it
         * globally visible.  Specifically zone_getspecific() will
         * always successfully return the zone specific data associated
         * with the key.
         */
        *keyp = key;
}
 695 
 696 /*
 697  * Function called when a module is being unloaded, or otherwise wishes
 698  * to unregister its ZSD key and callbacks.
 699  *
 700  * Remove from the global list and determine the functions that need to
 701  * be called under a global lock. Then call the functions without
 702  * holding any locks. Finally free up the zone_zsd entries. (The apply
 703  * functions need to access the zone_zsd entries to find zsd_data etc.)
 704  */
 705 int
 706 zone_key_delete(zone_key_t key)
 707 {
 708         struct zsd_entry *zsdp = NULL;
 709         zone_t *zone;
 710 
 711         mutex_enter(&zsd_key_lock);
 712         zsdp = zsd_find_mru(&zsd_registered_keys, key);
 713         if (zsdp == NULL) {
 714                 mutex_exit(&zsd_key_lock);
 715                 return (-1);
 716         }
 717         list_remove(&zsd_registered_keys, zsdp);
 718         mutex_exit(&zsd_key_lock);
 719 
 720         mutex_enter(&zonehash_lock);
 721         for (zone = list_head(&zone_active); zone != NULL;
 722             zone = list_next(&zone_active, zone)) {
 723                 struct zsd_entry *del;
 724 
 725                 mutex_enter(&zone->zone_lock);
 726                 del = zsd_find_mru(&zone->zone_zsd, key);
 727                 if (del == NULL) {
 728                         /*
 729                          * Somebody else got here first e.g the zone going
 730                          * away.
 731                          */
 732                         mutex_exit(&zone->zone_lock);
 733                         continue;
 734                 }
 735                 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 736                 ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 737                 if (del->zsd_shutdown != NULL &&
 738                     (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 739                         del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 740                         DTRACE_PROBE2(zsd__shutdown__needed,
 741                             zone_t *, zone, zone_key_t, key);
 742                 }
 743                 if (del->zsd_destroy != NULL &&
 744                     (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 745                         del->zsd_flags |= ZSD_DESTROY_NEEDED;
 746                         DTRACE_PROBE2(zsd__destroy__needed,
 747                             zone_t *, zone, zone_key_t, key);
 748                 }
 749                 mutex_exit(&zone->zone_lock);
 750         }
 751         mutex_exit(&zonehash_lock);
 752         kmem_free(zsdp, sizeof (*zsdp));
 753 
 754         /* Now call the shutdown and destroy callback for this key */
 755         zsd_apply_all_zones(zsd_apply_shutdown, key);
 756         zsd_apply_all_zones(zsd_apply_destroy, key);
 757 
 758         /* Now we can free up the zsdp structures in each zone */
 759         mutex_enter(&zonehash_lock);
 760         for (zone = list_head(&zone_active); zone != NULL;
 761             zone = list_next(&zone_active, zone)) {
 762                 struct zsd_entry *del;
 763 
 764                 mutex_enter(&zone->zone_lock);
 765                 del = zsd_find(&zone->zone_zsd, key);
 766                 if (del != NULL) {
 767                         list_remove(&zone->zone_zsd, del);
 768                         ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 769                         kmem_free(del, sizeof (*del));
 770                 }
 771                 mutex_exit(&zone->zone_lock);
 772         }
 773         mutex_exit(&zonehash_lock);
 774 
 775         return (0);
 776 }
 777 
 778 /*
 779  * ZSD counterpart of pthread_setspecific().
 780  *
 781  * Since all zsd callbacks, including those with no create function,
 782  * have an entry in zone_zsd, if the key is registered it is part of
 783  * the zone_zsd list.
 784  * Return an error if the key wasn't registerd.
 785  */
 786 int
 787 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 788 {
 789         struct zsd_entry *t;
 790 
 791         mutex_enter(&zone->zone_lock);
 792         t = zsd_find_mru(&zone->zone_zsd, key);
 793         if (t != NULL) {
 794                 /*
 795                  * Replace old value with new
 796                  */
 797                 t->zsd_data = (void *)data;
 798                 mutex_exit(&zone->zone_lock);
 799                 return (0);
 800         }
 801         mutex_exit(&zone->zone_lock);
 802         return (-1);
 803 }
 804 
 805 /*
 806  * ZSD counterpart of pthread_getspecific().
 807  */
 808 void *
 809 zone_getspecific(zone_key_t key, zone_t *zone)
 810 {
 811         struct zsd_entry *t;
 812         void *data;
 813 
 814         mutex_enter(&zone->zone_lock);
 815         t = zsd_find_mru(&zone->zone_zsd, key);
 816         data = (t == NULL ? NULL : t->zsd_data);
 817         mutex_exit(&zone->zone_lock);
 818         return (data);
 819 }

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys).  The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;

        ASSERT(MUTEX_HELD(&zonehash_lock));
        ASSERT(list_head(&zone->zone_zsd) == NULL);
        mutex_enter(&zone->zone_lock);
        mutex_enter(&zsd_key_lock);
        for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
            zsdp = list_next(&zsd_registered_keys, zsdp)) {
                /*
                 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
                 * should not have added anything to it.
                 */
                ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = zsdp->zsd_key;
                t->zsd_create = zsdp->zsd_create;
                t->zsd_shutdown = zsdp->zsd_shutdown;
                t->zsd_destroy = zsdp->zsd_destroy;
                if (zsdp->zsd_create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, zsdp->zsd_key);
                }
                list_insert_tail(&zone->zone_zsd, t);
        }
        mutex_exit(&zsd_key_lock);
        mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
        struct zsd_entry *t;

        ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
        ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
        ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

        /*
         * Run the callback solely based on what is registered for the zone
         * in zone_zsd.  The global list can change independently of this
         * as keys are registered and unregistered and we don't register new
         * callbacks for a zone that is in the process of going away.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL;
            t = list_next(&zone->zone_zsd, t)) {
                zone_key_t key = t->zsd_key;

                /* Skip if no callbacks registered */

                if (ct == ZSD_SHUTDOWN) {
                        if (t->zsd_shutdown != NULL &&
                            (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                                t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                                DTRACE_PROBE2(zsd__shutdown__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                } else {
                        if (t->zsd_destroy != NULL &&
                            (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                                t->zsd_flags |= ZSD_DESTROY_NEEDED;
                                DTRACE_PROBE2(zsd__destroy__needed,
                                    zone_t *, zone, zone_key_t, key);
                        }
                }
        }
        mutex_exit(&zone->zone_lock);

        /* Now call the shutdown and destroy callbacks for this zone */
        zsd_apply_all_keys(zsd_apply_shutdown, zone);
        zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
        struct zsd_entry *t, *next;

        /*
         * Free all the zsd_entry's we had on this zone.
         */
        mutex_enter(&zone->zone_lock);
        for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
                next = list_next(&zone->zone_zsd, t);
                list_remove(&zone->zone_zsd, t);
                ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_zsd);
        mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards.  This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        zone = list_head(&zone_active);
        while (zone != NULL) {
                if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
                        /* Lock dropped - restart at head */
                        zone = list_head(&zone_active);
                } else {
                        zone = list_next(&zone_active, zone);
                }
        }
        mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards.  This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = list_head(&zone->zone_zsd);
        while (t != NULL) {
                if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
                        /* Lock dropped - restart at head */
                        t = list_head(&zone->zone_zsd);
                } else {
                        t = list_next(&zone->zone_zsd, t);
                }
        }
        mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        void *result;
        struct zsd_entry *t;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                t->zsd_flags &= ~ZSD_CREATE_NEEDED;
                t->zsd_flags |= ZSD_CREATE_INPROGRESS;
                DTRACE_PROBE2(zsd__create__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);

                dropped = B_TRUE;
                ASSERT(t->zsd_create != NULL);
                DTRACE_PROBE2(zsd__create__start,
                    zone_t *, zone, zone_key_t, key);

                result = (*t->zsd_create)(zone->zone_id);

                DTRACE_PROBE2(zsd__create__end,
                    zone_t *, zone, void *, result);

                ASSERT(result != NULL);
                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = result;
                t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
                t->zsd_flags |= ZSD_CREATE_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__create__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
                t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
                DTRACE_PROBE2(zsd__shutdown__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_shutdown != NULL);
                data = t->zsd_data;

                DTRACE_PROBE2(zsd__shutdown__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_shutdown)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__shutdown__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
                t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__shutdown__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
        struct zsd_entry *t;
        void *data;
        boolean_t dropped;

        if (lockp != NULL) {
                ASSERT(MUTEX_HELD(lockp));
        }
        if (zone_lock_held) {
                ASSERT(MUTEX_HELD(&zone->zone_lock));
        } else {
                mutex_enter(&zone->zone_lock);
        }

        t = zsd_find(&zone->zone_zsd, key);
        if (t == NULL) {
                /*
                 * Somebody else got here first, e.g. the zone going
                 * away.
                 */
                if (!zone_lock_held)
                        mutex_exit(&zone->zone_lock);
                return (B_FALSE);
        }
        dropped = B_FALSE;
        if (zsd_wait_for_creator(zone, t, lockp))
                dropped = B_TRUE;

        if (zsd_wait_for_inprogress(zone, t, lockp))
                dropped = B_TRUE;

        if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
                t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
                DTRACE_PROBE2(zsd__destroy__inprogress,
                    zone_t *, zone, zone_key_t, key);
                mutex_exit(&zone->zone_lock);
                if (lockp != NULL)
                        mutex_exit(lockp);
                dropped = B_TRUE;

                ASSERT(t->zsd_destroy != NULL);
                data = t->zsd_data;
                DTRACE_PROBE2(zsd__destroy__start,
                    zone_t *, zone, zone_key_t, key);

                (t->zsd_destroy)(zone->zone_id, data);
                DTRACE_PROBE2(zsd__destroy__end,
                    zone_t *, zone, zone_key_t, key);

                if (lockp != NULL)
                        mutex_enter(lockp);
                mutex_enter(&zone->zone_lock);
                t->zsd_data = NULL;
                t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
                t->zsd_flags |= ZSD_DESTROY_COMPLETED;
                cv_broadcast(&t->zsd_cv);
                DTRACE_PROBE2(zsd__destroy__completed,
                    zone_t *, zone, zone_key_t, key);
        }
        if (!zone_lock_held)
                mutex_exit(&zone->zone_lock);
        return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_CREATE_NEEDED) {
                DTRACE_PROBE2(zsd__wait__for__creator,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
        boolean_t dropped = B_FALSE;

        while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
                DTRACE_PROBE2(zsd__wait__for__inprogress,
                    zone_t *, zone, struct zsd_entry *, t);
                if (lockp != NULL) {
                        dropped = B_TRUE;
                        mutex_exit(lockp);
                }
                cv_wait(&t->zsd_cv, &zone->zone_lock);
                if (lockp != NULL) {
                        /* First drop zone_lock to preserve order */
                        mutex_exit(&zone->zone_lock);
                        mutex_enter(lockp);
                        mutex_enter(&zone->zone_lock);
                }
        }
        return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
        zone_dataset_t *t, *next;

        for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
                next = list_next(&zone->zone_datasets, t);
                list_remove(&zone->zone_datasets, t);
                kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
                kmem_free(t, sizeof (*t));
        }
        list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(e->rcep_t == RCENTITY_ZONE);
        if (e->rcep_p.zone == NULL)
                return (0);

        e->rcep_p.zone->zone_shares = nv;
        return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
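
/*
 * A sketch of how such an ops vector is consumed: the actual registration
 * happens in zone_init(), later in this file; the flag set shown here is
 * illustrative rather than authoritative.
 *
 *      rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
 *          RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
 *          FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
 */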
1347 
1348 /*
1349  * zone.cpu-cap resource control support.
1350  */
1351 /*ARGSUSED*/
1352 static rctl_qty_t
1353 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1354 {
1355         ASSERT(MUTEX_HELD(&p->p_lock));
1356         return (cpucaps_zone_get(p->p_zone));
1357 }
1358 
1359 /*ARGSUSED*/
1360 static int
1361 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1362     rctl_qty_t nv)
1363 {
1364         zone_t *zone = e->rcep_p.zone;
1365 
1366         ASSERT(MUTEX_HELD(&p->p_lock));
1367         ASSERT(e->rcep_t == RCENTITY_ZONE);
1368 
1369         if (zone == NULL)
1370                 return (0);
1371 
1372         /*
1373          * set cap to the new value.
1374          */
1375         return (cpucaps_zone_set(zone, nv));
1376 }
1377 
1378 static rctl_ops_t zone_cpu_cap_ops = {
1379         rcop_no_action,
1380         zone_cpu_cap_get,
1381         zone_cpu_cap_set,
1382         rcop_no_test
1383 };
1384 
1385 /*ARGSUSED*/
1386 static rctl_qty_t
1387 zone_cpu_base_get(rctl_t *rctl, struct proc *p)
1388 {
1389         ASSERT(MUTEX_HELD(&p->p_lock));
1390         return (cpucaps_zone_get_base(p->p_zone));
1391 }
1392 
1393 /*
1394  * The zone cpu base is used to set the baseline CPU for the zone
1395  * so we can track when the zone is bursting.
1396  */
1397 /*ARGSUSED*/
1398 static int
1399 zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1400     rctl_qty_t nv)
1401 {
1402         zone_t *zone = e->rcep_p.zone;
1403 
1404         ASSERT(MUTEX_HELD(&p->p_lock));
1405         ASSERT(e->rcep_t == RCENTITY_ZONE);
1406 
1407         if (zone == NULL)
1408                 return (0);
1409 
1410         return (cpucaps_zone_set_base(zone, nv));
1411 }
1412 
1413 static rctl_ops_t zone_cpu_base_ops = {
1414         rcop_no_action,
1415         zone_cpu_base_get,
1416         zone_cpu_base_set,
1417         rcop_no_test
1418 };
1419 
1420 /*ARGSUSED*/
1421 static rctl_qty_t
1422 zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
1423 {
1424         ASSERT(MUTEX_HELD(&p->p_lock));
1425         return (cpucaps_zone_get_burst_time(p->p_zone));
1426 }
1427 
1428 /*
1429  * The zone cpu burst time is used to set the amount of time CPU(s) can be
1430  * bursting for the zone.
1431  */
1432 /*ARGSUSED*/
1433 static int
1434 zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1435     rctl_qty_t nv)
1436 {
1437         zone_t *zone = e->rcep_p.zone;
1438 
1439         ASSERT(MUTEX_HELD(&p->p_lock));
1440         ASSERT(e->rcep_t == RCENTITY_ZONE);
1441 
1442         if (zone == NULL)
1443                 return (0);
1444 
1445         return (cpucaps_zone_set_burst_time(zone, nv));
1446 }
1447 
1448 static rctl_ops_t zone_cpu_burst_time_ops = {
1449         rcop_no_action,
1450         zone_cpu_burst_time_get,
1451         zone_cpu_burst_time_set,
1452         rcop_no_test
1453 };
1454 
1455 /*
1456  * zone.zfs-io-pri resource control support (IO priority).
1457  */
1458 /*ARGSUSED*/
1459 static rctl_qty_t
1460 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1461 {
1462         ASSERT(MUTEX_HELD(&p->p_lock));
1463         return (p->p_zone->zone_zfs_io_pri);
1464 }
1465 
1466 /*ARGSUSED*/
1467 static int
1468 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1469     rctl_qty_t nv)
1470 {
1471         zone_t *zone = e->rcep_p.zone;
1472 
1473         ASSERT(MUTEX_HELD(&p->p_lock));
1474         ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 
1476         if (zone == NULL)
1477                 return (0);
1478 
1479         /*
1480          * set priority to the new value.
1481          */
1482         zone->zone_zfs_io_pri = nv;
1483         return (0);
1484 }
1485 
1486 static rctl_ops_t zone_zfs_io_pri_ops = {
1487         rcop_no_action,
1488         zone_zfs_io_pri_get,
1489         zone_zfs_io_pri_set,
1490         rcop_no_test
1491 };
1492 
1493 /*ARGSUSED*/
1494 static rctl_qty_t
1495 zone_lwps_usage(rctl_t *r, proc_t *p)
1496 {
1497         rctl_qty_t nlwps;
1498         zone_t *zone = p->p_zone;
1499 
1500         ASSERT(MUTEX_HELD(&p->p_lock));
1501 
1502         mutex_enter(&zone->zone_nlwps_lock);
1503         nlwps = zone->zone_nlwps;
1504         mutex_exit(&zone->zone_nlwps_lock);
1505 
1506         return (nlwps);
1507 }
1508 
1509 /*ARGSUSED*/
1510 static int
1511 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1512     rctl_qty_t incr, uint_t flags)
1513 {
1514         rctl_qty_t nlwps;
1515 
1516         ASSERT(MUTEX_HELD(&p->p_lock));
1517         ASSERT(e->rcep_t == RCENTITY_ZONE);
1518         if (e->rcep_p.zone == NULL)
1519                 return (0);
1520         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1521         nlwps = e->rcep_p.zone->zone_nlwps;
1522 
1523         if (nlwps + incr > rcntl->rcv_value)
1524                 return (1);
1525 
1526         return (0);
1527 }
1528 
1529 /*ARGSUSED*/
1530 static int
1531 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1532 {
1533         ASSERT(MUTEX_HELD(&p->p_lock));
1534         ASSERT(e->rcep_t == RCENTITY_ZONE);
1535         if (e->rcep_p.zone == NULL)
1536                 return (0);
1537         e->rcep_p.zone->zone_nlwps_ctl = nv;
1538         return (0);
1539 }
1540 
1541 static rctl_ops_t zone_lwps_ops = {
1542         rcop_no_action,
1543         zone_lwps_usage,
1544         zone_lwps_set,
1545         zone_lwps_test,
1546 };
1547 
1548 /*ARGSUSED*/
1549 static rctl_qty_t
1550 zone_procs_usage(rctl_t *r, proc_t *p)
1551 {
1552         rctl_qty_t nprocs;
1553         zone_t *zone = p->p_zone;
1554 
1555         ASSERT(MUTEX_HELD(&p->p_lock));
1556 
1557         mutex_enter(&zone->zone_nlwps_lock);
1558         nprocs = zone->zone_nprocs;
1559         mutex_exit(&zone->zone_nlwps_lock);
1560 
1561         return (nprocs);
1562 }
1563 
1564 /*ARGSUSED*/
1565 static int
1566 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1567     rctl_qty_t incr, uint_t flags)
1568 {
1569         rctl_qty_t nprocs;
1570 
1571         ASSERT(MUTEX_HELD(&p->p_lock));
1572         ASSERT(e->rcep_t == RCENTITY_ZONE);
1573         if (e->rcep_p.zone == NULL)
1574                 return (0);
1575         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1576         nprocs = e->rcep_p.zone->zone_nprocs;
1577 
1578         if (nprocs + incr > rcntl->rcv_value)
1579                 return (1);
1580 
1581         return (0);
1582 }
1583 
1584 /*ARGSUSED*/
1585 static int
1586 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1587 {
1588         ASSERT(MUTEX_HELD(&p->p_lock));
1589         ASSERT(e->rcep_t == RCENTITY_ZONE);
1590         if (e->rcep_p.zone == NULL)
1591                 return (0);
1592         e->rcep_p.zone->zone_nprocs_ctl = nv;
1593         return (0);
1594 }
1595 
1596 static rctl_ops_t zone_procs_ops = {
1597         rcop_no_action,
1598         zone_procs_usage,
1599         zone_procs_set,
1600         zone_procs_test,
1601 };
1602 
1603 /*ARGSUSED*/
1604 static rctl_qty_t
1605 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1606 {
1607         ASSERT(MUTEX_HELD(&p->p_lock));
1608         return (p->p_zone->zone_shmmax);
1609 }
1610 
1611 /*ARGSUSED*/
1612 static int
1613 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1614     rctl_qty_t incr, uint_t flags)
1615 {
1616         rctl_qty_t v;
1617         ASSERT(MUTEX_HELD(&p->p_lock));
1618         ASSERT(e->rcep_t == RCENTITY_ZONE);
1619         v = e->rcep_p.zone->zone_shmmax + incr;
1620         if (v > rval->rcv_value)
1621                 return (1);
1622         return (0);
1623 }
1624 
1625 static rctl_ops_t zone_shmmax_ops = {
1626         rcop_no_action,
1627         zone_shmmax_usage,
1628         rcop_no_set,
1629         zone_shmmax_test
1630 };
1631 
1632 /*ARGSUSED*/
1633 static rctl_qty_t
1634 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1635 {
1636         ASSERT(MUTEX_HELD(&p->p_lock));
1637         return (p->p_zone->zone_ipc.ipcq_shmmni);
1638 }
1639 
1640 /*ARGSUSED*/
1641 static int
1642 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1643     rctl_qty_t incr, uint_t flags)
1644 {
1645         rctl_qty_t v;
1646         ASSERT(MUTEX_HELD(&p->p_lock));
1647         ASSERT(e->rcep_t == RCENTITY_ZONE);
1648         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1649         if (v > rval->rcv_value)
1650                 return (1);
1651         return (0);
1652 }
1653 
1654 static rctl_ops_t zone_shmmni_ops = {
1655         rcop_no_action,
1656         zone_shmmni_usage,
1657         rcop_no_set,
1658         zone_shmmni_test
1659 };
1660 
1661 /*ARGSUSED*/
1662 static rctl_qty_t
1663 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1664 {
1665         ASSERT(MUTEX_HELD(&p->p_lock));
1666         return (p->p_zone->zone_ipc.ipcq_semmni);
1667 }
1668 
1669 /*ARGSUSED*/
1670 static int
1671 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1672     rctl_qty_t incr, uint_t flags)
1673 {
1674         rctl_qty_t v;
1675         ASSERT(MUTEX_HELD(&p->p_lock));
1676         ASSERT(e->rcep_t == RCENTITY_ZONE);
1677         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1678         if (v > rval->rcv_value)
1679                 return (1);
1680         return (0);
1681 }
1682 
1683 static rctl_ops_t zone_semmni_ops = {
1684         rcop_no_action,
1685         zone_semmni_usage,
1686         rcop_no_set,
1687         zone_semmni_test
1688 };
1689 
1690 /*ARGSUSED*/
1691 static rctl_qty_t
1692 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1693 {
1694         ASSERT(MUTEX_HELD(&p->p_lock));
1695         return (p->p_zone->zone_ipc.ipcq_msgmni);
1696 }
1697 
1698 /*ARGSUSED*/
1699 static int
1700 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1701     rctl_qty_t incr, uint_t flags)
1702 {
1703         rctl_qty_t v;
1704         ASSERT(MUTEX_HELD(&p->p_lock));
1705         ASSERT(e->rcep_t == RCENTITY_ZONE);
1706         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1707         if (v > rval->rcv_value)
1708                 return (1);
1709         return (0);
1710 }
1711 
1712 static rctl_ops_t zone_msgmni_ops = {
1713         rcop_no_action,
1714         zone_msgmni_usage,
1715         rcop_no_set,
1716         zone_msgmni_test
1717 };
1718 
1719 /*ARGSUSED*/
1720 static rctl_qty_t
1721 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1722 {
1723         rctl_qty_t q;
1724         ASSERT(MUTEX_HELD(&p->p_lock));
1725         mutex_enter(&p->p_zone->zone_mem_lock);
1726         q = p->p_zone->zone_locked_mem;
1727         mutex_exit(&p->p_zone->zone_mem_lock);
1728         return (q);
1729 }
1730 
1731 /*ARGSUSED*/
1732 static int
1733 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1734     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1735 {
1736         rctl_qty_t q;
1737         zone_t *z;
1738 
1739         z = e->rcep_p.zone;
1740         ASSERT(MUTEX_HELD(&p->p_lock));
1741         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1742         q = z->zone_locked_mem;
1743         if (q + incr > rcntl->rcv_value)
1744                 return (1);
1745         return (0);
1746 }
1747 
1748 /*ARGSUSED*/
1749 static int
1750 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1751     rctl_qty_t nv)
1752 {
1753         ASSERT(MUTEX_HELD(&p->p_lock));
1754         ASSERT(e->rcep_t == RCENTITY_ZONE);
1755         if (e->rcep_p.zone == NULL)
1756                 return (0);
1757         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1758         return (0);
1759 }
1760 
1761 static rctl_ops_t zone_locked_mem_ops = {
1762         rcop_no_action,
1763         zone_locked_mem_usage,
1764         zone_locked_mem_set,
1765         zone_locked_mem_test
1766 };
1767 
1768 /*ARGSUSED*/
1769 static rctl_qty_t
1770 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1771 {
1772         rctl_qty_t q;
1773         zone_t *z = p->p_zone;
1774 
1775         ASSERT(MUTEX_HELD(&p->p_lock));
1776         mutex_enter(&z->zone_mem_lock);
1777         q = z->zone_max_swap;
1778         mutex_exit(&z->zone_mem_lock);
1779         return (q);
1780 }
1781 
1782 /*ARGSUSED*/
1783 static int
1784 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1785     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1786 {
1787         rctl_qty_t q;
1788         zone_t *z;
1789 
1790         z = e->rcep_p.zone;
1791         ASSERT(MUTEX_HELD(&p->p_lock));
1792         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1793         q = z->zone_max_swap;
1794         if (q + incr > rcntl->rcv_value)
1795                 return (1);
1796         return (0);
1797 }
1798 
1799 /*ARGSUSED*/
1800 static int
1801 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1802     rctl_qty_t nv)
1803 {
1804         ASSERT(MUTEX_HELD(&p->p_lock));
1805         ASSERT(e->rcep_t == RCENTITY_ZONE);
1806         if (e->rcep_p.zone == NULL)
1807                 return (0);
1808         e->rcep_p.zone->zone_max_swap_ctl = nv;
1809         return (0);
1810 }
1811 
1812 static rctl_ops_t zone_max_swap_ops = {
1813         rcop_no_action,
1814         zone_max_swap_usage,
1815         zone_max_swap_set,
1816         zone_max_swap_test
1817 };
1818 
1819 /*ARGSUSED*/
1820 static rctl_qty_t
1821 zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1822 {
1823         rctl_qty_t q;
1824         zone_t *z = p->p_zone;
1825 
1826         ASSERT(MUTEX_HELD(&p->p_lock));
        /* No additional locking; this rctl is not enforced in the kernel */
1828         q = z->zone_phys_mem;
1829         return (q);
1830 }
1831 
1832 /*ARGSUSED*/
1833 static int
1834 zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1835     rctl_qty_t nv)
1836 {
1837         ASSERT(MUTEX_HELD(&p->p_lock));
1838         ASSERT(e->rcep_t == RCENTITY_ZONE);
1839         if (e->rcep_p.zone == NULL)
1840                 return (0);
1841         e->rcep_p.zone->zone_phys_mem_ctl = nv;
1842         return (0);
1843 }
1844 
1845 static rctl_ops_t zone_phys_mem_ops = {
1846         rcop_no_action,
1847         zone_phys_mem_usage,
1848         zone_phys_mem_set,
1849         rcop_no_test
1850 };
1851 
1852 /*ARGSUSED*/
1853 static rctl_qty_t
1854 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1855 {
1856         rctl_qty_t q;
1857         zone_t *z = p->p_zone;
1858 
1859         ASSERT(MUTEX_HELD(&p->p_lock));
1860         mutex_enter(&z->zone_rctl_lock);
1861         q = z->zone_max_lofi;
1862         mutex_exit(&z->zone_rctl_lock);
1863         return (q);
1864 }
1865 
1866 /*ARGSUSED*/
1867 static int
1868 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1869     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1870 {
1871         rctl_qty_t q;
1872         zone_t *z;
1873 
1874         z = e->rcep_p.zone;
1875         ASSERT(MUTEX_HELD(&p->p_lock));
1876         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1877         q = z->zone_max_lofi;
1878         if (q + incr > rcntl->rcv_value)
1879                 return (1);
1880         return (0);
1881 }
1882 
1883 /*ARGSUSED*/
1884 static int
1885 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1886     rctl_qty_t nv)
1887 {
1888         ASSERT(MUTEX_HELD(&p->p_lock));
1889         ASSERT(e->rcep_t == RCENTITY_ZONE);
1890         if (e->rcep_p.zone == NULL)
1891                 return (0);
1892         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1893         return (0);
1894 }
1895 
1896 static rctl_ops_t zone_max_lofi_ops = {
1897         rcop_no_action,
1898         zone_max_lofi_usage,
1899         zone_max_lofi_set,
1900         zone_max_lofi_test
1901 };
1902 
1903 /*
1904  * Helper function to brand the zone with a unique ID.
1905  */
1906 static void
1907 zone_uniqid(zone_t *zone)
1908 {
1909         static uint64_t uniqid = 0;
1910 
1911         ASSERT(MUTEX_HELD(&zonehash_lock));
1912         zone->zone_uniqid = uniqid++;
1913 }
1914 
1915 /*
1916  * Returns a held pointer to the "kcred" for the specified zone.
1917  */
1918 struct cred *
1919 zone_get_kcred(zoneid_t zoneid)
1920 {
1921         zone_t *zone;
1922         cred_t *cr;
1923 
1924         if ((zone = zone_find_by_id(zoneid)) == NULL)
1925                 return (NULL);
1926         cr = zone->zone_kcred;
1927         crhold(cr);
1928         zone_rele(zone);
1929         return (cr);
1930 }
1931 
1932 static int
1933 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1934 {
1935         zone_t *zone = ksp->ks_private;
1936         zone_kstat_t *zk = ksp->ks_data;
1937 
1938         if (rw == KSTAT_WRITE)
1939                 return (EACCES);
1940 
1941         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1942         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1943         return (0);
1944 }
1945 
1946 static int
1947 zone_physmem_kstat_update(kstat_t *ksp, int rw)
1948 {
1949         zone_t *zone = ksp->ks_private;
1950         zone_kstat_t *zk = ksp->ks_data;
1951 
1952         if (rw == KSTAT_WRITE)
1953                 return (EACCES);
1954 
1955         zk->zk_usage.value.ui64 = zone->zone_phys_mem;
1956         zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
1957         return (0);
1958 }
1959 
1960 static int
1961 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1962 {
1963         zone_t *zone = ksp->ks_private;
1964         zone_kstat_t *zk = ksp->ks_data;
1965 
1966         if (rw == KSTAT_WRITE)
1967                 return (EACCES);
1968 
1969         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1970         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1971         return (0);
1972 }
1973 
1974 static int
1975 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1976 {
1977         zone_t *zone = ksp->ks_private;
1978         zone_kstat_t *zk = ksp->ks_data;
1979 
1980         if (rw == KSTAT_WRITE)
1981                 return (EACCES);
1982 
1983         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1984         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1985         return (0);
1986 }
1987 
1988 static kstat_t *
1989 zone_rctl_kstat_create_common(zone_t *zone, char *name,
1990     int (*updatefunc) (kstat_t *, int))
1991 {
1992         kstat_t *ksp;
1993         zone_kstat_t *zk;
1994 
1995         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1996             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1997             KSTAT_FLAG_VIRTUAL);
1998 
1999         if (ksp == NULL)
2000                 return (NULL);
2001 
2002         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
2003         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2004         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
2005         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
2006         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
2007         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
2008         ksp->ks_update = updatefunc;
2009         ksp->ks_private = zone;
2010         kstat_install(ksp);
2011         return (ksp);
2012 }
2013 
2014 static int
2015 zone_vfs_kstat_update(kstat_t *ksp, int rw)
2016 {
2017         zone_t *zone = ksp->ks_private;
2018         zone_vfs_kstat_t *zvp = ksp->ks_data;
2019         kstat_io_t *kiop = &zone->zone_vfs_rwstats;
2020 
2021         if (rw == KSTAT_WRITE)
2022                 return (EACCES);
2023 
2024         /*
2025          * Extract the VFS statistics from the kstat_io_t structure used by
2026          * kstat_runq_enter() and related functions.  Since the slow ops
2027          * counters are updated directly by the VFS layer, there's no need to
2028          * copy those statistics here.
2029          *
2030          * Note that kstat_runq_enter() and the related functions use
2031          * gethrtime_unscaled(), so scale the time here.
2032          */
2033         zvp->zv_nread.value.ui64 = kiop->nread;
2034         zvp->zv_reads.value.ui64 = kiop->reads;
2035         zvp->zv_rtime.value.ui64 = kiop->rtime;
2036         zvp->zv_rcnt.value.ui64 = kiop->rcnt;
2037         zvp->zv_rlentime.value.ui64 = kiop->rlentime;
2038         zvp->zv_nwritten.value.ui64 = kiop->nwritten;
2039         zvp->zv_writes.value.ui64 = kiop->writes;
2040         zvp->zv_wtime.value.ui64 = kiop->wtime;
2041         zvp->zv_wcnt.value.ui64 = kiop->wcnt;
2042         zvp->zv_wlentime.value.ui64 = kiop->wlentime;
2043 
2044         scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
2045         scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
2046         scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
2047         scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
2048 
2049         return (0);
2050 }
2051 
2052 static kstat_t *
2053 zone_vfs_kstat_create(zone_t *zone)
2054 {
2055         kstat_t *ksp;
2056         zone_vfs_kstat_t *zvp;
2057 
2058         if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
2059             zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
2060             sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
2061             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2062                 return (NULL);
2063 
2064         if (zone->zone_id != GLOBAL_ZONEID)
2065                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2066 
2067         zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
2068         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2069         ksp->ks_lock = &zone->zone_vfs_lock;
2070         zone->zone_vfs_stats = zvp;
2071 
2072         /* The kstat "name" field is not large enough for a full zonename */
2073         kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
2074         kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
2075         kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
2076         kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
2077         kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
2078         kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
2079         kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
2080         kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
2081         kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
2082         kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
2083         kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
2084         kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
2085         kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
2086         kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
2087         kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
2088         kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
2089         kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
2090         kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
2091 
2092         ksp->ks_update = zone_vfs_kstat_update;
2093         ksp->ks_private = zone;
2094 
2095         kstat_install(ksp);
2096         return (ksp);
2097 }
2098 
2099 static int
2100 zone_zfs_kstat_update(kstat_t *ksp, int rw)
2101 {
2102         zone_t *zone = ksp->ks_private;
2103         zone_zfs_kstat_t *zzp = ksp->ks_data;
2104         kstat_io_t *kiop = &zone->zone_zfs_rwstats;
2105 
2106         if (rw == KSTAT_WRITE)
2107                 return (EACCES);
2108 
2109         /*
2110          * Extract the ZFS statistics from the kstat_io_t structure used by
2111          * kstat_runq_enter() and related functions.  Since the I/O throttle
2112          * counters are updated directly by the ZFS layer, there's no need to
2113          * copy those statistics here.
2114          *
2115          * Note that kstat_runq_enter() and the related functions use
2116          * gethrtime_unscaled(), so scale the time here.
2117          */
2118         zzp->zz_nread.value.ui64 = kiop->nread;
2119         zzp->zz_reads.value.ui64 = kiop->reads;
2120         zzp->zz_rtime.value.ui64 = kiop->rtime;
2121         zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2122         zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2123         zzp->zz_writes.value.ui64 = kiop->writes;
2124 
2125         scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2126         scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2127 
2128         return (0);
2129 }
2130 
2131 static kstat_t *
2132 zone_zfs_kstat_create(zone_t *zone)
2133 {
2134         kstat_t *ksp;
2135         zone_zfs_kstat_t *zzp;
2136 
2137         if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
2138             zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
2139             sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
2140             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2141                 return (NULL);
2142 
2143         if (zone->zone_id != GLOBAL_ZONEID)
2144                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2145 
2146         zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
2147         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2148         ksp->ks_lock = &zone->zone_zfs_lock;
2149         zone->zone_zfs_stats = zzp;
2150 
2151         /* The kstat "name" field is not large enough for a full zonename */
2152         kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
2153         kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
2154         kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
2155         kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
2156         kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
2157         kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
2158         kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
2159         kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
2160         kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
2161 
2162         ksp->ks_update = zone_zfs_kstat_update;
2163         ksp->ks_private = zone;
2164 
2165         kstat_install(ksp);
2166         return (ksp);
2167 }
2168 
2169 static int
2170 zone_mcap_kstat_update(kstat_t *ksp, int rw)
2171 {
2172         zone_t *zone = ksp->ks_private;
2173         zone_mcap_kstat_t *zmp = ksp->ks_data;
2174 
2175         if (rw == KSTAT_WRITE)
2176                 return (EACCES);
2177 
2178         zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
2179         zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
2180         zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2181         zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2182         zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
2183         zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
2184         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2185         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2186         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2187         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2188         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2189         zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
2190         zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
2191 
2192         return (0);
2193 }
2194 
2195 static kstat_t *
2196 zone_mcap_kstat_create(zone_t *zone)
2197 {
2198         kstat_t *ksp;
2199         zone_mcap_kstat_t *zmp;
2200 
2201         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2202             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2203             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2204             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2205                 return (NULL);
2206 
2207         if (zone->zone_id != GLOBAL_ZONEID)
2208                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2209 
2210         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2211         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2212         ksp->ks_lock = &zone->zone_mcap_lock;
2213         zone->zone_mcap_stats = zmp;
2214 
2215         /* The kstat "name" field is not large enough for a full zonename */
2216         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2217         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2218         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2219         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2220         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2221         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2222         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2223         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2224         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2225         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2226         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2227         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2228         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2229             KSTAT_DATA_UINT64);
2230         kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2231             KSTAT_DATA_UINT64);
2232         kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2233             KSTAT_DATA_UINT64);
2234 
2235         ksp->ks_update = zone_mcap_kstat_update;
2236         ksp->ks_private = zone;
2237 
2238         kstat_install(ksp);
2239         return (ksp);
2240 }
2241 
2242 static int
2243 zone_misc_kstat_update(kstat_t *ksp, int rw)
2244 {
2245         zone_t *zone = ksp->ks_private;
2246         zone_misc_kstat_t *zmp = ksp->ks_data;
2247         hrtime_t tmp;
2248 
2249         if (rw == KSTAT_WRITE)
2250                 return (EACCES);
2251 
2252         tmp = zone->zone_utime;
2253         scalehrtime(&tmp);
2254         zmp->zm_utime.value.ui64 = tmp;
2255         tmp = zone->zone_stime;
2256         scalehrtime(&tmp);
2257         zmp->zm_stime.value.ui64 = tmp;
2258         tmp = zone->zone_wtime;
2259         scalehrtime(&tmp);
2260         zmp->zm_wtime.value.ui64 = tmp;
2261 
2262         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2263         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2264         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2265 
2266         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2267         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2268         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2269         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2270 
2271         zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
2272 
2273         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2274 
2275         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2276         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2277 
2278         return (0);
2279 }
2280 
2281 static kstat_t *
2282 zone_misc_kstat_create(zone_t *zone)
2283 {
2284         kstat_t *ksp;
2285         zone_misc_kstat_t *zmp;
2286 
2287         if ((ksp = kstat_create_zone("zones", zone->zone_id,
2288             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2289             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2290             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2291                 return (NULL);
2292 
2293         if (zone->zone_id != GLOBAL_ZONEID)
2294                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2295 
2296         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2297         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2298         ksp->ks_lock = &zone->zone_misc_lock;
2299         zone->zone_misc_stats = zmp;
2300 
2301         /* The kstat "name" field is not large enough for a full zonename */
2302         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2303         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2304         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2305         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2306         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2307         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2308         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2309         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2310             KSTAT_DATA_UINT32);
2311         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2312         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2313             KSTAT_DATA_UINT32);
2314         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2315         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2316         kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
2317             KSTAT_DATA_UINT32);
2318         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2319             KSTAT_DATA_UINT32);
2320         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2321         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2322 
2323         ksp->ks_update = zone_misc_kstat_update;
2324         ksp->ks_private = zone;
2325 
2326         kstat_install(ksp);
2327         return (ksp);
2328 }
2329 
2330 static void
2331 zone_kstat_create(zone_t *zone)
2332 {
2333         zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
2334             "lockedmem", zone_lockedmem_kstat_update);
2335         zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
2336             "swapresv", zone_swapresv_kstat_update);
2337         zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
2338             "physicalmem", zone_physmem_kstat_update);
2339         zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
2340             "nprocs", zone_nprocs_kstat_update);
2341 
2342         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2343                 zone->zone_vfs_stats = kmem_zalloc(
2344                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
2345         }
2346 
2347         if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
2348                 zone->zone_zfs_stats = kmem_zalloc(
2349                     sizeof (zone_zfs_kstat_t), KM_SLEEP);
2350         }
2351 
2352         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2353                 zone->zone_mcap_stats = kmem_zalloc(
2354                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2355         }
2356 
2357         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2358                 zone->zone_misc_stats = kmem_zalloc(
2359                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2360         }
2361 }
2362 
2363 static void
2364 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2365 {
2366         void *data;
2367 
2368         if (*pkstat != NULL) {
2369                 data = (*pkstat)->ks_data;
2370                 kstat_delete(*pkstat);
2371                 kmem_free(data, datasz);
2372                 *pkstat = NULL;
2373         }
2374 }
2375 
2376 static void
2377 zone_kstat_delete(zone_t *zone)
2378 {
2379         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2380             sizeof (zone_kstat_t));
2381         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2382             sizeof (zone_kstat_t));
2383         zone_kstat_delete_common(&zone->zone_physmem_kstat,
2384             sizeof (zone_kstat_t));
2385         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2386             sizeof (zone_kstat_t));
2387 
2388         zone_kstat_delete_common(&zone->zone_vfs_ksp,
2389             sizeof (zone_vfs_kstat_t));
2390         zone_kstat_delete_common(&zone->zone_zfs_ksp,
2391             sizeof (zone_zfs_kstat_t));
2392         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2393             sizeof (zone_mcap_kstat_t));
2394         zone_kstat_delete_common(&zone->zone_misc_ksp,
2395             sizeof (zone_misc_kstat_t));
2396 }
2397 
2398 /*
2399  * Called very early on in boot to initialize the ZSD list so that
2400  * zone_key_create() can be called before zone_init().  It also initializes
2401  * portions of zone0 which may be used before zone_init() is called.  The
2402  * variable "global_zone" will be set when zone0 is fully initialized by
2403  * zone_init().
2404  */
2405 void
2406 zone_zsd_init(void)
2407 {
2408         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2409         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2410         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2411             offsetof(struct zsd_entry, zsd_linkage));
2412         list_create(&zone_active, sizeof (zone_t),
2413             offsetof(zone_t, zone_linkage));
2414         list_create(&zone_deathrow, sizeof (zone_t),
2415             offsetof(zone_t, zone_linkage));
2416 
2417         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2418         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2419         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2420         zone0.zone_shares = 1;
2421         zone0.zone_nlwps = 0;
2422         zone0.zone_nlwps_ctl = INT_MAX;
2423         zone0.zone_nprocs = 0;
2424         zone0.zone_nprocs_ctl = INT_MAX;
2425         zone0.zone_locked_mem = 0;
2426         zone0.zone_locked_mem_ctl = UINT64_MAX;
2427         ASSERT(zone0.zone_max_swap == 0);
2428         zone0.zone_max_swap_ctl = UINT64_MAX;
2429         zone0.zone_phys_mem = 0;
2430         zone0.zone_phys_mem_ctl = UINT64_MAX;
2431         zone0.zone_max_lofi = 0;
2432         zone0.zone_max_lofi_ctl = UINT64_MAX;
2433         zone0.zone_shmmax = 0;
2434         zone0.zone_ipc.ipcq_shmmni = 0;
2435         zone0.zone_ipc.ipcq_semmni = 0;
2436         zone0.zone_ipc.ipcq_msgmni = 0;
2437         zone0.zone_name = GLOBAL_ZONENAME;
2438         zone0.zone_nodename = utsname.nodename;
2439         zone0.zone_domain = srpc_domain;
2440         zone0.zone_hostid = HW_INVALID_HOSTID;
2441         zone0.zone_fs_allowed = NULL;
2442         zone0.zone_ref = 1;
2443         zone0.zone_id = GLOBAL_ZONEID;
2444         zone0.zone_status = ZONE_IS_RUNNING;
2445         zone0.zone_rootpath = "/";
2446         zone0.zone_rootpathlen = 2;
2447         zone0.zone_psetid = ZONE_PS_INVAL;
2448         zone0.zone_ncpus = 0;
2449         zone0.zone_ncpus_online = 0;
2450         zone0.zone_proc_initpid = 1;
2451         zone0.zone_initname = initname;
2452         zone0.zone_lockedmem_kstat = NULL;
2453         zone0.zone_swapresv_kstat = NULL;
2454         zone0.zone_physmem_kstat = NULL;
2455         zone0.zone_nprocs_kstat = NULL;
2456         zone0.zone_zfs_io_pri = 1;
2457         zone0.zone_stime = 0;
2458         zone0.zone_utime = 0;
2459         zone0.zone_wtime = 0;
2460 
2461         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2462             offsetof(zone_ref_t, zref_linkage));
2463         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2464             offsetof(struct zsd_entry, zsd_linkage));
2465         list_insert_head(&zone_active, &zone0);
2466 
2467         /*
2468          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2469          * to anything meaningful.  It is assigned to be 'rootdir' in
2470          * vfs_mountroot().
2471          */
2472         zone0.zone_rootvp = NULL;
2473         zone0.zone_vfslist = NULL;
2474         zone0.zone_bootargs = initargs;
2475         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2476         /*
2477          * The global zone has all privileges
2478          */
2479         priv_fillset(zone0.zone_privset);
2480         /*
2481          * Add p0 to the global zone
2482          */
2483         zone0.zone_zsched = &p0;
2484         p0.p_zone = &zone0;
2485 }
2486 
2487 /*
2488  * Compute a hash value based on the contents of the label and the DOI.  The
2489  * hash algorithm is somewhat arbitrary, but is based on the observation that
2490  * humans will likely pick labels that differ by amounts that work out to be
2491  * multiples of the number of hash chains, and thus stirring in some primes
2492  * should help.
2493  */
2494 static uint_t
2495 hash_bylabel(void *hdata, mod_hash_key_t key)
2496 {
2497         const ts_label_t *lab = (ts_label_t *)key;
2498         const uint32_t *up, *ue;
2499         uint_t hash;
2500         int i;
2501 
2502         _NOTE(ARGUNUSED(hdata));
2503 
2504         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2505         /* we depend on alignment of label, but not representation */
2506         up = (const uint32_t *)&lab->tsl_label;
2507         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2508         i = 1;
2509         while (up < ue) {
2510                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2511                 hash += *up + (*up << ((i % 16) + 1));
2512                 up++;
2513                 i++;
2514         }
2515         return (hash);
2516 }
2517 
2518 /*
2519  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2520  * equal).  This may need to be changed if less than / greater than is ever
2521  * needed.
2522  */
2523 static int
2524 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2525 {
2526         ts_label_t *lab1 = (ts_label_t *)key1;
2527         ts_label_t *lab2 = (ts_label_t *)key2;
2528 
2529         return (label_equal(lab1, lab2) ? 0 : 1);
2530 }
2531 
2532 /*
2533  * Called by main() to initialize the zones framework.
2534  */
2535 void
2536 zone_init(void)
2537 {
2538         rctl_dict_entry_t *rde;
2539         rctl_val_t *dval;
2540         rctl_set_t *set;
2541         rctl_alloc_gp_t *gp;
2542         rctl_entity_p_t e;
2543         int res;
2544 
2545         ASSERT(curproc == &p0);
2546 
2547         /*
2548          * Create ID space for zone IDs.  ID 0 is reserved for the
2549          * global zone.
2550          */
2551         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2552 
2553         /*
2554          * Initialize generic zone resource controls, if any.
2555          */
2556         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2557             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2558             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2559             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2560 
2561         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2562             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
            RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
            RCTL_GLOBAL_SYSLOG_NEVER | RCTL_GLOBAL_INFINITE,
2565             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2566 
2567         rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
2568             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2569             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2570             MAXCAP, MAXCAP, &zone_cpu_base_ops);
2571 
2572         rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
2573             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2574             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2575             INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
2576 
2577         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2578             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2579             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2580             16384, 16384, &zone_zfs_io_pri_ops);
2581 
2582         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2583             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2584             INT_MAX, INT_MAX, &zone_lwps_ops);
2585 
2586         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2587             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2588             INT_MAX, INT_MAX, &zone_procs_ops);
2589 
2590         /*
2591          * System V IPC resource controls
2592          */
2593         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2594             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2595             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2596 
2597         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2598             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2599             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2600 
2601         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2602             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2603             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2604 
2605         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2606             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2607             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2608 
2609         /*
2610          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2611          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2612          */
2613         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2614         bzero(dval, sizeof (rctl_val_t));
2615         dval->rcv_value = 1;
2616         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2617         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2618         dval->rcv_action_recip_pid = -1;
2619 
2620         rde = rctl_dict_lookup("zone.cpu-shares");
2621         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2622 
2623         /*
2624          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
2626          */
2627         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2628         bzero(dval, sizeof (rctl_val_t));
2629         dval->rcv_value = 1;
2630         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2631         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2632         dval->rcv_action_recip_pid = -1;
2633 
2634         rde = rctl_dict_lookup("zone.zfs-io-priority");
2635         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2636 
2637         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2638             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2639             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2640             &zone_locked_mem_ops);
2641 
2642         rc_zone_max_swap = rctl_register("zone.max-swap",
2643             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2644             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2645             &zone_max_swap_ops);
2646 
2647         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2648             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2649             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2650             &zone_phys_mem_ops);
2651 
2652         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2653             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2654             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2655             &zone_max_lofi_ops);
2656 
2657         /*
2658          * Initialize the ``global zone''.
2659          */
2660         set = rctl_set_create();
2661         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2662         mutex_enter(&p0.p_lock);
2663         e.rcep_p.zone = &zone0;
2664         e.rcep_t = RCENTITY_ZONE;
        zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);
2667 
2668         zone0.zone_nlwps = p0.p_lwpcnt;
2669         zone0.zone_nprocs = 1;
2670         zone0.zone_ntasks = 1;
2671         mutex_exit(&p0.p_lock);
2672         zone0.zone_restart_init = B_TRUE;
2673         zone0.zone_reboot_on_init_exit = B_FALSE;
2674         zone0.zone_init_status = -1;
2675         zone0.zone_brand = &native_brand;
2676         rctl_prealloc_destroy(gp);
2677         /*
2678          * pool_default hasn't been initialized yet, so we let pool_init()
2679          * take care of making sure the global zone is in the default pool.
2680          */
2681 
2682         /*
2683          * Initialize global zone kstats
2684          */
2685         zone_kstat_create(&zone0);
2686 
2687         /*
2688          * Initialize zone label.
2689          * mlp are initialized when tnzonecfg is loaded.
2690          */
2691         zone0.zone_slabel = l_admin_low;
2692         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2693         label_hold(l_admin_low);
2694 
2695         /*
 * Initialize the lock for the database structure used by mntfs.
2697          */
2698         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2699 
2700         mutex_enter(&zonehash_lock);
2701         zone_uniqid(&zone0);
2702         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2703 
2704         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2705             mod_hash_null_valdtor);
2706         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2707             zone_hash_size, mod_hash_null_valdtor);
2708         /*
2709          * maintain zonehashbylabel only for labeled systems
2710          */
2711         if (is_system_labeled())
2712                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2713                     zone_hash_size, mod_hash_null_keydtor,
2714                     mod_hash_null_valdtor, hash_bylabel, NULL,
2715                     hash_labelkey_cmp, KM_SLEEP);
2716         zonecount = 1;
2717 
2718         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2719             (mod_hash_val_t)&zone0);
2720         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2721             (mod_hash_val_t)&zone0);
2722         if (is_system_labeled()) {
2723                 zone0.zone_flags |= ZF_HASHED_LABEL;
2724                 (void) mod_hash_insert(zonehashbylabel,
2725                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2726         }
2727         mutex_exit(&zonehash_lock);
2728 
2729         /*
2730          * We avoid setting zone_kcred until now, since kcred is initialized
2731          * sometime after zone_zsd_init() and before zone_init().
2732          */
2733         zone0.zone_kcred = kcred;
2734         /*
2735          * The global zone is fully initialized (except for zone_rootvp which
2736          * will be set when the root filesystem is mounted).
2737          */
2738         global_zone = &zone0;
2739 
2740         /*
2741          * Setup an event channel to send zone status change notifications on
2742          */
2743         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2744             EVCH_CREAT);
2745 
        if (res)
                panic("sysevent_evc_bind failed during zone setup.\n");
}
2750 
2751 static void
2752 zone_free(zone_t *zone)
2753 {
2754         zone_dl_t *zdl;
2755 
2756         ASSERT(zone != global_zone);
2757         ASSERT(zone->zone_ntasks == 0);
2758         ASSERT(zone->zone_nlwps == 0);
2759         ASSERT(zone->zone_nprocs == 0);
2760         ASSERT(zone->zone_cred_ref == 0);
2761         ASSERT(zone->zone_kcred == NULL);
2762         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2763             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2764         ASSERT(list_is_empty(&zone->zone_ref_list));
2765 
2766         /*
2767          * Remove any zone caps.
2768          */
2769         cpucaps_zone_remove(zone);
2770 
2771         ASSERT(zone->zone_cpucap == NULL);
2772 
2773         /* remove from deathrow list */
2774         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2775                 ASSERT(zone->zone_ref == 0);
2776                 mutex_enter(&zone_deathrow_lock);
2777                 list_remove(&zone_deathrow, zone);
2778                 mutex_exit(&zone_deathrow_lock);
2779         }
2780 
2781         list_destroy(&zone->zone_ref_list);
2782         zone_free_zsd(zone);
2783         zone_free_datasets(zone);
2784 
2785         /*
         * While dlmgmtd should have removed all of these, it could have left
         * something behind or crashed, in which case it's not safe for us to
         * assume that the list is empty (list_destroy() will ASSERT that it
         * is).  We clean up on behalf of our userland comrades, which may
         * have crashed or, worse, been disabled by SMF.
         */
2792         while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2793                 if (zdl->zdl_net != NULL)
2794                         nvlist_free(zdl->zdl_net);
2795                 kmem_free(zdl, sizeof (zone_dl_t));
2796         }
2797         list_destroy(&zone->zone_dl_list);
2798 
2799         if (zone->zone_rootvp != NULL)
2800                 VN_RELE(zone->zone_rootvp);
2801         if (zone->zone_rootpath)
2802                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2803         if (zone->zone_name != NULL)
2804                 kmem_free(zone->zone_name, ZONENAME_MAX);
2805         if (zone->zone_slabel != NULL)
2806                 label_rele(zone->zone_slabel);
2807         if (zone->zone_nodename != NULL)
2808                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2809         if (zone->zone_domain != NULL)
2810                 kmem_free(zone->zone_domain, _SYS_NMLN);
2811         if (zone->zone_privset != NULL)
2812                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2813         if (zone->zone_rctls != NULL)
2814                 rctl_set_free(zone->zone_rctls);
2815         if (zone->zone_bootargs != NULL)
2816                 strfree(zone->zone_bootargs);
2817         if (zone->zone_initname != NULL)
2818                 strfree(zone->zone_initname);
2819         if (zone->zone_fs_allowed != NULL)
2820                 strfree(zone->zone_fs_allowed);
2821         if (zone->zone_pfexecd != NULL)
2822                 klpd_freelist(&zone->zone_pfexecd);
2823         id_free(zoneid_space, zone->zone_id);
2824         mutex_destroy(&zone->zone_lock);
2825         cv_destroy(&zone->zone_cv);
2826         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2827         rw_destroy(&zone->zone_mntfs_db_lock);
2828         kmem_free(zone, sizeof (zone_t));
2829 }
2830 
2831 /*
2832  * See block comment at the top of this file for information about zone
2833  * status values.
2834  */
2835 /*
2836  * Convenience function for setting zone status.
2837  */
2838 static void
2839 zone_status_set(zone_t *zone, zone_status_t status)
2840 {
2841         timestruc_t now;
2842         uint64_t t;
2843 
2844         nvlist_t *nvl = NULL;
2845         ASSERT(MUTEX_HELD(&zone_status_lock));
2846         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2847             status >= zone_status_get(zone));
2848 
        /* Current time since Jan 1 1970, in nanoseconds as consumers expect */
2850         gethrestime(&now);
2851         t = (now.tv_sec * NANOSEC) + now.tv_nsec;
2852 
2853         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2854             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2855             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2856             zone_status_table[status]) ||
2857             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2858             zone_status_table[zone->zone_status]) ||
2859             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2860             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
2861             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2862             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2863 #ifdef DEBUG
2864                 (void) printf(
2865                     "Failed to allocate and send zone state change event.\n");
2866 #endif
2867         }
2868         nvlist_free(nvl);
2869 
2870         zone->zone_status = status;
2871 
2872         cv_broadcast(&zone->zone_cv);
2873 }
2874 
2875 /*
2876  * Public function to retrieve the zone status.  The zone status may
2877  * change after it is retrieved.
2878  */
2879 zone_status_t
2880 zone_status_get(zone_t *zone)
2881 {
2882         return (zone->zone_status);
2883 }
2884 
2885 static int
2886 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2887 {
2888         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2889         int err = 0;
2890 
2891         ASSERT(zone != global_zone);
2892         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2893                 goto done;      /* EFAULT or ENAMETOOLONG */
2894 
2895         if (zone->zone_bootargs != NULL)
2896                 strfree(zone->zone_bootargs);
2897 
2898         zone->zone_bootargs = strdup(buf);
2899 
2900 done:
2901         kmem_free(buf, BOOTARGS_MAX);
2902         return (err);
2903 }
2904 
2905 static int
2906 zone_set_brand(zone_t *zone, const char *brand)
2907 {
2908         struct brand_attr *attrp;
2909         brand_t *bp;
2910 
2911         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2912         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2913                 kmem_free(attrp, sizeof (struct brand_attr));
2914                 return (EFAULT);
2915         }
2916 
2917         bp = brand_register_zone(attrp);
2918         kmem_free(attrp, sizeof (struct brand_attr));
2919         if (bp == NULL)
2920                 return (EINVAL);
2921 
2922         /*
2923          * This is the only place where a zone can change its brand.
2924          * We already need to hold zone_status_lock to check the zone
2925          * status, so we'll just use that lock to serialize zone
2926          * branding requests as well.
2927          */
2928         mutex_enter(&zone_status_lock);
2929 
2930         /* Re-branding is not allowed, and branding must precede zone boot */
2931         if ((ZONE_IS_BRANDED(zone)) ||
2932             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2933                 mutex_exit(&zone_status_lock);
2934                 brand_unregister_zone(bp);
2935                 return (EINVAL);
2936         }
2937 
2938         /*
2939          * Set up the brand specific data.
2940          * Note that it's possible that the hook has to drop the
2941          * zone_status_lock and reacquire it before returning, so we can't
2942          * assume the lock has been held the entire time.
2943          */
2944         zone->zone_brand = bp;
2945         ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
2946 
2947         mutex_exit(&zone_status_lock);
2948         return (0);
2949 }
2950 
2951 static int
2952 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2953 {
2954         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2955         int err = 0;
2956 
2957         ASSERT(zone != global_zone);
2958         if ((err = copyinstr(zone_fs_allowed, buf,
2959             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2960                 goto done;
2961 
2962         if (zone->zone_fs_allowed != NULL)
2963                 strfree(zone->zone_fs_allowed);
2964 
2965         zone->zone_fs_allowed = strdup(buf);
2966 
2967 done:
2968         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2969         return (err);
2970 }
2971 
2972 static int
2973 zone_set_initname(zone_t *zone, const char *zone_initname)
2974 {
2975         char initname[INITNAME_SZ];
2976         size_t len;
2977         int err = 0;
2978 
2979         ASSERT(zone != global_zone);
2980         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2981                 return (err);   /* EFAULT or ENAMETOOLONG */
2982 
2983         if (zone->zone_initname != NULL)
2984                 strfree(zone->zone_initname);
2985 
2986         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2987         (void) strcpy(zone->zone_initname, initname);
2988         return (0);
2989 }
2990 
2991 /*
2992  * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
2993  * to provide the physical memory capping kstats.  Since physical memory
2994  * capping is currently implemented in userland, that code uses the setattr
2995  * entry point to increment the kstats.  We increment zone_mcap_nover on
2996  * every nover setattr call, and add the caller-supplied value to
2997  * zone_mcap_pagedout on every pageout setattr call.
2998  */
2999 /*ARGSUSED*/
3000 static int
3001 zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
3002 {
3003         zone->zone_mcap_nover++;
3004 
3005         return (0);
3006 }
3007 
3008 static int
3009 zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
3010 {
3011         uint64_t pageout;
3012         int err;
3013 
3014         if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
3015                 zone->zone_mcap_pagedout += pageout;
3016 
3017         return (err);
3018 }
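     /*
      * A sketch of the expected user-land usage; the attribute constant name
      * below is an assumption for illustration:
      *
      *         uint64_t pages = n_paged_out;
      *
      *         (void) zone_setattr(zoneid, ZONE_ATTR_PMCAP_PAGEOUT,
      *             &pages, sizeof (pages));
      */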
3019 
3020 /*
3021  * The zone_set_page_fault_delay function is used to set the number of usecs
3022  * to throttle page faults.  This is normally 0 but can be set to a non-0 value
3023  * by the user-land memory capping code when the zone is over its physical
3024  * memory cap.
3025  */
3026 static int
3027 zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
3028 {
3029         uint32_t dusec;
3030         int err;
3031 
3032         if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
3033                 zone->zone_pg_flt_delay = dusec;
3034 
3035         return (err);
3036 }
3037 
3038 /*
3039  * The zone_set_rss function is used to set the zone's RSS when we do the
3040  * fast, approximate calculation in user-land.
3041  */
3042 static int
3043 zone_set_rss(zone_t *zone, const uint64_t *prss)
3044 {
3045         uint64_t rss;
3046         int err;
3047 
3048         if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
3049                 zone->zone_phys_mem = rss;
3050 
3051         return (err);
3052 }
3053 
3054 static int
3055 zone_set_sched_class(zone_t *zone, const char *new_class)
3056 {
3057         char sched_class[PC_CLNMSZ];
3058         id_t classid;
3059         int err;
3060 
3061         ASSERT(zone != global_zone);
3062         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
3063                 return (err);   /* EFAULT or ENAMETOOLONG */
3064 
3065         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
3066                 return (set_errno(EINVAL));
3067         zone->zone_defaultcid = classid;
3068         ASSERT(zone->zone_defaultcid > 0 &&
3069             zone->zone_defaultcid < loaded_classes);
3070 
3071         return (0);
3072 }
3073 
3074 /*
3075  * Block indefinitely waiting for (zone_status >= status)
3076  */
3077 void
3078 zone_status_wait(zone_t *zone, zone_status_t status)
3079 {
3080         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3081 
3082         mutex_enter(&zone_status_lock);
3083         while (zone->zone_status < status) {
3084                 cv_wait(&zone->zone_cv, &zone_status_lock);
3085         }
3086         mutex_exit(&zone_status_lock);
3087 }
3088 
3089 /*
3090  * Private CPR-safe version of zone_status_wait().
3091  */
3092 static void
3093 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
3094 {
3095         callb_cpr_t cprinfo;
3096 
3097         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3098 
3099         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
3100             str);
3101         mutex_enter(&zone_status_lock);
3102         while (zone->zone_status < status) {
3103                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
3104                 cv_wait(&zone->zone_cv, &zone_status_lock);
3105                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
3106         }
3107         /*
3108          * zone_status_lock is implicitly released by the following.
3109          */
3110         CALLB_CPR_EXIT(&cprinfo);
3111 }
3112 
3113 /*
3114  * Block until zone enters requested state or signal is received.  Return (0)
3115  * if signaled, non-zero otherwise.
3116  */
3117 int
3118 zone_status_wait_sig(zone_t *zone, zone_status_t status)
3119 {
3120         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3121 
3122         mutex_enter(&zone_status_lock);
3123         while (zone->zone_status < status) {
3124                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
3125                         mutex_exit(&zone_status_lock);
3126                         return (0);
3127                 }
3128         }
3129         mutex_exit(&zone_status_lock);
3130         return (1);
3131 }
3132 
3133 /*
3134  * Block until the zone enters the requested state or the timeout expires,
3135  * whichever happens first.  Return (-1) if operation timed out, time remaining
3136  * otherwise.
3137  */
3138 clock_t
3139 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
3140 {
3141         clock_t timeleft = 0;
3142 
3143         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3144 
3145         mutex_enter(&zone_status_lock);
3146         while (zone->zone_status < status && timeleft != -1) {
3147                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
3148         }
3149         mutex_exit(&zone_status_lock);
3150         return (timeleft);
3151 }
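     /*
      * Note that "tim" is an absolute time expressed in lbolt ticks, so a
      * hypothetical caller willing to wait five seconds would do:
      *
      *         clock_t deadline = ddi_get_lbolt() + SEC_TO_TICK(5);
      *
      *         if (zone_status_timedwait(zp, deadline, ZONE_IS_DOWN) == -1)
      *                 cmn_err(CE_NOTE, "timed out waiting for zone");
      */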
3152 
3153 /*
3154  * Block until the zone enters the requested state, the current process is
3155  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
3156  * operation timed out, 0 if signaled, time remaining otherwise.
3157  */
3158 clock_t
3159 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
3160 {
3161         clock_t timeleft = tim - ddi_get_lbolt();
3162 
3163         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3164 
3165         mutex_enter(&zone_status_lock);
3166         while (zone->zone_status < status) {
3167                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
3168                     tim);
3169                 if (timeleft <= 0)
3170                         break;
3171         }
3172         mutex_exit(&zone_status_lock);
3173         return (timeleft);
3174 }
3175 
3176 /*
3177  * Zones have two reference counts: one for references from credential
3178  * structures (zone_cred_ref), and one (zone_ref) for everything else.
3179  * This is so we can allow a zone to be rebooted while there are still
3180  * outstanding cred references, since certain drivers cache dblks (which
3181  * implicitly results in cached creds).  We wait for zone_ref to drop to
3182  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
3183  * later freed when the zone_cred_ref drops to 0, though nothing other
3184  * than the zone id and privilege set should be accessed once the zone
3185  * is "dead".
3186  *
3187  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
3188  * to force halt/reboot to block waiting for the zone_cred_ref to drop
3189  * to 0.  This can be useful to flush out other sources of cached creds
3190  * that may be less innocuous than the driver case.
3191  *
3192  * Zones also provide a tracked reference counting mechanism in which zone
3193  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
3194  * debuggers determine the sources of leaked zone references.  See
3195  * zone_hold_ref() and zone_rele_ref() below for more information.
3196  */
3197 
3198 int zone_wait_for_cred = 0;
3199 
3200 static void
3201 zone_hold_locked(zone_t *z)
3202 {
3203         ASSERT(MUTEX_HELD(&z->zone_lock));
3204         z->zone_ref++;
3205         ASSERT(z->zone_ref != 0);
3206 }
3207 
3208 /*
3209  * Increment the specified zone's reference count.  The zone's zone_t structure
3210  * will not be freed as long as the zone's reference count is nonzero.
3211  * Decrement the zone's reference count via zone_rele().
3212  *
3213  * NOTE: This function should only be used to hold zones for short periods of
3214  * time.  Use zone_hold_ref() if the zone must be held for a long time.
3215  */
3216 void
3217 zone_hold(zone_t *z)
3218 {
3219         mutex_enter(&z->zone_lock);
3220         zone_hold_locked(z);
3221         mutex_exit(&z->zone_lock);
3222 }
3223 
3224 /*
3225  * If the non-cred ref count drops to 1 and either the cred ref count
3226  * is 0 or we aren't waiting for cred references, the zone is ready to
3227  * be destroyed.
3228  */
3229 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
3230             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
3231 
3232 /*
3233  * Common zone reference release function invoked by zone_rele() and
3234  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
3235  * zone's subsystem-specific reference counters are not affected by the
3236  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
3237  * removed from the specified zone's reference list.  ref must be non-NULL iff
3238  * subsys is not ZONE_REF_NUM_SUBSYS.
3239  */
3240 static void
3241 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3242 {
3243         boolean_t wakeup;
3244 
3245         mutex_enter(&z->zone_lock);
3246         ASSERT(z->zone_ref != 0);
3247         z->zone_ref--;
3248         if (subsys != ZONE_REF_NUM_SUBSYS) {
3249                 ASSERT(z->zone_subsys_ref[subsys] != 0);
3250                 z->zone_subsys_ref[subsys]--;
3251                 list_remove(&z->zone_ref_list, ref);
3252         }
3253         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3254                 /* no more refs, free the structure */
3255                 mutex_exit(&z->zone_lock);
3256                 zone_free(z);
3257                 return;
3258         }
3259         /* signal zone_destroy so the zone can finish halting */
3260         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
3261         mutex_exit(&z->zone_lock);
3262 
3263         if (wakeup) {
3264                 /*
3265                  * Grabbing zonehash_lock here effectively synchronizes with
3266                  * zone_destroy() to avoid missed signals.
3267                  */
3268                 mutex_enter(&zonehash_lock);
3269                 cv_broadcast(&zone_destroy_cv);
3270                 mutex_exit(&zonehash_lock);
3271         }
3272 }
3273 
3274 /*
3275  * Decrement the specified zone's reference count.  The specified zone will
3276  * cease to exist after this function returns if the reference count drops to
3277  * zero.  This function should be paired with zone_hold().
3278  */
3279 void
3280 zone_rele(zone_t *z)
3281 {
3282         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
3283 }
3284 
3285 /*
3286  * Initialize a zone reference structure.  This function must be invoked for
3287  * a reference structure before the structure is passed to zone_hold_ref().
3288  */
3289 void
3290 zone_init_ref(zone_ref_t *ref)
3291 {
3292         ref->zref_zone = NULL;
3293         list_link_init(&ref->zref_linkage);
3294 }
3295 
3296 /*
3297  * Acquire a reference to zone z.  The caller must specify the
3298  * zone_ref_subsys_t constant associated with its subsystem.  The specified
3299  * zone_ref_t structure will represent a reference to the specified zone.  Use
3300  * zone_rele_ref() to release the reference.
3301  *
3302  * The referenced zone_t structure will not be freed as long as the zone_t's
3303  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
3304  * references.
3305  *
3306  * NOTE: The zone_ref_t structure must be initialized before it is used.
3307  * See zone_init_ref() above.
3308  */
3309 void
3310 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3311 {
3312         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
3313 
3314         /*
3315          * Prevent consumers from reusing a reference structure before
3316          * releasing it.
3317          */
3318         VERIFY(ref->zref_zone == NULL);
3319 
3320         ref->zref_zone = z;
3321         mutex_enter(&z->zone_lock);
3322         zone_hold_locked(z);
3323         z->zone_subsys_ref[subsys]++;
3324         ASSERT(z->zone_subsys_ref[subsys] != 0);
3325         list_insert_head(&z->zone_ref_list, ref);
3326         mutex_exit(&z->zone_lock);
3327 }
3328 
3329 /*
3330  * Release the zone reference represented by the specified zone_ref_t.
3331  * The reference is invalid after it's released; however, the zone_ref_t
3332  * structure can be reused without having to invoke zone_init_ref().
3333  * subsys should be the same value that was passed to zone_hold_ref()
3334  * when the reference was acquired.
3335  */
3336 void
3337 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
3338 {
3339         zone_rele_common(ref->zref_zone, ref, subsys);
3340 
3341         /*
3342          * Set the zone_ref_t's zref_zone field to NULL to generate panics
3343          * when consumers dereference the reference.  This helps us catch
3344          * consumers who use released references.  Furthermore, this lets
3345          * consumers reuse the zone_ref_t structure without having to
3346          * invoke zone_init_ref().
3347          */
3348         ref->zref_zone = NULL;
3349 }
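     /*
      * A minimal sketch of the tracked ("crumb") reference pattern described
      * above; ZONE_REF_NFS stands in for whichever zone_ref_subsys_t constant
      * the calling subsystem owns:
      *
      *         zone_ref_t zref;
      *
      *         zone_init_ref(&zref);
      *         zone_hold_ref(zp, &zref, ZONE_REF_NFS);
      *         ... long-lived use of zp ...
      *         zone_rele_ref(&zref, ZONE_REF_NFS);
      */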
3350 
3351 void
3352 zone_cred_hold(zone_t *z)
3353 {
3354         mutex_enter(&z->zone_lock);
3355         z->zone_cred_ref++;
3356         ASSERT(z->zone_cred_ref != 0);
3357         mutex_exit(&z->zone_lock);
3358 }
3359 
3360 void
3361 zone_cred_rele(zone_t *z)
3362 {
3363         boolean_t wakeup;
3364 
3365         mutex_enter(&z->zone_lock);
3366         ASSERT(z->zone_cred_ref != 0);
3367         z->zone_cred_ref--;
3368         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3369                 /* no more refs, free the structure */
3370                 mutex_exit(&z->zone_lock);
3371                 zone_free(z);
3372                 return;
3373         }
3374         /*
3375          * If zone_destroy is waiting for the cred references to drain
3376          * out, and they have, signal it.
3377          */
3378         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
3379             zone_status_get(z) >= ZONE_IS_DEAD);
3380         mutex_exit(&z->zone_lock);
3381 
3382         if (wakeup) {
3383                 /*
3384                  * Grabbing zonehash_lock here effectively synchronizes with
3385                  * zone_destroy() to avoid missed signals.
3386                  */
3387                 mutex_enter(&zonehash_lock);
3388                 cv_broadcast(&zone_destroy_cv);
3389                 mutex_exit(&zonehash_lock);
3390         }
3391 }
3392 
3393 void
3394 zone_task_hold(zone_t *z)
3395 {
3396         mutex_enter(&z->zone_lock);
3397         z->zone_ntasks++;
3398         ASSERT(z->zone_ntasks != 0);
3399         mutex_exit(&z->zone_lock);
3400 }
3401 
3402 void
3403 zone_task_rele(zone_t *zone)
3404 {
3405         uint_t refcnt;
3406 
3407         mutex_enter(&zone->zone_lock);
3408         ASSERT(zone->zone_ntasks != 0);
3409         refcnt = --zone->zone_ntasks;
3410         if (refcnt > 1) {       /* Common case */
3411                 mutex_exit(&zone->zone_lock);
3412                 return;
3413         }
3414         zone_hold_locked(zone); /* so we can use the zone_t later */
3415         mutex_exit(&zone->zone_lock);
3416         if (refcnt == 1) {
3417                 /*
3418                  * See if the zone is shutting down.
3419                  */
3420                 mutex_enter(&zone_status_lock);
3421                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
3422                         goto out;
3423                 }
3424 
3425                 /*
3426                  * Make sure the ntasks didn't change since we
3427                  * dropped zone_lock.
3428                  */
3429                 mutex_enter(&zone->zone_lock);
3430                 if (refcnt != zone->zone_ntasks) {
3431                         mutex_exit(&zone->zone_lock);
3432                         goto out;
3433                 }
3434                 mutex_exit(&zone->zone_lock);
3435 
3436                 /*
3437                  * No more user processes in the zone.  The zone is empty.
3438                  */
3439                 zone_status_set(zone, ZONE_IS_EMPTY);
3440                 goto out;
3441         }
3442 
3443         ASSERT(refcnt == 0);
3444         /*
3445          * zsched has exited; the zone is dead.
3446          */
3447         zone->zone_zsched = NULL;            /* paranoia */
3448         mutex_enter(&zone_status_lock);
3449         zone_status_set(zone, ZONE_IS_DEAD);
3450 out:
3451         mutex_exit(&zone_status_lock);
3452         zone_rele(zone);
3453 }
3454 
3455 zoneid_t
3456 getzoneid(void)
3457 {
3458         return (curproc->p_zone->zone_id);
3459 }
3460 
3461 zoneid_t
3462 getzonedid(void)
3463 {
3464         return (curproc->p_zone->zone_did);
3465 }
3466 
3467 /*
3468  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3469  * check the validity of a zone's state.
3470  */
3471 static zone_t *
3472 zone_find_all_by_id(zoneid_t zoneid)
3473 {
3474         mod_hash_val_t hv;
3475         zone_t *zone = NULL;
3476 
3477         ASSERT(MUTEX_HELD(&zonehash_lock));
3478 
3479         if (mod_hash_find(zonehashbyid,
3480             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3481                 zone = (zone_t *)hv;
3482         return (zone);
3483 }
3484 
3485 static zone_t *
3486 zone_find_all_by_label(const ts_label_t *label)
3487 {
3488         mod_hash_val_t hv;
3489         zone_t *zone = NULL;
3490 
3491         ASSERT(MUTEX_HELD(&zonehash_lock));
3492 
3493         /*
3494          * zonehashbylabel is not maintained for unlabeled systems
3495          */
3496         if (!is_system_labeled())
3497                 return (NULL);
3498         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3499                 zone = (zone_t *)hv;
3500         return (zone);
3501 }
3502 
3503 static zone_t *
3504 zone_find_all_by_name(char *name)
3505 {
3506         mod_hash_val_t hv;
3507         zone_t *zone = NULL;
3508 
3509         ASSERT(MUTEX_HELD(&zonehash_lock));
3510 
3511         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3512                 zone = (zone_t *)hv;
3513         return (zone);
3514 }
3515 
3516 /*
3517  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3518  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3519  * Caller must call zone_rele() once it is done with the zone.
3520  *
3521  * The zone may begin the zone_destroy() sequence immediately after this
3522  * function returns, but may be safely used until zone_rele() is called.
3523  */
3524 zone_t *
3525 zone_find_by_id(zoneid_t zoneid)
3526 {
3527         zone_t *zone;
3528         zone_status_t status;
3529 
3530         mutex_enter(&zonehash_lock);
3531         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3532                 mutex_exit(&zonehash_lock);
3533                 return (NULL);
3534         }
3535         status = zone_status_get(zone);
3536         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3537                 /*
3538                  * For all practical purposes the zone doesn't exist.
3539                  */
3540                 mutex_exit(&zonehash_lock);
3541                 return (NULL);
3542         }
3543         zone_hold(zone);
3544         mutex_exit(&zonehash_lock);
3545         return (zone);
3546 }
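     /*
      * Sketch of the expected lookup pattern (illustrative only):
      *
      *         zone_t *zp;
      *
      *         if ((zp = zone_find_by_id(zoneid)) == NULL)
      *                 return (EINVAL);
      *         ... use zp; it cannot be freed until the release below ...
      *         zone_rele(zp);
      */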
3547 
3548 /*
3549  * Similar to zone_find_by_id, but using zone label as the key.
3550  */
3551 zone_t *
3552 zone_find_by_label(const ts_label_t *label)
3553 {
3554         zone_t *zone;
3555         zone_status_t status;
3556 
3557         mutex_enter(&zonehash_lock);
3558         if ((zone = zone_find_all_by_label(label)) == NULL) {
3559                 mutex_exit(&zonehash_lock);
3560                 return (NULL);
3561         }
3562 
3563         status = zone_status_get(zone);
3564         if (status > ZONE_IS_DOWN) {
3565                 /*
3566                  * For all practical purposes the zone doesn't exist.
3567                  */
3568                 mutex_exit(&zonehash_lock);
3569                 return (NULL);
3570         }
3571         zone_hold(zone);
3572         mutex_exit(&zonehash_lock);
3573         return (zone);
3574 }
3575 
3576 /*
3577  * Similar to zone_find_by_id, but using zone name as the key.
3578  */
3579 zone_t *
3580 zone_find_by_name(char *name)
3581 {
3582         zone_t *zone;
3583         zone_status_t status;
3584 
3585         mutex_enter(&zonehash_lock);
3586         if ((zone = zone_find_all_by_name(name)) == NULL) {
3587                 mutex_exit(&zonehash_lock);
3588                 return (NULL);
3589         }
3590         status = zone_status_get(zone);
3591         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3592                 /*
3593                  * For all practical purposes the zone doesn't exist.
3594                  */
3595                 mutex_exit(&zonehash_lock);
3596                 return (NULL);
3597         }
3598         zone_hold(zone);
3599         mutex_exit(&zonehash_lock);
3600         return (zone);
3601 }
3602 
3603 /*
3604  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3605  * if there is a zone "foo" rooted at /foo/root, and the path argument
3606  * is "/foo/root/proc", it will return the held zone_t corresponding to
3607  * zone "foo".
3608  *
3609  * zone_find_by_path() always returns a non-NULL value, since at the
3610  * very least every path will be contained in the global zone.
3611  *
3612  * As with the other zone_find_by_*() functions, the caller is
3613  * responsible for zone_rele()ing the return value of this function.
3614  */
3615 zone_t *
3616 zone_find_by_path(const char *path)
3617 {
3618         zone_t *zone;
3619         zone_t *zret = NULL;
3620         zone_status_t status;
3621 
3622         if (path == NULL) {
3623                 /*
3624                  * Call from rootconf().
3625                  */
3626                 zone_hold(global_zone);
3627                 return (global_zone);
3628         }
3629         ASSERT(*path == '/');
3630         mutex_enter(&zonehash_lock);
3631         for (zone = list_head(&zone_active); zone != NULL;
3632             zone = list_next(&zone_active, zone)) {
3633                 if (ZONE_PATH_VISIBLE(path, zone))
3634                         zret = zone;
3635         }
3636         ASSERT(zret != NULL);
3637         status = zone_status_get(zret);
3638         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3639                 /*
3640                  * Zone practically doesn't exist.
3641                  */
3642                 zret = global_zone;
3643         }
3644         zone_hold(zret);
3645         mutex_exit(&zonehash_lock);
3646         return (zret);
3647 }
3648 
3649 /*
3650  * Public interface for updating per-zone load averages.  Called once per
3651  * second.
3652  *
3653  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3654  */
3655 void
3656 zone_loadavg_update(void)
3657 {
3658         zone_t *zp;
3659         zone_status_t status;
3660         struct loadavg_s *lavg;
3661         hrtime_t zone_total;
3662         int i;
3663         hrtime_t hr_avg;
3664         int nrun;
3665         static int64_t f[3] = { 135, 27, 9 };
3666         int64_t q, r;
3667 
3668         mutex_enter(&zonehash_lock);
3669         for (zp = list_head(&zone_active); zp != NULL;
3670             zp = list_next(&zone_active, zp)) {
3671                 mutex_enter(&zp->zone_lock);
3672 
3673                 /* Skip zones that are on the way down or not yet up */
3674                 status = zone_status_get(zp);
3675                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3676                         /* For all practical purposes the zone doesn't exist. */
3677                         mutex_exit(&zp->zone_lock);
3678                         continue;
3679                 }
3680 
3681                 /*
3682                  * Update the 10 second moving average data in zone_loadavg.
3683                  */
3684                 lavg = &zp->zone_loadavg;
3685 
3686                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3687                 scalehrtime(&zone_total);
3688 
3689                 /* The zone_total should always be increasing. */
3690                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3691                     zone_total - lavg->lg_total : 0;
3692                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3693                 /* lg_total holds the prev. 1 sec. total */
3694                 lavg->lg_total = zone_total;
3695 
3696                 /*
3697                  * To simplify the calculation, we don't calculate the load avg.
3698                  * until the zone has been up for at least 10 seconds and our
3699                  * moving average is thus full.
3700                  */
3701                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3702                         lavg->lg_len++;
3703                         mutex_exit(&zp->zone_lock);
3704                         continue;
3705                 }
3706 
3707                 /* Now calculate the 1-min, 5-min and 15-min load averages. */
3708                 hr_avg = 0;
3709                 for (i = 0; i < S_LOADAVG_SZ; i++)
3710                         hr_avg += lavg->lg_loads[i];
3711                 hr_avg = hr_avg / S_LOADAVG_SZ;
3712                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3713 
3714                 /* Compute load avg. See comment in calcloadavg() */
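                     /*
                      * A note on the constants (a reading of the arithmetic
                      * below, not a normative statement): f[i]/8192
                      * approximates 1 - e^(-1/T) for T = 60, 300 and 900
                      * seconds, e.g. 135/8192 ~= 0.0165 ~= 1 - e^(-1/60),
                      * giving per-second exponential decay in fixed point.
                      */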
3715                 for (i = 0; i < 3; i++) {
3716                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3717                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3718                         zp->zone_hp_avenrun[i] +=
3719                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3720 
3721                         /* avenrun[] can only hold 31 bits of load avg. */
3722                         if (zp->zone_hp_avenrun[i] <
3723                             ((uint64_t)1<<(31+16-FSHIFT)))
3724                                 zp->zone_avenrun[i] = (int32_t)
3725                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3726                         else
3727                                 zp->zone_avenrun[i] = 0x7fffffff;
3728                 }
3729 
3730                 mutex_exit(&zp->zone_lock);
3731         }
3732         mutex_exit(&zonehash_lock);
3733 }
3734 
3735 /*
3736  * Get the number of cpus visible to this zone.  The system-wide global
3737  * 'ncpus' is returned if pools are disabled, the caller is in the
3738  * global zone, or a NULL zone argument is passed in.
3739  */
3740 int
3741 zone_ncpus_get(zone_t *zone)
3742 {
3743         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3744 
3745         return (myncpus != 0 ? myncpus : ncpus);
3746 }
3747 
3748 /*
3749  * Get the number of online cpus visible to this zone.  The system-wide
3750  * global 'ncpus_online' is returned if pools are disabled, the caller
3751  * is in the global zone, or a NULL zone argument is passed in.
3752  */
3753 int
3754 zone_ncpus_online_get(zone_t *zone)
3755 {
3756         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3757 
3758         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3759 }
3760 
3761 /*
3762  * Return the pool to which the zone is currently bound.
3763  */
3764 pool_t *
3765 zone_pool_get(zone_t *zone)
3766 {
3767         ASSERT(pool_lock_held());
3768 
3769         return (zone->zone_pool);
3770 }
3771 
3772 /*
3773  * Set the zone's pool pointer and update the zone's visibility to match
3774  * the resources in the new pool.
3775  */
3776 void
3777 zone_pool_set(zone_t *zone, pool_t *pool)
3778 {
3779         ASSERT(pool_lock_held());
3780         ASSERT(MUTEX_HELD(&cpu_lock));
3781 
3782         zone->zone_pool = pool;
3783         zone_pset_set(zone, pool->pool_pset->pset_id);
3784 }
3785 
3786 /*
3787  * Return the cached value of the id of the processor set to which the
3788  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3789  * facility is disabled.
3790  */
3791 psetid_t
3792 zone_pset_get(zone_t *zone)
3793 {
3794         ASSERT(MUTEX_HELD(&cpu_lock));
3795 
3796         return (zone->zone_psetid);
3797 }
3798 
3799 /*
3800  * Set the cached value of the id of the processor set to which the zone
3801  * is currently bound.  Also update the zone's visibility to match the
3802  * resources in the new processor set.
3803  */
3804 void
3805 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3806 {
3807         psetid_t oldpsetid;
3808 
3809         ASSERT(MUTEX_HELD(&cpu_lock));
3810         oldpsetid = zone_pset_get(zone);
3811 
3812         if (oldpsetid == newpsetid)
3813                 return;
3814         /*
3815          * Global zone sees all.
3816          */
3817         if (zone != global_zone) {
3818                 zone->zone_psetid = newpsetid;
3819                 if (newpsetid != ZONE_PS_INVAL)
3820                         pool_pset_visibility_add(newpsetid, zone);
3821                 if (oldpsetid != ZONE_PS_INVAL)
3822                         pool_pset_visibility_remove(oldpsetid, zone);
3823         }
3824         /*
3825          * Disabling pools, so we should start using the global values
3826          * for ncpus and ncpus_online.
3827          */
3828         if (newpsetid == ZONE_PS_INVAL) {
3829                 zone->zone_ncpus = 0;
3830                 zone->zone_ncpus_online = 0;
3831         }
3832 }
3833 
3834 /*
3835  * Walk the list of active zones and issue the provided callback for
3836  * each of them.
3837  *
3838  * Caller must not be holding any locks that may be acquired under
3839  * zonehash_lock.  See comment at the beginning of the file for a list of
3840  * common locks and their interactions with zones.
3841  */
3842 int
3843 zone_walk(int (*cb)(zone_t *, void *), void *data)
3844 {
3845         zone_t *zone;
3846         int ret = 0;
3847         zone_status_t status;
3848 
3849         mutex_enter(&zonehash_lock);
3850         for (zone = list_head(&zone_active); zone != NULL;
3851             zone = list_next(&zone_active, zone)) {
3852                 /*
3853                  * Skip zones that shouldn't be externally visible.
3854                  */
3855                 status = zone_status_get(zone);
3856                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3857                         continue;
3858                 /*
3859                  * Bail immediately if any callback invocation returns a
3860                  * non-zero value.
3861                  */
3862                 ret = (*cb)(zone, data);
3863                 if (ret != 0)
3864                         break;
3865         }
3866         mutex_exit(&zonehash_lock);
3867         return (ret);
3868 }
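     /*
      * Sketch of a zone_walk() consumer; the callback and counter are
      * illustrative:
      *
      *         static int
      *         count_zones_cb(zone_t *zp, void *arg)
      *         {
      *                 (*(uint_t *)arg)++;
      *                 return (0);             /* keep walking */
      *         }
      *
      *         uint_t nzones = 0;
      *         (void) zone_walk(count_zones_cb, &nzones);
      */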
3869 
3870 static int
3871 zone_set_root(zone_t *zone, const char *upath)
3872 {
3873         vnode_t *vp;
3874         int trycount;
3875         int error = 0;
3876         char *path;
3877         struct pathname upn, pn;
3878         size_t pathlen;
3879 
3880         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3881                 return (error);
3882 
3883         pn_alloc(&pn);
3884 
3885         /* prevent infinite loop */
3886         trycount = 10;
3887         for (;;) {
3888                 if (--trycount <= 0) {
3889                         error = ESTALE;
3890                         goto out;
3891                 }
3892 
3893                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3894                         /*
3895                          * VOP_ACCESS() may cover 'vp' with a new
3896                          * filesystem, if 'vp' is an autoFS vnode.
3897                          * Get the new 'vp' if so.
3898                          */
3899                         if ((error =
3900                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3901                             (!vn_ismntpt(vp) ||
3902                             (error = traverse(&vp)) == 0)) {
3903                                 pathlen = pn.pn_pathlen + 2;
3904                                 path = kmem_alloc(pathlen, KM_SLEEP);
3905                                 (void) strncpy(path, pn.pn_path,
3906                                     pn.pn_pathlen + 1);
3907                                 path[pathlen - 2] = '/';
3908                                 path[pathlen - 1] = '\0';
3909                                 pn_free(&pn);
3910                                 pn_free(&upn);
3911 
3912                                 /* Success! */
3913                                 break;
3914                         }
3915                         VN_RELE(vp);
3916                 }
3917                 if (error != ESTALE)
3918                         goto out;
3919         }
3920 
3921         ASSERT(error == 0);
3922         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3923         zone->zone_rootpath = path;
3924         zone->zone_rootpathlen = pathlen;
3925         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3926                 zone->zone_flags |= ZF_IS_SCRATCH;
3927         return (0);
3928 
3929 out:
3930         pn_free(&pn);
3931         pn_free(&upn);
3932         return (error);
3933 }
3934 
3935 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3936                         ((c) >= 'a' && (c) <= 'z') || \
3937                         ((c) >= 'A' && (c) <= 'Z'))
3938 
3939 static int
3940 zone_set_name(zone_t *zone, const char *uname)
3941 {
3942         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3943         size_t len;
3944         int i, err;
3945 
3946         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3947                 kmem_free(kname, ZONENAME_MAX);
3948                 return (err);   /* EFAULT or ENAMETOOLONG */
3949         }
3950 
3951         /* must be less than ZONENAME_MAX */
3952         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3953                 kmem_free(kname, ZONENAME_MAX);
3954                 return (EINVAL);
3955         }
3956 
3957         /*
3958          * Name must start with an alphanumeric and must contain only
3959          * alphanumerics, '-', '_' and '.'.
3960          */
3961         if (!isalnum(kname[0])) {
3962                 kmem_free(kname, ZONENAME_MAX);
3963                 return (EINVAL);
3964         }
3965         for (i = 1; i < len - 1; i++) {
3966                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3967                     kname[i] != '.') {
3968                         kmem_free(kname, ZONENAME_MAX);
3969                         return (EINVAL);
3970                 }
3971         }
3972 
3973         zone->zone_name = kname;
3974         return (0);
3975 }
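     /*
      * For example, "web01" and "db-1.prod" pass the checks above, while
      * "-web" (leading non-alphanumeric) and "web zone" (embedded space)
      * are rejected with EINVAL.
      */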
3976 
3977 /*
3978  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3979  * is NULL or it points to a zone with no hostid emulation, then the machine's
3980  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3981  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3982  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3983  * hostid and the machine's hostid is invalid.
3984  */
3985 uint32_t
3986 zone_get_hostid(zone_t *zonep)
3987 {
3988         unsigned long machine_hostid;
3989 
3990         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3991                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3992                         return (HW_INVALID_HOSTID);
3993                 return ((uint32_t)machine_hostid);
3994         }
3995         return (zonep->zone_hostid);
3996 }
3997 
3998 /*
3999  * Similar to thread_create(), but makes sure the thread is in the appropriate
4000  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
4001  */
4002 /*ARGSUSED*/
4003 kthread_t *
4004 zthread_create(
4005     caddr_t stk,
4006     size_t stksize,
4007     void (*proc)(),
4008     void *arg,
4009     size_t len,
4010     pri_t pri)
4011 {
4012         kthread_t *t;
4013         zone_t *zone = curproc->p_zone;
4014         proc_t *pp = zone->zone_zsched;
4015 
4016         zone_hold(zone);        /* Reference to be dropped when thread exits */
4017 
4018         /*
4019          * No-one should be trying to create threads if the zone is shutting
4020          * down and there aren't any kernel threads around.  See comment
4021          * in zthread_exit().
4022          */
4023         ASSERT(!(zone->zone_kthreads == NULL &&
4024             zone_status_get(zone) >= ZONE_IS_EMPTY));
4025         /*
4026          * Create a thread, but don't let it run until we've finished setting
4027          * things up.
4028          */
4029         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
4030         ASSERT(t->t_forw == NULL);
4031         mutex_enter(&zone_status_lock);
4032         if (zone->zone_kthreads == NULL) {
4033                 t->t_forw = t->t_back = t;
4034         } else {
4035                 kthread_t *tx = zone->zone_kthreads;
4036 
4037                 t->t_forw = tx;
4038                 t->t_back = tx->t_back;
4039                 tx->t_back->t_forw = t;
4040                 tx->t_back = t;
4041         }
4042         zone->zone_kthreads = t;
4043         mutex_exit(&zone_status_lock);
4044 
4045         mutex_enter(&pp->p_lock);
4046         t->t_proc_flag |= TP_ZTHREAD;
4047         project_rele(t->t_proj);
4048         t->t_proj = project_hold(pp->p_task->tk_proj);
4049 
4050         /*
4051          * Setup complete, let it run.
4052          */
4053         thread_lock(t);
4054         t->t_schedflag |= TS_ALLSTART;
4055         setrun_locked(t);
4056         thread_unlock(t);
4057 
4058         mutex_exit(&pp->p_lock);
4059 
4060         return (t);
4061 }
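     /*
      * Illustrative use, mirroring thread_create(); the worker function and
      * its argument are hypothetical:
      *
      *         kthread_t *t;
      *
      *         t = zthread_create(NULL, 0, my_zone_worker, arg, 0,
      *             minclsyspri);
      */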
4062 
4063 /*
4064  * Similar to thread_exit().  Must be called by threads created via
4065  * zthread_create().
4066  */
4067 void
4068 zthread_exit(void)
4069 {
4070         kthread_t *t = curthread;
4071         proc_t *pp = curproc;
4072         zone_t *zone = pp->p_zone;
4073 
4074         mutex_enter(&zone_status_lock);
4075 
4076         /*
4077          * Reparent to p0
4078          */
4079         kpreempt_disable();
4080         mutex_enter(&pp->p_lock);
4081         t->t_proc_flag &= ~TP_ZTHREAD;
4082         t->t_procp = &p0;
4083         hat_thread_exit(t);
4084         mutex_exit(&pp->p_lock);
4085         kpreempt_enable();
4086 
4087         if (t->t_back == t) {
4088                 ASSERT(t->t_forw == t);
4089                 /*
4090                  * If the zone is empty, once the thread count
4091                  * goes to zero no further kernel threads can be
4092                  * created.  This is because if the creator is a process
4093                  * in the zone, then it must have exited before the zone
4094                  * state could be set to ZONE_IS_EMPTY.
4095                  * Otherwise, if the creator is a kernel thread in the
4096                  * zone, the thread count is non-zero.
4097                  *
4098                  * This really means that non-zone kernel threads should
4099                  * not create zone kernel threads.
4100                  */
4101                 zone->zone_kthreads = NULL;
4102                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
4103                         zone_status_set(zone, ZONE_IS_DOWN);
4104                         /*
4105                          * Remove any CPU caps on this zone.
4106                          */
4107                         cpucaps_zone_remove(zone);
4108                 }
4109         } else {
4110                 t->t_forw->t_back = t->t_back;
4111                 t->t_back->t_forw = t->t_forw;
4112                 if (zone->zone_kthreads == t)
4113                         zone->zone_kthreads = t->t_forw;
4114         }
4115         mutex_exit(&zone_status_lock);
4116         zone_rele(zone);
4117         thread_exit();
4118         /* NOTREACHED */
4119 }
4120 
4121 static void
4122 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
4123 {
4124         vnode_t *oldvp;
4125 
4126         /* we're going to hold a reference here to the directory */
4127         VN_HOLD(vp);
4128 
4129         /* update the abs. cwd/root path; see c2/audit.c */
4130         if (AU_AUDITING())
4131                 audit_chdirec(vp, vpp);
4132 
4133         mutex_enter(&pp->p_lock);
4134         oldvp = *vpp;
4135         *vpp = vp;
4136         mutex_exit(&pp->p_lock);
4137         if (oldvp != NULL)
4138                 VN_RELE(oldvp);
4139 }
4140 
4141 /*
4142  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
4143  */
4144 static int
4145 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
4146 {
4147         nvpair_t *nvp = NULL;
4148         boolean_t priv_set = B_FALSE;
4149         boolean_t limit_set = B_FALSE;
4150         boolean_t action_set = B_FALSE;
4151 
4152         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4153                 const char *name;
4154                 uint64_t ui64;
4155 
4156                 name = nvpair_name(nvp);
4157                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
4158                         return (EINVAL);
4159                 (void) nvpair_value_uint64(nvp, &ui64);
4160                 if (strcmp(name, "privilege") == 0) {
4161                         /*
4162                          * Currently only privileged values are allowed, but
4163                          * this may change in the future.
4164                          */
4165                         if (ui64 != RCPRIV_PRIVILEGED)
4166                                 return (EINVAL);
4167                         rv->rcv_privilege = ui64;
4168                         priv_set = B_TRUE;
4169                 } else if (strcmp(name, "limit") == 0) {
4170                         rv->rcv_value = ui64;
4171                         limit_set = B_TRUE;
4172                 } else if (strcmp(name, "action") == 0) {
4173                         if (ui64 != RCTL_LOCAL_NOACTION &&
4174                             ui64 != RCTL_LOCAL_DENY)
4175                                 return (EINVAL);
4176                         rv->rcv_flagaction = ui64;
4177                         action_set = B_TRUE;
4178                 } else {
4179                         return (EINVAL);
4180                 }
4181         }
4182 
4183         if (!(priv_set && limit_set && action_set))
4184                 return (EINVAL);
4185         rv->rcv_action_signal = 0;
4186         rv->rcv_action_recipient = NULL;
4187         rv->rcv_action_recip_pid = -1;
4188         rv->rcv_firing_time = 0;
4189 
4190         return (0);
4191 }
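     /*
      * The nvlist is expected to carry exactly the three uint64 pairs checked
      * above; a hypothetical construction would look like:
      *
      *         nvlist_t *nvl;
      *
      *         VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
      *         VERIFY(nvlist_add_uint64(nvl, "privilege",
      *             RCPRIV_PRIVILEGED) == 0);
      *         VERIFY(nvlist_add_uint64(nvl, "limit", 100) == 0);
      *         VERIFY(nvlist_add_uint64(nvl, "action",
      *             RCTL_LOCAL_DENY) == 0);
      */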
4192 
4193 /*
4194  * Non-global zone version of start_init.
4195  */
4196 void
4197 zone_start_init(void)
4198 {
4199         proc_t *p = ttoproc(curthread);
4200         zone_t *z = p->p_zone;
4201 
4202         ASSERT(!INGLOBALZONE(curproc));
4203 
4204         /*
4205          * For all purposes (ZONE_ATTR_INITPID and restart_init),
4206          * storing just the pid of init is sufficient.
4207          */
4208         z->zone_proc_initpid = p->p_pid;
4209 
4210         if (z->zone_setup_app_contract == B_TRUE) {
4211                 /*
4212                  * Normally a process cannot modify its own contract, but we're
4213                  * just starting the zone's init process and its contract is
4214                  * always initialized from the sys_process_tmpl template, so
4215                  * this is the simplest way to setup init's contract to kill
4216                  * the process if any other process in the contract exits.
4217                  */
4218                 p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
4219         }
4220 
4221         /*
4222          * We maintain zone_boot_err so that we can return the cause of the
4223          * failure back to the caller of the zone_boot syscall.
4224          */
4225         p->p_zone->zone_boot_err = start_init_common();
4226 
4227         /*
4228          * We will prevent booting zones from becoming running zones if the
4229          * global zone is shutting down.
4230          */
4231         mutex_enter(&zone_status_lock);
4232         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
4233             ZONE_IS_SHUTTING_DOWN) {
4234                 /*
4235                  * Make sure we are still in the booting state-- we could have
4236                  * raced and already be shutting down, or even further along.
4237                  */
4238                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
4239                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
4240                 }
4241                 mutex_exit(&zone_status_lock);
4242                 /* It's gone bad, dispose of the process */
4243                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
4244                         mutex_enter(&p->p_lock);
4245                         ASSERT(p->p_flag & SEXITLWPS);
4246                         lwp_exit();
4247                 }
4248         } else {
4249                 id_t cid = curthread->t_cid;
4250 
4251                 if (zone_status_get(z) == ZONE_IS_BOOTING)
4252                         zone_status_set(z, ZONE_IS_RUNNING);
4253                 mutex_exit(&zone_status_lock);
4254 
4255                 mutex_enter(&class_lock);
4256                 ASSERT(cid < loaded_classes);
4257                 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
4258                     z->zone_fixed_hipri) {
4259                         /*
4260                          * If the zone is using FX then by default all
4261                          * processes start at the lowest priority and stay
4262                          * there. We provide a mechanism for the zone to
4263                          * indicate that it should run at "high priority". In
4264                          * this case we setup init to run at the highest FX
4265                          * priority (which is one level higher than the
4266                          * non-fixed scheduling classes can use).
4267                          */
4268                         pcparms_t pcparms;
4269 
4270                         pcparms.pc_cid = cid;
4271                         ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
4272                         ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
4273                             FXMAXUPRI;
4274                         ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
4275                             FX_DOUPRILIM | FX_DOUPRI;
4276 
4277                         mutex_enter(&pidlock);
4278                         mutex_enter(&curproc->p_lock);
4279 
4280                         (void) parmsset(&pcparms, curthread);
4281 
4282                         mutex_exit(&curproc->p_lock);
4283                         mutex_exit(&pidlock);
4284                 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
4285                         /*
4286                          * zsched always starts the init lwp at priority
4287                          * minclsyspri - 1. This priority gets set in t_pri and
4288                          * is invalid for RT, but RT never uses t_pri. However
4289                          * t_pri is used by procfs, so we always see processes
4290                          * within an RT zone with an invalid priority value.
4291                          * We fix that up now.
4292                          */
4293                         curthread->t_pri = RTGPPRIO0;
4294                 }
4295                 mutex_exit(&class_lock);
4296 
4297                 /* cause the process to return to userland. */
4298                 lwp_rtt();
4299         }
4300 }
4301 
4302 struct zsched_arg {
4303         zone_t *zone;
4304         nvlist_t *nvlist;
4305 };
4306 
4307 /*
4308  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
4309  * anything to do with scheduling, but rather with the fact that
4310  * per-zone kernel threads are parented to zsched, just like regular
4311  * kernel threads are parented to sched (p0).
4312  *
4313  * zsched is also responsible for launching init for the zone.
4314  */
4315 static void
4316 zsched(void *arg)
4317 {
4318         struct zsched_arg *za = arg;
4319         proc_t *pp = curproc;
4320         proc_t *initp = proc_init;
4321         zone_t *zone = za->zone;
4322         cred_t *cr, *oldcred;
4323         rctl_set_t *set;
4324         rctl_alloc_gp_t *gp;
4325         contract_t *ct = NULL;
4326         task_t *tk, *oldtk;
4327         rctl_entity_p_t e;
4328         kproject_t *pj;
4329 
4330         nvlist_t *nvl = za->nvlist;
4331         nvpair_t *nvp = NULL;
4332 
4333         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4334         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4335         PTOU(pp)->u_argc = 0;
4336         PTOU(pp)->u_argv = NULL;
4337         PTOU(pp)->u_envp = NULL;
4338         PTOU(pp)->u_commpagep = NULL;
4339         closeall(P_FINFO(pp));
4340 
4341         /*
4342          * We are this zone's "zsched" process.  As the zone isn't generally
4343          * visible yet we don't need to grab any locks before initializing its
4344          * zone_proc pointer.
4345          */
4346         zone_hold(zone);  /* this hold is released by zone_destroy() */
4347         zone->zone_zsched = pp;
4348         mutex_enter(&pp->p_lock);
4349         pp->p_zone = zone;
4350         mutex_exit(&pp->p_lock);
4351 
4352         /*
4353          * Disassociate process from its 'parent'; parent ourselves to init
4354          * (pid 1) and change other values as needed.
4355          */
4356         sess_create();
4357 
4358         mutex_enter(&pidlock);
4359         proc_detach(pp);
4360         pp->p_ppid = 1;
4361         pp->p_flag |= SZONETOP;
4362         pp->p_ancpid = 1;
4363         pp->p_parent = initp;
4364         pp->p_psibling = NULL;
4365         if (initp->p_child)
4366                 initp->p_child->p_psibling = pp;
4367         pp->p_sibling = initp->p_child;
4368         initp->p_child = pp;
4369 
4370         /* Decrement what newproc() incremented. */
4371         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
4372         /*
4373          * Our credentials are about to become kcred-like, so we don't care
4374          * about the caller's ruid.
4375          */
4376         upcount_inc(crgetruid(kcred), zone->zone_id);
4377         mutex_exit(&pidlock);
4378 
4379         /*
4380          * getting out of global zone, so decrement lwp and process counts
4381          */
4382         pj = pp->p_task->tk_proj;
4383         mutex_enter(&global_zone->zone_nlwps_lock);
4384         pj->kpj_nlwps -= pp->p_lwpcnt;
4385         global_zone->zone_nlwps -= pp->p_lwpcnt;
4386         pj->kpj_nprocs--;
4387         global_zone->zone_nprocs--;
4388         mutex_exit(&global_zone->zone_nlwps_lock);
4389 
4390         /*
4391          * Decrement locked memory counts on old zone and project.
4392          */
4393         mutex_enter(&global_zone->zone_mem_lock);
4394         global_zone->zone_locked_mem -= pp->p_locked_mem;
4395         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4396         mutex_exit(&global_zone->zone_mem_lock);
4397 
4398         /*
4399          * Create and join a new task in project '0' of this zone.
4400          *
4401          * We don't need to call holdlwps() since we know we're the only lwp in
4402          * this process.
4403          *
4404          * task_join() returns with p_lock held.
4405          */
4406         tk = task_create(0, zone);
4407         mutex_enter(&cpu_lock);
4408         oldtk = task_join(tk, 0);
4409 
4410         pj = pp->p_task->tk_proj;
4411 
4412         mutex_enter(&zone->zone_mem_lock);
4413         zone->zone_locked_mem += pp->p_locked_mem;
4414         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4415         mutex_exit(&zone->zone_mem_lock);
4416 
4417         /*
4418          * add lwp and process counts to zsched's zone, and increment
4419          * project's task and process count due to the task created in
4420          * the above task_create.
4421          */
4422         mutex_enter(&zone->zone_nlwps_lock);
4423         pj->kpj_nlwps += pp->p_lwpcnt;
4424         pj->kpj_ntasks += 1;
4425         zone->zone_nlwps += pp->p_lwpcnt;
4426         pj->kpj_nprocs++;
4427         zone->zone_nprocs++;
4428         mutex_exit(&zone->zone_nlwps_lock);
4429 
4430         mutex_exit(&curproc->p_lock);
4431         mutex_exit(&cpu_lock);
4432         task_rele(oldtk);
4433 
4434         /*
4435          * The process was created by a process in the global zone, hence the
4436          * credentials are wrong.  We might as well have kcred-ish credentials.
4437          */
4438         cr = zone->zone_kcred;
4439         crhold(cr);
4440         mutex_enter(&pp->p_crlock);
4441         oldcred = pp->p_cred;
4442         pp->p_cred = cr;
4443         mutex_exit(&pp->p_crlock);
4444         crfree(oldcred);
4445 
4446         /*
4447          * Hold credentials again (for thread)
4448          */
4449         crhold(cr);
4450 
4451         /*
4452          * p_lwpcnt can't change since this is a kernel process.
4453          */
4454         crset(pp, cr);
4455 
4456         /*
4457          * Chroot
4458          */
4459         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
4460         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
4461 
4462         /*
4463          * Initialize zone's rctl set.
4464          */
4465         set = rctl_set_create();
4466         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
4467         mutex_enter(&pp->p_lock);
4468         e.rcep_p.zone = zone;
4469         e.rcep_t = RCENTITY_ZONE;
4470         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
4471         mutex_exit(&pp->p_lock);
4472         rctl_prealloc_destroy(gp);
4473 
4474         /*
4475          * Apply the rctls passed in to zone_create().  This is basically a list
4476          * assignment: all of the old values are removed and the new ones
4477          * inserted.  That is, if an empty list is passed in, all values are
4478          * removed.
4479          */
4480         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4481                 rctl_dict_entry_t *rde;
4482                 rctl_hndl_t hndl;
4483                 char *name;
4484                 nvlist_t **nvlarray;
4485                 uint_t i, nelem;
4486                 int error;      /* For ASSERT()s */
4487 
4488                 name = nvpair_name(nvp);
4489                 hndl = rctl_hndl_lookup(name);
4490                 ASSERT(hndl != -1);
4491                 rde = rctl_dict_lookup_hndl(hndl);
4492                 ASSERT(rde != NULL);
4493 
4494                 for (; /* ever */; ) {
4495                         rctl_val_t oval;
4496 
4497                         mutex_enter(&pp->p_lock);
4498                         error = rctl_local_get(hndl, NULL, &oval, pp);
4499                         mutex_exit(&pp->p_lock);
4500                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4501                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4502                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4503                                 break;
4504                         mutex_enter(&pp->p_lock);
4505                         error = rctl_local_delete(hndl, &oval, pp);
4506                         mutex_exit(&pp->p_lock);
4507                         ASSERT(error == 0);
4508                 }
4509                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4510                 ASSERT(error == 0);
4511                 for (i = 0; i < nelem; i++) {
4512                         rctl_val_t *nvalp;
4513 
4514                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4515                         error = nvlist2rctlval(nvlarray[i], nvalp);
4516                         ASSERT(error == 0);
4517                         /*
4518                          * rctl_local_insert can fail if the value being
4519                          * inserted is a duplicate; this is OK.
4520                          */
4521                         mutex_enter(&pp->p_lock);
4522                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4523                                 kmem_cache_free(rctl_val_cache, nvalp);
4524                         mutex_exit(&pp->p_lock);
4525                 }
4526         }
4527         /*
4528          * Tell the world that we're done setting up.
4529          *
4530          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4531          * and atomically set the zone's processor set visibility.  Once
4532          * we drop pool_lock() this zone will automatically get updated
4533          * to reflect any future changes to the pools configuration.
4534          *
4535          * Note that after we drop the locks below (zonehash_lock in
4536          * particular) other operations such as a zone_getattr call can
4537          * now proceed and observe the zone. That is the reason for doing a
4538          * state transition to the INITIALIZED state.
4539          */
4540         pool_lock();
4541         mutex_enter(&cpu_lock);
4542         mutex_enter(&zonehash_lock);
4543         zone_uniqid(zone);
4544         zone_zsd_configure(zone);
4545         if (pool_state == POOL_ENABLED)
4546                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4547         mutex_enter(&zone_status_lock);
4548         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4549         zone_status_set(zone, ZONE_IS_INITIALIZED);
4550         mutex_exit(&zone_status_lock);
4551         mutex_exit(&zonehash_lock);
4552         mutex_exit(&cpu_lock);
4553         pool_unlock();
4554 
	/* Now call the create callbacks for all ZSD keys */
4556         zsd_apply_all_keys(zsd_apply_create, zone);
4557 
4558         /* The callbacks are complete. Mark ZONE_IS_READY */
4559         mutex_enter(&zone_status_lock);
4560         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4561         zone_status_set(zone, ZONE_IS_READY);
4562         mutex_exit(&zone_status_lock);
4563 
4564         /*
4565          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4566          * we launch init, and set the state to running.
4567          */
4568         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4569 
4570         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4571                 id_t cid;
4572 
4573                 /*
4574                  * Ok, this is a little complicated.  We need to grab the
4575                  * zone's pool's scheduling class ID; note that by now, we
4576                  * are already bound to a pool if we need to be (zoneadmd
4577                  * will have done that to us while we're in the READY
4578                  * state).  *But* the scheduling class for the zone's 'init'
4579                  * must be explicitly passed to newproc, which doesn't
4580                  * respect pool bindings.
4581                  *
4582                  * We hold the pool_lock across the call to newproc() to
4583                  * close the obvious race: the pool's scheduling class
4584                  * could change before we manage to create the LWP with
4585                  * classid 'cid'.
4586                  */
4587                 pool_lock();
4588                 if (zone->zone_defaultcid > 0)
4589                         cid = zone->zone_defaultcid;
4590                 else
4591                         cid = pool_get_class(zone->zone_pool);
4592                 if (cid == -1)
4593                         cid = defaultcid;
4594 
		/*
		 * If this fails, zone_boot() will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN; userland
		 * will have to tear down the zone and either fail or try
		 * again.
		 */
4600                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4601                     minclsyspri - 1, &ct, 0)) != 0) {
4602                         mutex_enter(&zone_status_lock);
4603                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4604                         mutex_exit(&zone_status_lock);
4605                 } else {
4606                         zone->zone_boot_time = gethrestime_sec();
4607                 }
4608 
4609                 pool_unlock();
4610         }
4611 
4612         /*
4613          * Wait for zone_destroy() to be called.  This is what we spend
4614          * most of our life doing.
4615          */
4616         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4617 
4618         if (ct)
4619                 /*
4620                  * At this point the process contract should be empty.
4621                  * (Though if it isn't, it's not the end of the world.)
4622                  */
4623                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4624 
4625         /*
4626          * Allow kcred to be freed when all referring processes
4627          * (including this one) go away.  We can't just do this in
4628          * zone_free because we need to wait for the zone_cred_ref to
4629          * drop to 0 before calling zone_free, and the existence of
4630          * zone_kcred will prevent that.  Thus, we call crfree here to
4631          * balance the crdup in zone_create.  The crhold calls earlier
4632          * in zsched will be dropped when the thread and process exit.
4633          */
4634         crfree(zone->zone_kcred);
4635         zone->zone_kcred = NULL;
4636 
4637         exit(CLD_EXITED, 0);
4638 }
4639 
4640 /*
4641  * Helper function to determine if there are any submounts of the
4642  * provided path.  Used to make sure the zone doesn't "inherit" any
4643  * mounts from before it is created.
4644  */
4645 static uint_t
4646 zone_mount_count(const char *rootpath)
4647 {
4648         vfs_t *vfsp;
4649         uint_t count = 0;
4650         size_t rootpathlen = strlen(rootpath);
4651 
4652         /*
4653          * Holding zonehash_lock prevents race conditions with
4654          * vfs_list_add()/vfs_list_remove() since we serialize with
4655          * zone_find_by_path().
4656          */
4657         ASSERT(MUTEX_HELD(&zonehash_lock));
4658         /*
4659          * The rootpath must end with a '/'
4660          */
4661         ASSERT(rootpath[rootpathlen - 1] == '/');
4662 
4663         /*
4664          * This intentionally does not count the rootpath itself if that
4665          * happens to be a mount point.
4666          */
4667         vfs_list_read_lock();
4668         vfsp = rootvfs;
4669         do {
4670                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4671                     rootpathlen) == 0)
4672                         count++;
4673                 vfsp = vfsp->vfs_next;
4674         } while (vfsp != rootvfs);
4675         vfs_list_unlock();
4676         return (count);
4677 }
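
/*
 * For illustration: with a rootpath of "/zones/foo/root/", a mount at
 * "/zones/foo/root/proc" matches the prefix comparison above and is
 * counted, while a mount point of exactly "/zones/foo/root" is not;
 * the trailing '/' in rootpath fails to match the terminating NUL of
 * the bare mount point string.
 */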
4678 
4679 /*
4680  * Helper function to make sure that a zone created on 'rootpath'
4681  * wouldn't end up containing other zones' rootpaths.
4682  */
4683 static boolean_t
4684 zone_is_nested(const char *rootpath)
4685 {
4686         zone_t *zone;
4687         size_t rootpathlen = strlen(rootpath);
4688         size_t len;
4689 
4690         ASSERT(MUTEX_HELD(&zonehash_lock));
4691 
4692         /*
4693          * zone_set_root() appended '/' and '\0' at the end of rootpath
4694          */
4695         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4696             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4697                 return (B_TRUE);
4698 
4699         for (zone = list_head(&zone_active); zone != NULL;
4700             zone = list_next(&zone_active, zone)) {
4701                 if (zone == global_zone)
4702                         continue;
4703                 len = strlen(zone->zone_rootpath);
4704                 if (strncmp(rootpath, zone->zone_rootpath,
4705                     MIN(rootpathlen, len)) == 0)
4706                         return (B_TRUE);
4707         }
4708         return (B_FALSE);
4709 }
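
/*
 * For illustration: with an existing zone rooted at "/zones/a/b/", an
 * attempt to create a zone rooted at "/zones/a/" or at "/zones/a/b/c/"
 * fails the check above, since the MIN()-bounded strncmp() treats
 * either rootpath being a prefix of the other as nesting.
 */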
4710 
4711 static int
4712 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4713     size_t zone_privssz)
4714 {
4715         priv_set_t *privs;
4716 
4717         if (zone_privssz < sizeof (priv_set_t))
4718                 return (ENOMEM);
4719 
4720         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4721 
4722         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4723                 kmem_free(privs, sizeof (priv_set_t));
4724                 return (EFAULT);
4725         }
4726 
4727         zone->zone_privset = privs;
4728         return (0);
4729 }
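
/*
 * Note that a zone_privssz larger than the kernel's priv_set_t is
 * accepted by zone_set_privset(); only the leading sizeof (priv_set_t)
 * bytes are copied in.
 */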
4730 
4731 /*
4732  * We make creative use of nvlists to pass in rctls from userland.  The list is
4733  * a list of the following structures:
4734  *
4735  * (name = rctl_name, value = nvpair_list_array)
4736  *
4737  * Where each element of the nvpair_list_array is of the form:
4738  *
4739  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4740  *      (name = "limit", value = uint64_t),
 *	(name = "action", value = RCTL_LOCAL_NOACTION or RCTL_LOCAL_DENY)]
4742  */
4743 static int
4744 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4745 {
4746         nvpair_t *nvp = NULL;
4747         nvlist_t *nvl = NULL;
4748         char *kbuf;
4749         int error;
4750         rctl_val_t rv;
4751 
4752         *nvlp = NULL;
4753 
4754         if (buflen == 0)
4755                 return (0);
4756 
4757         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4758                 return (ENOMEM);
4759         if (copyin(ubuf, kbuf, buflen)) {
4760                 error = EFAULT;
4761                 goto out;
4762         }
4763         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
		/*
		 * nvlist_unpack() may have allocated and freed nvl while
		 * leaving the pointer set to non-NULL, so we reset it here.
		 */
4768                 nvl = NULL;
4769                 error = EINVAL;
4770                 goto out;
4771         }
4772         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4773                 rctl_dict_entry_t *rde;
4774                 rctl_hndl_t hndl;
4775                 nvlist_t **nvlarray;
4776                 uint_t i, nelem;
4777                 char *name;
4778 
4779                 error = EINVAL;
4780                 name = nvpair_name(nvp);
4781                 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4782                     strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4783                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4784                         goto out;
4785                 }
4786                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4787                         goto out;
4788                 }
4789                 rde = rctl_dict_lookup_hndl(hndl);
4790                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4791                 ASSERT(error == 0);
4792                 for (i = 0; i < nelem; i++) {
			if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
4794                                 goto out;
4795                 }
4796                 if (rctl_invalid_value(rde, &rv)) {
4797                         error = EINVAL;
4798                         goto out;
4799                 }
4800         }
4801         error = 0;
4802         *nvlp = nvl;
4803 out:
4804         kmem_free(kbuf, buflen);
4805         if (error && nvl != NULL)
4806                 nvlist_free(nvl);
4807         return (error);
4808 }
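
/*
 * As a minimal sketch of the other side of this interface (illustrative
 * only; the real encoding is done in userland by zoneadmd and libzonecfg,
 * and the uint64 field types assumed here are those implied by the format
 * comment above):
 *
 *	nvlist_t *nvl, *val;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 100);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_NOACTION);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &val, 1);
 *	(void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *
 * The resulting buf/buflen pair is what arrives here as ubuf/buflen.
 */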
4809 
4810 int
4811 zone_create_error(int er_error, int er_ext, int *er_out)
4812 {
4813         if (er_out != NULL) {
4814                 if (copyout(&er_ext, er_out, sizeof (int))) {
4815                         return (set_errno(EFAULT));
4816                 }
4817         }
4818         return (set_errno(er_error));
4819 }
4820 
4821 static int
4822 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4823 {
4824         ts_label_t *tsl;
4825         bslabel_t blab;
4826 
4827         /* Get label from user */
4828         if (copyin(lab, &blab, sizeof (blab)) != 0)
4829                 return (EFAULT);
4830         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4831         if (tsl == NULL)
4832                 return (ENOMEM);
4833 
4834         zone->zone_slabel = tsl;
4835         return (0);
4836 }
4837 
4838 /*
 * Parses a comma-separated list of ZFS datasets into the per-zone
 * zone_datasets list.
4840  */
4841 static int
4842 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4843 {
4844         char *kbuf;
4845         char *dataset, *next;
4846         zone_dataset_t *zd;
4847         size_t len;
4848 
4849         if (ubuf == NULL || buflen == 0)
4850                 return (0);
4851 
4852         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4853                 return (ENOMEM);
4854 
4855         if (copyin(ubuf, kbuf, buflen) != 0) {
4856                 kmem_free(kbuf, buflen);
4857                 return (EFAULT);
4858         }
4859 
4860         dataset = next = kbuf;
4861         for (;;) {
4862                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4863 
4864                 next = strchr(dataset, ',');
4865 
4866                 if (next == NULL)
4867                         len = strlen(dataset);
4868                 else
4869                         len = next - dataset;
4870 
4871                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4872                 bcopy(dataset, zd->zd_dataset, len);
4873                 zd->zd_dataset[len] = '\0';
4874 
4875                 list_insert_head(&zone->zone_datasets, zd);
4876 
4877                 if (next == NULL)
4878                         break;
4879 
4880                 dataset = next + 1;
4881         }
4882 
4883         kmem_free(kbuf, buflen);
4884         return (0);
4885 }
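
/*
 * For illustration: a buffer containing "tank/vol1,tank/vol2" produces
 * two zone_dataset_t entries.  Since each entry is inserted at the head,
 * the zone_datasets list ends up ordered "tank/vol2", "tank/vol1".
 */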
4886 
4887 /*
4888  * System call to create/initialize a new zone named 'zone_name', rooted
4889  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4890  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4891  * with labeling set by 'match', 'doi', and 'label'.
4892  *
4893  * If extended error is non-null, we may use it to return more detailed
4894  * error information.
4895  */
4896 static zoneid_t
4897 zone_create(const char *zone_name, const char *zone_root,
4898     const priv_set_t *zone_privs, size_t zone_privssz,
4899     caddr_t rctlbuf, size_t rctlbufsz,
4900     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4901     int match, uint32_t doi, const bslabel_t *label,
4902     int flags, zoneid_t zone_did)
4903 {
4904         struct zsched_arg zarg;
4905         nvlist_t *rctls = NULL;
4906         proc_t *pp = curproc;
4907         zone_t *zone, *ztmp;
4908         zoneid_t zoneid, start = GLOBAL_ZONEID;
4909         int error;
4910         int error2 = 0;
4911         char *str;
4912         cred_t *zkcr;
4913         boolean_t insert_label_hash;
4914 
4915         if (secpolicy_zone_config(CRED()) != 0)
4916                 return (set_errno(EPERM));
4917 
	/* can't create a zone from within a chroot environment */
4919         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4920                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4921                     extended_error));
4922 
4923         /*
4924          * As the first step of zone creation, we want to allocate a zoneid.
4925          * This allocation is complicated by the fact that netstacks use the
4926          * zoneid to determine their stackid, but netstacks themselves are
4927          * freed asynchronously with respect to zone destruction.  This means
4928          * that a netstack reference leak (or in principle, an extraordinarily
4929          * long netstack reference hold) could result in a zoneid being
4930          * allocated that in fact corresponds to a stackid from an active
4931          * (referenced) netstack -- unleashing all sorts of havoc when that
4932          * netstack is actually (re)used.  (In the abstract, we might wish a
4933          * zoneid to not be deallocated until its last referencing netstack
4934          * has been released, but netstacks lack a backpointer into their
4935          * referencing zone -- and changing them to have such a pointer would
4936          * be substantial, to put it euphemistically.)  To avoid this, we
4937          * detect this condition on allocation: if we have allocated a zoneid
4938          * that corresponds to a netstack that's still in use, we warn about
4939          * it (as it is much more likely to be a reference leak than an actual
4940          * netstack reference), free it, and allocate another.  That these
	 * identifiers are allocated out of an ID space assures that we won't
4942          * see the identifier we just allocated.
4943          */
4944         for (;;) {
4945                 zoneid = id_alloc(zoneid_space);
4946 
4947                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4948                         break;
4949 
4950                 id_free(zoneid_space, zoneid);
4951 
4952                 if (start == GLOBAL_ZONEID) {
4953                         start = zoneid;
4954                 } else if (zoneid == start) {
4955                         /*
4956                          * We have managed to iterate over the entire available
4957                          * zoneid space -- there are no identifiers available,
4958                          * presumably due to some number of leaked netstack
			 * references.  While it's in principle possible for us
			 * to continue to try, it seems wiser to give up at
			 * this point, warn, and fail explicitly with a
			 * distinctive error.
4963                          */
4964                         cmn_err(CE_WARN, "zone_create() failed: all available "
4965                             "zone IDs have netstacks still in use");
4966                         return (set_errno(ENFILE));
4967                 }
4968 
4969                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4970                     "netstack still in use", zoneid);
4971         }
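
	/*
	 * At this point, zoneid names no netstack that is still in use.
	 * The 'start' bookkeeping above works because the underlying
	 * id_space hands out identifiers cyclically (next-fit): the first
	 * rejected identifier is remembered, and seeing it a second time
	 * means the entire ID space has been tried.
	 */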
4972 
4973         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4974         zone->zone_id = zoneid;
4975         zone->zone_did = zone_did;
4976         zone->zone_status = ZONE_IS_UNINITIALIZED;
4977         zone->zone_pool = pool_default;
4978         zone->zone_pool_mod = gethrtime();
4979         zone->zone_psetid = ZONE_PS_INVAL;
4980         zone->zone_ncpus = 0;
4981         zone->zone_ncpus_online = 0;
4982         zone->zone_restart_init = B_TRUE;
4983         zone->zone_reboot_on_init_exit = B_FALSE;
4984         zone->zone_init_status = -1;
4985         zone->zone_brand = &native_brand;
4986         zone->zone_initname = NULL;
4987         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4988         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4989         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4990         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4991         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4992             offsetof(zone_ref_t, zref_linkage));
4993         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4994             offsetof(struct zsd_entry, zsd_linkage));
4995         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4996             offsetof(zone_dataset_t, zd_linkage));
4997         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4998             offsetof(zone_dl_t, zdl_linkage));
4999         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
5000         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
5001 
5002         if (flags & ZCF_NET_EXCL) {
5003                 zone->zone_flags |= ZF_NET_EXCL;
5004         }
5005 
5006         if ((error = zone_set_name(zone, zone_name)) != 0) {
5007                 zone_free(zone);
5008                 return (zone_create_error(error, 0, extended_error));
5009         }
5010 
5011         if ((error = zone_set_root(zone, zone_root)) != 0) {
5012                 zone_free(zone);
5013                 return (zone_create_error(error, 0, extended_error));
5014         }
5015         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
5016                 zone_free(zone);
5017                 return (zone_create_error(error, 0, extended_error));
5018         }
5019 
5020         /* initialize node name to be the same as zone name */
5021         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5022         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
5023         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
5024 
5025         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5026         zone->zone_domain[0] = '\0';
5027         zone->zone_hostid = HW_INVALID_HOSTID;
5028         zone->zone_shares = 1;
5029         zone->zone_shmmax = 0;
5030         zone->zone_ipc.ipcq_shmmni = 0;
5031         zone->zone_ipc.ipcq_semmni = 0;
5032         zone->zone_ipc.ipcq_msgmni = 0;
5033         zone->zone_bootargs = NULL;
5034         zone->zone_fs_allowed = NULL;
5035         zone->zone_initname =
5036             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
5037         (void) strcpy(zone->zone_initname, zone_default_initname);
5038         zone->zone_nlwps = 0;
5039         zone->zone_nlwps_ctl = INT_MAX;
5040         zone->zone_nprocs = 0;
5041         zone->zone_nprocs_ctl = INT_MAX;
5042         zone->zone_locked_mem = 0;
5043         zone->zone_locked_mem_ctl = UINT64_MAX;
5044         zone->zone_max_swap = 0;
5045         zone->zone_max_swap_ctl = UINT64_MAX;
5046         zone->zone_phys_mem = 0;
5047         zone->zone_phys_mem_ctl = UINT64_MAX;
5048         zone->zone_max_lofi = 0;
5049         zone->zone_max_lofi_ctl = UINT64_MAX;
5050         zone->zone_lockedmem_kstat = NULL;
5051         zone->zone_swapresv_kstat = NULL;
5052         zone->zone_physmem_kstat = NULL;
5053         zone->zone_zfs_io_pri = 1;
5054 
5055         /*
5056          * Zsched initializes the rctls.
5057          */
5058         zone->zone_rctls = NULL;
5059 
5060         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
5061                 zone_free(zone);
5062                 return (zone_create_error(error, 0, extended_error));
5063         }
5064 
5065         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
5066                 zone_free(zone);
5067                 return (set_errno(error));
5068         }
5069 
5070         /*
5071          * Read in the trusted system parameters:
5072          * match flag and sensitivity label.
5073          */
5074         zone->zone_match = match;
5075         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5076                 /* Fail if requested to set doi to anything but system's doi */
5077                 if (doi != 0 && doi != default_doi) {
5078                         zone_free(zone);
5079                         return (set_errno(EINVAL));
5080                 }
5081                 /* Always apply system's doi to the zone */
5082                 error = zone_set_label(zone, label, default_doi);
5083                 if (error != 0) {
5084                         zone_free(zone);
5085                         return (set_errno(error));
5086                 }
5087                 insert_label_hash = B_TRUE;
5088         } else {
5089                 /* all zones get an admin_low label if system is not labeled */
5090                 zone->zone_slabel = l_admin_low;
5091                 label_hold(l_admin_low);
5092                 insert_label_hash = B_FALSE;
5093         }
5094 
5095         /*
5096          * Stop all lwps since that's what normally happens as part of fork().
5097          * This needs to happen before we grab any locks to avoid deadlock
5098          * (another lwp in the process could be waiting for the held lock).
5099          */
5100         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
5101                 zone_free(zone);
5102                 nvlist_free(rctls);
		return (zone_create_error(EINTR, 0, extended_error));
5104         }
5105 
5106         if (block_mounts(zone) == 0) {
5107                 mutex_enter(&pp->p_lock);
5108                 if (curthread != pp->p_agenttp)
5109                         continuelwps(pp);
5110                 mutex_exit(&pp->p_lock);
5111                 zone_free(zone);
5112                 nvlist_free(rctls);
		return (zone_create_error(EINTR, 0, extended_error));
5114         }
5115 
5116         /*
5117          * Set up credential for kernel access.  After this, any errors
5118          * should go through the dance in errout rather than calling
5119          * zone_free directly.
5120          */
5121         zone->zone_kcred = crdup(kcred);
5122         crsetzone(zone->zone_kcred, zone);
5123         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
5124         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
5125         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
5126         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
5127 
5128         mutex_enter(&zonehash_lock);
5129         /*
5130          * Make sure zone doesn't already exist.
5131          *
5132          * If the system and zone are labeled,
5133          * make sure no other zone exists that has the same label.
5134          */
5135         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
5136             (insert_label_hash &&
5137             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
5138                 zone_status_t status;
5139 
5140                 status = zone_status_get(ztmp);
5141                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
5142                         error = EEXIST;
5143                 else
5144                         error = EBUSY;
5145 
5146                 if (insert_label_hash)
5147                         error2 = ZE_LABELINUSE;
5148 
5149                 goto errout;
5150         }
5151 
5152         /*
5153          * Don't allow zone creations which would cause one zone's rootpath to
5154          * be accessible from that of another (non-global) zone.
5155          */
5156         if (zone_is_nested(zone->zone_rootpath)) {
5157                 error = EBUSY;
5158                 goto errout;
5159         }
5160 
5161         ASSERT(zonecount != 0);         /* check for leaks */
5162         if (zonecount + 1 > maxzones) {
5163                 error = ENOMEM;
5164                 goto errout;
5165         }
5166 
5167         if (zone_mount_count(zone->zone_rootpath) != 0) {
5168                 error = EBUSY;
5169                 error2 = ZE_AREMOUNTS;
5170                 goto errout;
5171         }
5172 
5173         /*
5174          * Zone is still incomplete, but we need to drop all locks while
5175          * zsched() initializes this zone's kernel process.  We
5176          * optimistically add the zone to the hashtable and associated
5177          * lists so a parallel zone_create() doesn't try to create the
5178          * same zone.
5179          */
5180         zonecount++;
5181         (void) mod_hash_insert(zonehashbyid,
5182             (mod_hash_key_t)(uintptr_t)zone->zone_id,
5183             (mod_hash_val_t)(uintptr_t)zone);
5184         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
5185         (void) strcpy(str, zone->zone_name);
5186         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
5187             (mod_hash_val_t)(uintptr_t)zone);
5188         if (insert_label_hash) {
5189                 (void) mod_hash_insert(zonehashbylabel,
5190                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5191                 zone->zone_flags |= ZF_HASHED_LABEL;
5192         }
5193 
5194         /*
5195          * Insert into active list.  At this point there are no 'hold's
5196          * on the zone, but everyone else knows not to use it, so we can
5197          * continue to use it.  zsched() will do a zone_hold() if the
5198          * newproc() is successful.
5199          */
5200         list_insert_tail(&zone_active, zone);
5201         mutex_exit(&zonehash_lock);
5202 
5203         zarg.zone = zone;
5204         zarg.nvlist = rctls;
5205         /*
5206          * The process, task, and project rctls are probably wrong;
5207          * we need an interface to get the default values of all rctls,
5208          * and initialize zsched appropriately. However, we allow zoneadmd
5209          * to pass down both zone and project rctls for the zone's init.
5210          */
5211         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5212         if (error != 0) {
5213                 /*
5214                  * We need to undo all globally visible state.
5215                  */
5216                 mutex_enter(&zonehash_lock);
5217                 list_remove(&zone_active, zone);
5218                 if (zone->zone_flags & ZF_HASHED_LABEL) {
5219                         ASSERT(zone->zone_slabel != NULL);
5220                         (void) mod_hash_destroy(zonehashbylabel,
5221                             (mod_hash_key_t)zone->zone_slabel);
5222                 }
5223                 (void) mod_hash_destroy(zonehashbyname,
5224                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
5225                 (void) mod_hash_destroy(zonehashbyid,
5226                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
5227                 ASSERT(zonecount > 1);
5228                 zonecount--;
5229                 goto errout;
5230         }
5231 
5232         /*
5233          * Zone creation can't fail from now on.
5234          */
5235 
5236         /*
5237          * Create zone kstats
5238          */
5239         zone_kstat_create(zone);
5240 
5241         /*
5242          * Let the other lwps continue.
5243          */
5244         mutex_enter(&pp->p_lock);
5245         if (curthread != pp->p_agenttp)
5246                 continuelwps(pp);
5247         mutex_exit(&pp->p_lock);
5248 
5249         /*
5250          * Wait for zsched to finish initializing the zone.
5251          */
5252         zone_status_wait(zone, ZONE_IS_READY);
5253         /*
5254          * The zone is fully visible, so we can let mounts progress.
5255          */
5256         resume_mounts(zone);
5257         nvlist_free(rctls);
5258 
5259         return (zoneid);
5260 
5261 errout:
5262         mutex_exit(&zonehash_lock);
5263         /*
5264          * Let the other lwps continue.
5265          */
5266         mutex_enter(&pp->p_lock);
5267         if (curthread != pp->p_agenttp)
5268                 continuelwps(pp);
5269         mutex_exit(&pp->p_lock);
5270 
5271         resume_mounts(zone);
5272         nvlist_free(rctls);
5273         /*
5274          * There is currently one reference to the zone, a cred_ref from
5275          * zone_kcred.  To free the zone, we call crfree, which will call
5276          * zone_cred_rele, which will call zone_free.
5277          */
5278         ASSERT(zone->zone_cred_ref == 1);
5279         ASSERT(zone->zone_kcred->cr_ref == 1);
5280         ASSERT(zone->zone_ref == 0);
5281         zkcr = zone->zone_kcred;
5282         zone->zone_kcred = NULL;
5283         crfree(zkcr);                           /* triggers call to zone_free */
5284         return (zone_create_error(error, error2, extended_error));
5285 }
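
/*
 * For reference, the successful create path interleaves with zsched() as
 * follows: zone_create() hashes the zone, newproc()s zsched(), and blocks
 * in zone_status_wait(zone, ZONE_IS_READY); zsched() then walks the zone
 * through ZONE_IS_UNINITIALIZED -> ZONE_IS_INITIALIZED -> ZONE_IS_READY,
 * releasing the waiter, and itself blocks until ZONE_IS_BOOTING.
 */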
5286 
/*
 * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
 * the heavy lifting.  The program launched at the "top" of the zone is the
 * one named by zone_initname, which zone_create() initializes to the system
 * default stored in zone_default_initname.
 */
5293 static int
5294 zone_boot(zoneid_t zoneid)
5295 {
5296         int err;
5297         zone_t *zone;
5298 
5299         if (secpolicy_zone_config(CRED()) != 0)
5300                 return (set_errno(EPERM));
5301         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5302                 return (set_errno(EINVAL));
5303 
5304         mutex_enter(&zonehash_lock);
5305         /*
5306          * Look for zone under hash lock to prevent races with calls to
5307          * zone_shutdown, zone_destroy, etc.
5308          */
5309         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5310                 mutex_exit(&zonehash_lock);
5311                 return (set_errno(EINVAL));
5312         }
5313 
5314         mutex_enter(&zone_status_lock);
5315         if (zone_status_get(zone) != ZONE_IS_READY) {
5316                 mutex_exit(&zone_status_lock);
5317                 mutex_exit(&zonehash_lock);
5318                 return (set_errno(EINVAL));
5319         }
5320         zone_status_set(zone, ZONE_IS_BOOTING);
5321         mutex_exit(&zone_status_lock);
5322 
5323         zone_hold(zone);        /* so we can use the zone_t later */
5324         mutex_exit(&zonehash_lock);
5325 
5326         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
5327                 zone_rele(zone);
5328                 return (set_errno(EINTR));
5329         }
5330 
5331         /*
5332          * Boot (starting init) might have failed, in which case the zone
5333          * will go to the SHUTTING_DOWN state; an appropriate errno will
5334          * be placed in zone->zone_boot_err, and so we return that.
5335          */
5336         err = zone->zone_boot_err;
5337         zone_rele(zone);
5338         return (err ? set_errno(err) : 0);
5339 }
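
/*
 * The boot handshake, for reference: zone_boot() moves a READY zone to
 * ZONE_IS_BOOTING and waits for ZONE_IS_RUNNING; zsched(), woken by the
 * BOOTING transition, newproc()s zone_start_init(), which is expected to
 * move the zone to ZONE_IS_RUNNING once init is launched.  If newproc()
 * fails, zsched() records zone_boot_err and sets ZONE_IS_SHUTTING_DOWN,
 * which also unblocks the wait above (the status has passed RUNNING), and
 * the recorded error is returned.
 */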
5340 
5341 /*
5342  * Kills all user processes in the zone, waiting for them all to exit
5343  * before returning.
5344  */
5345 static int
5346 zone_empty(zone_t *zone)
5347 {
5348         int cnt = 0;
5349         int waitstatus;
5350 
5351         /*
5352          * We need to drop zonehash_lock before killing all
5353          * processes, otherwise we'll deadlock with zone_find_*
5354          * which can be called from the exit path.
5355          */
5356         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5357         while ((waitstatus = zone_status_timedwait_sig(zone,
5358             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5359                 boolean_t force = B_FALSE;
5360 
5361                 /* Every 30 seconds, try harder */
5362                 if (cnt++ >= 30) {
5363                         cmn_err(CE_WARN, "attempt to force kill zone %d\n",
5364                             zone->zone_id);
5365                         force = B_TRUE;
5366                         cnt = 0;
5367                 }
5368                 killall(zone->zone_id, force);
5369         }
5370         /*
5371          * return EINTR if we were signaled
5372          */
5373         if (waitstatus == 0)
5374                 return (EINTR);
5375         return (0);
5376 }
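
/*
 * For illustration: each pass of the loop above waits up to one second
 * (hz clock ticks) for the zone to empty before re-issuing killall();
 * after roughly thirty unsuccessful one-second passes, the next kill is
 * forced, per the "every 30 seconds, try harder" policy noted above.
 */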
5377 
5378 /*
5379  * This function implements the policy for zone visibility.
5380  *
5381  * In standard Solaris, a non-global zone can only see itself.
5382  *
5383  * In Trusted Extensions, a labeled zone can lookup any zone whose label
5384  * it dominates. For this test, the label of the global zone is treated as
5385  * admin_high so it is special-cased instead of being checked for dominance.
5386  *
5387  * Returns true if zone attributes are viewable, false otherwise.
5388  */
5389 static boolean_t
5390 zone_list_access(zone_t *zone)
5391 {
5392 
5393         if (curproc->p_zone == global_zone ||
5394             curproc->p_zone == zone) {
5395                 return (B_TRUE);
5396         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5397                 bslabel_t *curproc_label;
5398                 bslabel_t *zone_label;
5399 
5400                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
5401                 zone_label = label2bslabel(zone->zone_slabel);
5402 
5403                 if (zone->zone_id != GLOBAL_ZONEID &&
5404                     bldominates(curproc_label, zone_label)) {
5405                         return (B_TRUE);
5406                 } else {
5407                         return (B_FALSE);
5408                 }
5409         } else {
5410                 return (B_FALSE);
5411         }
5412 }
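
/*
 * For illustration: on a labeled system, if zone A's label strictly
 * dominates zone B's, then A can list B but B cannot list A.  The global
 * zone is excluded from the dominance test above because its label is
 * effectively admin_high.
 */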
5413 
5414 /*
 * System call to start the zone's halt sequence.  By the time this
5416  * function successfully returns, all user processes and kernel threads
5417  * executing in it will have exited, ZSD shutdown callbacks executed,
5418  * and the zone status set to ZONE_IS_DOWN.
5419  *
5420  * It is possible that the call will interrupt itself if the caller is the
5421  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
5422  */
5423 static int
5424 zone_shutdown(zoneid_t zoneid)
5425 {
5426         int error;
5427         zone_t *zone;
5428         zone_status_t status;
5429 
5430         if (secpolicy_zone_config(CRED()) != 0)
5431                 return (set_errno(EPERM));
5432         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5433                 return (set_errno(EINVAL));
5434 
5435         mutex_enter(&zonehash_lock);
5436         /*
5437          * Look for zone under hash lock to prevent races with other
5438          * calls to zone_shutdown and zone_destroy.
5439          */
5440         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5441                 mutex_exit(&zonehash_lock);
5442                 return (set_errno(EINVAL));
5443         }
5444 
5445         /*
5446          * We have to drop zonehash_lock before calling block_mounts.
5447          * Hold the zone so we can continue to use the zone_t.
5448          */
5449         zone_hold(zone);
5450         mutex_exit(&zonehash_lock);
5451 
	/*
	 * Block mounts so that VFS_MOUNT() can get an accurate view of
	 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
	 *
	 * e.g. NFS can fail the mount if it determines that the zone
	 * has already begun the shutdown sequence.
	 */
5460         if (block_mounts(zone) == 0) {
5461                 zone_rele(zone);
5462                 return (set_errno(EINTR));
5463         }
5464 
5465         mutex_enter(&zonehash_lock);
5466         mutex_enter(&zone_status_lock);
5467         status = zone_status_get(zone);
5468         /*
5469          * Fail if the zone isn't fully initialized yet.
5470          */
5471         if (status < ZONE_IS_READY) {
5472                 mutex_exit(&zone_status_lock);
5473                 mutex_exit(&zonehash_lock);
5474                 resume_mounts(zone);
5475                 zone_rele(zone);
5476                 return (set_errno(EINVAL));
5477         }
5478         /*
5479          * If conditions required for zone_shutdown() to return have been met,
5480          * return success.
5481          */
5482         if (status >= ZONE_IS_DOWN) {
5483                 mutex_exit(&zone_status_lock);
5484                 mutex_exit(&zonehash_lock);
5485                 resume_mounts(zone);
5486                 zone_rele(zone);
5487                 return (0);
5488         }
5489         /*
5490          * If zone_shutdown() hasn't been called before, go through the motions.
5491          * If it has, there's nothing to do but wait for the kernel threads to
5492          * drain.
5493          */
5494         if (status < ZONE_IS_EMPTY) {
5495                 uint_t ntasks;
5496 
5497                 mutex_enter(&zone->zone_lock);
5498                 if ((ntasks = zone->zone_ntasks) != 1) {
5499                         /*
5500                          * There's still stuff running.
5501                          */
5502                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5503                 }
5504                 mutex_exit(&zone->zone_lock);
5505                 if (ntasks == 1) {
5506                         /*
5507                          * The only way to create another task is through
5508                          * zone_enter(), which will block until we drop
5509                          * zonehash_lock.  The zone is empty.
5510                          */
5511                         if (zone->zone_kthreads == NULL) {
5512                                 /*
5513                                  * Skip ahead to ZONE_IS_DOWN
5514                                  */
5515                                 zone_status_set(zone, ZONE_IS_DOWN);
5516                         } else {
5517                                 zone_status_set(zone, ZONE_IS_EMPTY);
5518                         }
5519                 }
5520         }
5521         mutex_exit(&zone_status_lock);
5522         mutex_exit(&zonehash_lock);
5523         resume_mounts(zone);
5524 
	if ((error = zone_empty(zone)) != 0) {
5526                 zone_rele(zone);
5527                 return (set_errno(error));
5528         }
5529         /*
5530          * After the zone status goes to ZONE_IS_DOWN this zone will no
5531          * longer be notified of changes to the pools configuration, so
5532          * in order to not end up with a stale pool pointer, we point
5533          * ourselves at the default pool and remove all resource
5534          * visibility.  This is especially important as the zone_t may
5535          * languish on the deathrow for a very long time waiting for
	 * creds to drain out.
5537          *
5538          * This rebinding of the zone can happen multiple times
	 * (presumably due to interrupted or parallel system calls)
5540          * without any adverse effects.
5541          */
5542         if (pool_lock_intr() != 0) {
5543                 zone_rele(zone);
5544                 return (set_errno(EINTR));
5545         }
5546         if (pool_state == POOL_ENABLED) {
5547                 mutex_enter(&cpu_lock);
5548                 zone_pool_set(zone, pool_default);
5549                 /*
5550                  * The zone no longer needs to be able to see any cpus.
5551                  */
5552                 zone_pset_set(zone, ZONE_PS_INVAL);
5553                 mutex_exit(&cpu_lock);
5554         }
5555         pool_unlock();
5556 
5557         /*
5558          * ZSD shutdown callbacks can be executed multiple times, hence
5559          * it is safe to not be holding any locks across this call.
5560          */
5561         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5562 
5563         mutex_enter(&zone_status_lock);
5564         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5565                 zone_status_set(zone, ZONE_IS_DOWN);
5566         mutex_exit(&zone_status_lock);
5567 
5568         /*
5569          * Wait for kernel threads to drain.
5570          */
5571         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5572                 zone_rele(zone);
5573                 return (set_errno(EINTR));
5574         }
5575 
	/*
	 * The zone can become down/destroyable even if the above wait
	 * returns EINTR, so any code added here may never execute.
	 * (i.e. don't add code here)
	 */
5581 
5582         zone_rele(zone);
5583         return (0);
5584 }
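
/*
 * For reference, the shutdown sequence above is: block mounts, mark the
 * zone ZONE_IS_SHUTTING_DOWN (or skip ahead if it is already empty),
 * empty it with zone_empty(), rebind it to the default pool, run the ZSD
 * shutdown callbacks, and wait for the zone's kernel threads to drain,
 * at which point the zone is marked ZONE_IS_DOWN.
 */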
5585 
5586 /*
5587  * Log the specified zone's reference counts.  The caller should not be
5588  * holding the zone's zone_lock.
5589  */
5590 static void
5591 zone_log_refcounts(zone_t *zone)
5592 {
5593         char *buffer;
5594         char *buffer_position;
5595         uint32_t buffer_size;
5596         uint32_t index;
5597         uint_t ref;
5598         uint_t cred_ref;
5599 
5600         /*
5601          * Construct a string representing the subsystem-specific reference
5602          * counts.  The counts are printed in ascending order by index into the
5603          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5604          * square brackets [] and will only contain nonzero reference counts.
5605          *
5606          * The buffer will hold two square bracket characters plus ten digits,
5607          * one colon, one space, one comma, and some characters for a
5608          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5609          * bit integers have at most ten decimal digits.)  The last
5610          * reference count's comma is replaced by the closing square
5611          * bracket and a NULL character to terminate the string.
5612          *
5613          * NOTE: We have to grab the zone's zone_lock to create a consistent
5614          * snapshot of the zone's reference counters.
5615          *
5616          * First, figure out how much space the string buffer will need.
5617          * The buffer's size is stored in buffer_size.
5618          */
5619         buffer_size = 2;                        /* for the square brackets */
5620         mutex_enter(&zone->zone_lock);
5621         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5622         ref = zone->zone_ref;
5623         cred_ref = zone->zone_cred_ref;
5624         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5625                 if (zone->zone_subsys_ref[index] != 0)
5626                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5627                             13;
5628         if (buffer_size == 2) {
5629                 /*
5630                  * No subsystems had nonzero reference counts.  Don't bother
5631                  * with allocating a buffer; just log the general-purpose and
5632                  * credential reference counts.
5633                  */
5634                 mutex_exit(&zone->zone_lock);
5635                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5636                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5637                     "references and %u credential references are still extant",
5638                     zone->zone_name, zone->zone_id, ref, cred_ref);
5639                 return;
5640         }
5641 
5642         /*
5643          * buffer_size contains the exact number of characters that the
5644          * buffer will need.  Allocate the buffer and fill it with nonzero
5645          * subsystem-specific reference counts.  Surround the results with
5646          * square brackets afterwards.
5647          */
5648         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5649         buffer_position = &buffer[1];
5650         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5651                 /*
5652                  * NOTE: The DDI's version of sprintf() returns a pointer to
5653                  * the modified buffer rather than the number of bytes written
5654                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5655                  * Therefore, we'll use snprintf() with INT_MAX to get the
5656                  * number of bytes written.  Using INT_MAX is safe because
5657                  * the buffer is perfectly sized for the data: we'll never
5658                  * overrun the buffer.
5659                  */
5660                 if (zone->zone_subsys_ref[index] != 0)
5661                         buffer_position += snprintf(buffer_position, INT_MAX,
5662                             "%s: %u,", zone_ref_subsys_names[index],
5663                             zone->zone_subsys_ref[index]);
5664         }
5665         mutex_exit(&zone->zone_lock);
5666         buffer[0] = '[';
5667         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5668         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5669         buffer_position[-1] = ']';
5670 
5671         /*
5672          * Log the reference counts and free the message buffer.
5673          */
5674         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5675             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5676             "%u credential references are still extant %s", zone->zone_name,
5677             zone->zone_id, ref, cred_ref, buffer);
5678         kmem_free(buffer, buffer_size);
5679 }
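
/*
 * A resulting log line looks something like (zone and subsystem names
 * here are purely illustrative):
 *
 *	Zone 'web01' (ID: 12) is shutting down, but 3 zone references
 *	and 1 credential references are still extant [nfs: 2,zfs: 1]
 *
 * Note the 13 bytes reserved per subsystem above: up to ten digits for
 * a uint32_t count, plus the ": " separator and the trailing comma.
 */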
5680 
5681 /*
 * System call entry point to finalize the zone halt process.  The caller
5683  * must have already successfully called zone_shutdown().
5684  *
5685  * Upon successful completion, the zone will have been fully destroyed:
5686  * zsched will have exited, destructor callbacks executed, and the zone
5687  * removed from the list of active zones.
5688  */
5689 static int
5690 zone_destroy(zoneid_t zoneid)
5691 {
5692         uint64_t uniqid;
5693         zone_t *zone;
5694         zone_status_t status;
5695         clock_t wait_time;
5696         boolean_t log_refcounts;
5697 
5698         if (secpolicy_zone_config(CRED()) != 0)
5699                 return (set_errno(EPERM));
5700         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5701                 return (set_errno(EINVAL));
5702 
5703         mutex_enter(&zonehash_lock);
5704         /*
5705          * Look for zone under hash lock to prevent races with other
5706          * calls to zone_destroy.
5707          */
5708         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5709                 mutex_exit(&zonehash_lock);
5710                 return (set_errno(EINVAL));
5711         }
5712 
5713         if (zone_mount_count(zone->zone_rootpath) != 0) {
5714                 mutex_exit(&zonehash_lock);
5715                 return (set_errno(EBUSY));
5716         }
5717         mutex_enter(&zone_status_lock);
5718         status = zone_status_get(zone);
5719         if (status < ZONE_IS_DOWN) {
5720                 mutex_exit(&zone_status_lock);
5721                 mutex_exit(&zonehash_lock);
5722                 return (set_errno(EBUSY));
5723         } else if (status == ZONE_IS_DOWN) {
5724                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5725         }
5726         mutex_exit(&zone_status_lock);
5727         zone_hold(zone);
5728         mutex_exit(&zonehash_lock);
5729 
5730         /*
5731          * wait for zsched to exit
5732          */
5733         zone_status_wait(zone, ZONE_IS_DEAD);
5734         zone_zsd_callbacks(zone, ZSD_DESTROY);
5735         zone->zone_netstack = NULL;
5736         uniqid = zone->zone_uniqid;
5737         zone_rele(zone);
5738         zone = NULL;    /* potentially free'd */
5739 
5740         log_refcounts = B_FALSE;
5741         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5742         mutex_enter(&zonehash_lock);
5743         for (; /* ever */; ) {
5744                 boolean_t unref;
5745                 boolean_t refs_have_been_logged;
5746 
5747                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5748                     zone->zone_uniqid != uniqid) {
5749                         /*
5750                          * The zone has gone away.  Necessary conditions
5751                          * are met, so we return success.
5752                          */
5753                         mutex_exit(&zonehash_lock);
5754                         return (0);
5755                 }
5756                 mutex_enter(&zone->zone_lock);
5757                 unref = ZONE_IS_UNREF(zone);
5758                 refs_have_been_logged = (zone->zone_flags &
5759                     ZF_REFCOUNTS_LOGGED);
5760                 mutex_exit(&zone->zone_lock);
5761                 if (unref) {
5762                         /*
5763                          * There is only one reference to the zone -- that
5764                          * added when the zone was added to the hashtables --
5765                          * and things will remain this way until we drop
5766                          * zonehash_lock... we can go ahead and cleanup the
5767                          * zone.
5768                          */
5769                         break;
5770                 }
5771 
5772                 /*
5773                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5774                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5775                  * some zone's general-purpose reference count reaches one.
5776                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5777                  * on zone_destroy_cv, then log the zone's reference counts and
5778                  * continue to wait for zone_rele() and zone_cred_rele().
5779                  */
5780                 if (!refs_have_been_logged) {
5781                         if (!log_refcounts) {
5782                                 /*
5783                                  * This thread hasn't timed out waiting on
5784                                  * zone_destroy_cv yet.  Wait wait_time clock
5785                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5786                                  * seconds) for the zone's references to clear.
5787                                  */
5788                                 ASSERT(wait_time > 0);
5789                                 wait_time = cv_reltimedwait_sig(
5790                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5791                                     TR_SEC);
5792                                 if (wait_time > 0) {
5793                                         /*
5794                                          * A thread in zone_rele() or
5795                                          * zone_cred_rele() signaled
5796                                          * zone_destroy_cv before this thread's
5797                                          * wait timed out.  The zone might have
5798                                          * only one reference left; find out!
5799                                          */
5800                                         continue;
5801                                 } else if (wait_time == 0) {
5802                                         /* The thread's process was signaled. */
5803                                         mutex_exit(&zonehash_lock);
5804                                         return (set_errno(EINTR));
5805                                 }
5806 
5807                                 /*
5808                                  * The thread timed out while waiting on
5809                                  * zone_destroy_cv.  Even though the thread
5810                                  * timed out, it has to check whether another
5811                                  * thread woke up from zone_destroy_cv and
5812                                  * destroyed the zone.
5813                                  *
5814                                  * If the zone still exists and has more than
5815                                  * one unreleased general-purpose reference,
5816                                  * then log the zone's reference counts.
5817                                  */
5818                                 log_refcounts = B_TRUE;
5819                                 continue;
5820                         }
5821 
5822                         /*
5823                          * The thread already timed out on zone_destroy_cv while
5824                          * waiting for subsystems to release the zone's last
5825                          * general-purpose references.  Log the zone's reference
5826                          * counts and wait indefinitely on zone_destroy_cv.
5827                          */
5828                         zone_log_refcounts(zone);
5829                 }
5830                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5831                         /* The thread's process was signaled. */
5832                         mutex_exit(&zonehash_lock);
5833                         return (set_errno(EINTR));
5834                 }
5835         }
5836 
5837         /*
5838          * Remove the CPU cap for this zone now, since we're not going
5839          * to fail below this point.
5840          */
5841         cpucaps_zone_remove(zone);
5842 
5843         /* Get rid of the zone's kstats */
5844         zone_kstat_delete(zone);
5845 
5846         /* remove the pfexecd doors */
5847         if (zone->zone_pfexecd != NULL) {
5848                 klpd_freelist(&zone->zone_pfexecd);
5849                 zone->zone_pfexecd = NULL;
5850         }
5851 
5852         /* free brand specific data */
5853         if (ZONE_IS_BRANDED(zone))
5854                 ZBROP(zone)->b_free_brand_data(zone);
5855 
5856         /* Say goodbye to brand framework. */
5857         brand_unregister_zone(zone->zone_brand);
5858 
5859         /*
5860          * It is now safe to let the zone be recreated; remove it from the
5861          * lists.  The memory will not be freed until the last cred
5862          * reference goes away.
5863          */
5864         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5865         zonecount--;
5866         /* remove from active list and hash tables */
5867         list_remove(&zone_active, zone);
5868         (void) mod_hash_destroy(zonehashbyname,
5869             (mod_hash_key_t)zone->zone_name);
5870         (void) mod_hash_destroy(zonehashbyid,
5871             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5872         if (zone->zone_flags & ZF_HASHED_LABEL)
5873                 (void) mod_hash_destroy(zonehashbylabel,
5874                     (mod_hash_key_t)zone->zone_slabel);
5875         mutex_exit(&zonehash_lock);
5876 
5877         /*
5878          * Release the root vnode; we're not using it anymore, and no other
5879          * thread that might access it should still exist.
5880          */
5881         if (zone->zone_rootvp != NULL) {
5882                 VN_RELE(zone->zone_rootvp);
5883                 zone->zone_rootvp = NULL;
5884         }
5885 
5886         /* add to deathrow list */
5887         mutex_enter(&zone_deathrow_lock);
5888         list_insert_tail(&zone_deathrow, zone);
5889         mutex_exit(&zone_deathrow_lock);
5890 
5891         /*
5892          * Drop the last reference (which was added by zsched()); this will
5893          * free the zone unless there are outstanding cred references.
5894          */
5895         zone_rele(zone);
5896         return (0);
5897 }
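
/*
 * Illustrative caller's view of the teardown above -- a sketch only, not
 * part of the kernel build.  Since zone_destroy() blocks until the zone's
 * general-purpose references drain, and fails with EINTR if the process is
 * signaled, a userland consumer (here assuming the private libc wrapper
 * zone_destroy() declared for userland in <sys/zone.h>) can simply retry:
 *
 *	#include <sys/zone.h>
 *	#include <errno.h>
 *
 *	int
 *	destroy_zone(zoneid_t zid)
 *	{
 *		int ret;
 *
 *		while ((ret = zone_destroy(zid)) != 0 && errno == EINTR)
 *			continue;	(interrupted; try again)
 *		return (ret);
 *	}
 */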
5898 
5899 /*
5900  * System call entry point for zone_getattr(2).
5901  */
5902 static ssize_t
5903 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5904 {
5905         size_t size;
5906         int error = 0, err;
5907         zone_t *zone;
5908         char *zonepath;
5909         char *outstr;
5910         zone_status_t zone_status;
5911         pid_t initpid;
5912         boolean_t global = (curzone == global_zone);
5913         boolean_t inzone = (curzone->zone_id == zoneid);
5914         ushort_t flags;
5915         zone_net_data_t *zbuf;
5916 
5917         mutex_enter(&zonehash_lock);
5918         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5919                 mutex_exit(&zonehash_lock);
5920                 return (set_errno(EINVAL));
5921         }
5922         zone_status = zone_status_get(zone);
5923         if (zone_status < ZONE_IS_INITIALIZED) {
5924                 mutex_exit(&zonehash_lock);
5925                 return (set_errno(EINVAL));
5926         }
5927         zone_hold(zone);
5928         mutex_exit(&zonehash_lock);
5929 
5930         /*
5931          * If not in the global zone, don't show information about other zones,
5932          * unless the system is labeled and the local zone's label dominates
5933          * the other zone.
5934          */
5935         if (!zone_list_access(zone)) {
5936                 zone_rele(zone);
5937                 return (set_errno(EINVAL));
5938         }
5939 
5940         switch (attr) {
5941         case ZONE_ATTR_ROOT:
5942                 if (global) {
5943                         /*
5944                          * Copy the path to trim the trailing "/" (except for
5945                          * the global zone).
5946                          */
5947                         if (zone != global_zone)
5948                                 size = zone->zone_rootpathlen - 1;
5949                         else
5950                                 size = zone->zone_rootpathlen;
5951                         zonepath = kmem_alloc(size, KM_SLEEP);
5952                         bcopy(zone->zone_rootpath, zonepath, size);
5953                         zonepath[size - 1] = '\0';
5954                 } else {
5955                         if (inzone || !is_system_labeled()) {
5956                                 /*
5957                                  * Caller is not in the global zone.
5958                                  * If the query is on the current zone
5959                                  * or the system is not labeled,
5960                                  * just return the faked-up path for the current zone.
5961                                  */
5962                                 zonepath = "/";
5963                                 size = 2;
5964                         } else {
5965                                 /*
5966                                  * Return the zone's path under zone_prefix.
5967                                  */
5968                                 int prefix_len = strlen(zone_prefix);
5969                                 int zname_len = strlen(zone->zone_name);
5970 
5971                                 size = prefix_len + zname_len + 1;
5972                                 zonepath = kmem_alloc(size, KM_SLEEP);
5973                                 bcopy(zone_prefix, zonepath, prefix_len);
5974                                 bcopy(zone->zone_name, zonepath +
5975                                     prefix_len, zname_len);
5976                                 zonepath[size - 1] = '\0';
5977                         }
5978                 }
5979                 if (bufsize > size)
5980                         bufsize = size;
5981                 if (buf != NULL) {
5982                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5983                         if (err != 0 && err != ENAMETOOLONG)
5984                                 error = EFAULT;
5985                 }
5986                 if (global || (is_system_labeled() && !inzone))
5987                         kmem_free(zonepath, size);
5988                 break;
5989 
5990         case ZONE_ATTR_NAME:
5991                 size = strlen(zone->zone_name) + 1;
5992                 if (bufsize > size)
5993                         bufsize = size;
5994                 if (buf != NULL) {
5995                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5996                         if (err != 0 && err != ENAMETOOLONG)
5997                                 error = EFAULT;
5998                 }
5999                 break;
6000 
6001         case ZONE_ATTR_STATUS:
6002                 /*
6003                  * Since we're not holding zonehash_lock, the zone status
6004                  * may be anything; leave it up to userland to sort it out.
6005                  */
6006                 size = sizeof (zone_status);
6007                 if (bufsize > size)
6008                         bufsize = size;
6009                 zone_status = zone_status_get(zone);
6010                 if (buf != NULL &&
6011                     copyout(&zone_status, buf, bufsize) != 0)
6012                         error = EFAULT;
6013                 break;
6014         case ZONE_ATTR_FLAGS:
6015                 size = sizeof (zone->zone_flags);
6016                 if (bufsize > size)
6017                         bufsize = size;
6018                 flags = zone->zone_flags;
6019                 if (buf != NULL &&
6020                     copyout(&flags, buf, bufsize) != 0)
6021                         error = EFAULT;
6022                 break;
6023         case ZONE_ATTR_PRIVSET:
6024                 size = sizeof (priv_set_t);
6025                 if (bufsize > size)
6026                         bufsize = size;
6027                 if (buf != NULL &&
6028                     copyout(zone->zone_privset, buf, bufsize) != 0)
6029                         error = EFAULT;
6030                 break;
6031         case ZONE_ATTR_UNIQID:
6032                 size = sizeof (zone->zone_uniqid);
6033                 if (bufsize > size)
6034                         bufsize = size;
6035                 if (buf != NULL &&
6036                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
6037                         error = EFAULT;
6038                 break;
6039         case ZONE_ATTR_POOLID:
6040                 {
6041                         pool_t *pool;
6042                         poolid_t poolid;
6043 
6044                         if (pool_lock_intr() != 0) {
6045                                 error = EINTR;
6046                                 break;
6047                         }
6048                         pool = zone_pool_get(zone);
6049                         poolid = pool->pool_id;
6050                         pool_unlock();
6051                         size = sizeof (poolid);
6052                         if (bufsize > size)
6053                                 bufsize = size;
6054                         if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
6055                                 error = EFAULT;
6056                 }
6057                 break;
6058         case ZONE_ATTR_SLBL:
6059                 size = sizeof (bslabel_t);
6060                 if (bufsize > size)
6061                         bufsize = size;
6062                 if (zone->zone_slabel == NULL)
6063                         error = EINVAL;
6064                 else if (buf != NULL &&
6065                     copyout(label2bslabel(zone->zone_slabel), buf,
6066                     bufsize) != 0)
6067                         error = EFAULT;
6068                 break;
6069         case ZONE_ATTR_INITPID:
6070                 size = sizeof (initpid);
6071                 if (bufsize > size)
6072                         bufsize = size;
6073                 initpid = zone->zone_proc_initpid;
6074                 if (initpid == -1) {
6075                         error = ESRCH;
6076                         break;
6077                 }
6078                 if (buf != NULL &&
6079                     copyout(&initpid, buf, bufsize) != 0)
6080                         error = EFAULT;
6081                 break;
6082         case ZONE_ATTR_BRAND:
6083                 size = strlen(zone->zone_brand->b_name) + 1;
6084 
6085                 if (bufsize > size)
6086                         bufsize = size;
6087                 if (buf != NULL) {
6088                         err = copyoutstr(zone->zone_brand->b_name, buf,
6089                             bufsize, NULL);
6090                         if (err != 0 && err != ENAMETOOLONG)
6091                                 error = EFAULT;
6092                 }
6093                 break;
6094         case ZONE_ATTR_INITNAME:
6095                 size = strlen(zone->zone_initname) + 1;
6096                 if (bufsize > size)
6097                         bufsize = size;
6098                 if (buf != NULL) {
6099                         err = copyoutstr(zone->zone_initname, buf, bufsize,
6100                             NULL);
6101                         if (err != 0 && err != ENAMETOOLONG)
6102                                 error = EFAULT;
6103                 }
6104                 break;
6105         case ZONE_ATTR_BOOTARGS:
6106                 if (zone->zone_bootargs == NULL)
6107                         outstr = "";
6108                 else
6109                         outstr = zone->zone_bootargs;
6110                 size = strlen(outstr) + 1;
6111                 if (bufsize > size)
6112                         bufsize = size;
6113                 if (buf != NULL) {
6114                         err = copyoutstr(outstr, buf, bufsize, NULL);
6115                         if (err != 0 && err != ENAMETOOLONG)
6116                                 error = EFAULT;
6117                 }
6118                 break;
6119         case ZONE_ATTR_SCHED_CLASS:
6120                 mutex_enter(&class_lock);
6121 
6122                 if (zone->zone_defaultcid >= loaded_classes)
6123                         outstr = "";
6124                 else
6125                         outstr = sclass[zone->zone_defaultcid].cl_name;
6126                 size = strlen(outstr) + 1;
6127                 if (bufsize > size)
6128                         bufsize = size;
6129                 if (buf != NULL) {
6130                         err = copyoutstr(outstr, buf, bufsize, NULL);
6131                         if (err != 0 && err != ENAMETOOLONG)
6132                                 error = EFAULT;
6133                 }
6134 
6135                 mutex_exit(&class_lock);
6136                 break;
6137         case ZONE_ATTR_HOSTID:
6138                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
6139                     bufsize == sizeof (zone->zone_hostid)) {
6140                         size = sizeof (zone->zone_hostid);
6141                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
6142                             bufsize) != 0)
6143                                 error = EFAULT;
6144                 } else {
6145                         error = EINVAL;
6146                 }
6147                 break;
6148         case ZONE_ATTR_FS_ALLOWED:
6149                 if (zone->zone_fs_allowed == NULL)
6150                         outstr = "";
6151                 else
6152                         outstr = zone->zone_fs_allowed;
6153                 size = strlen(outstr) + 1;
6154                 if (bufsize > size)
6155                         bufsize = size;
6156                 if (buf != NULL) {
6157                         err = copyoutstr(outstr, buf, bufsize, NULL);
6158                         if (err != 0 && err != ENAMETOOLONG)
6159                                 error = EFAULT;
6160                 }
6161                 break;
6162         case ZONE_ATTR_NETWORK:
6163                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6164                 if (copyin(buf, zbuf, bufsize) != 0) {
6165                         error = EFAULT;
6166                 } else {
6167                         error = zone_get_network(zoneid, zbuf);
6168                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
6169                                 error = EFAULT;
6170                 }
6171                 kmem_free(zbuf, bufsize);
6172                 break;
6173         case ZONE_ATTR_DID:
6174                 size = sizeof (zoneid_t);
6175                 if (bufsize > size)
6176                         bufsize = size;
6177 
6178                 if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
6179                         error = EFAULT;
6180                 break;
6181         case ZONE_ATTR_SCHED_FIXEDHI:
6182                 size = sizeof (boolean_t);
6183                 if (bufsize > size)
6184                         bufsize = size;
6185 
6186                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6187                     bufsize) != 0)
6188                         error = EFAULT;
6189                 break;
6190         default:
6191                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6192                         size = bufsize;
6193                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6194                 } else {
6195                         error = EINVAL;
6196                 }
6197         }
6198         zone_rele(zone);
6199 
6200         if (error)
6201                 return (set_errno(error));
6202         return ((ssize_t)size);
6203 }
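
/*
 * Illustrative userland usage -- a sketch only, not part of the kernel
 * build.  Because zone_getattr() returns the attribute's full size and skips
 * the copyout when buf is NULL, callers can size a buffer in two calls.
 * This assumes the private libc wrapper zone_getattr() declared for userland
 * in <sys/zone.h>:
 *
 *	#include <sys/zone.h>
 *	#include <zone.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	zoneid_t zid = getzoneid();	(or any zone ID visible to us)
 *	ssize_t len = zone_getattr(zid, ZONE_ATTR_NAME, NULL, 0);
 *
 *	if (len > 0) {
 *		char *name = malloc(len);
 *
 *		if (name != NULL &&
 *		    zone_getattr(zid, ZONE_ATTR_NAME, name, len) == len)
 *			(void) printf("zone %d is named %s\n", (int)zid, name);
 *		free(name);
 *	}
 */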
6204 
6205 /*
6206  * System call entry point for zone_setattr(2).
6207  */
6208 /*ARGSUSED*/
6209 static int
6210 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6211 {
6212         zone_t *zone;
6213         zone_status_t zone_status;
6214         int err = -1;
6215         zone_net_data_t *zbuf;
6216 
6217         if (secpolicy_zone_config(CRED()) != 0)
6218                 return (set_errno(EPERM));
6219 
6220         /*
6221          * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
6222          * attributes can be set on the global zone.
6223          */
6224         if (zoneid == GLOBAL_ZONEID &&
6225             attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
6226                 return (set_errno(EINVAL));
6227         }
6228 
6229         mutex_enter(&zonehash_lock);
6230         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6231                 mutex_exit(&zonehash_lock);
6232                 return (set_errno(EINVAL));
6233         }
6234         zone_hold(zone);
6235         mutex_exit(&zonehash_lock);
6236 
6237         /*
6238          * At present most attributes can only be set on non-running,
6239          * non-global zones.
6240          */
6241         zone_status = zone_status_get(zone);
6242         if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
6243             attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
6244             zone_status > ZONE_IS_READY) {
6245                 err = EINVAL;
6246                 goto done;
6247         }
6248 
6249         switch (attr) {
6250         case ZONE_ATTR_INITNAME:
6251                 err = zone_set_initname(zone, (const char *)buf);
6252                 break;
6253         case ZONE_ATTR_INITNORESTART:
6254                 zone->zone_restart_init = B_FALSE;
6255                 err = 0;
6256                 break;
6257         case ZONE_ATTR_BOOTARGS:
6258                 err = zone_set_bootargs(zone, (const char *)buf);
6259                 break;
6260         case ZONE_ATTR_BRAND:
6261                 err = zone_set_brand(zone, (const char *)buf);
6262                 break;
6263         case ZONE_ATTR_FS_ALLOWED:
6264                 err = zone_set_fs_allowed(zone, (const char *)buf);
6265                 break;
6266         case ZONE_ATTR_PMCAP_NOVER:
6267                 err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
6268                 break;
6269         case ZONE_ATTR_PMCAP_PAGEOUT:
6270                 err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
6271                 break;
6272         case ZONE_ATTR_PG_FLT_DELAY:
6273                 err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
6274                 break;
6275         case ZONE_ATTR_RSS:
6276                 err = zone_set_rss(zone, (const uint64_t *)buf);
6277                 break;
6278         case ZONE_ATTR_SCHED_CLASS:
6279                 err = zone_set_sched_class(zone, (const char *)buf);
6280                 break;
6281         case ZONE_ATTR_HOSTID:
6282                 if (bufsize == sizeof (zone->zone_hostid)) {
6283                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
6284                                 err = 0;
6285                         else
6286                                 err = EFAULT;
6287                 } else {
6288                         err = EINVAL;
6289                 }
6290                 break;
6291         case ZONE_ATTR_NETWORK:
6292                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
6293                         err = EINVAL;
6294                         break;
6295                 }
6296                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6297                 if (copyin(buf, zbuf, bufsize) != 0) {
6298                         kmem_free(zbuf, bufsize);
6299                         err = EFAULT;
6300                         break;
6301                 }
6302                 err = zone_set_network(zoneid, zbuf);
6303                 kmem_free(zbuf, bufsize);
6304                 break;
6305         case ZONE_ATTR_APP_SVC_CT:
6306                 if (bufsize != sizeof (boolean_t)) {
6307                         err = EINVAL;
6308                 } else {
6309                         zone->zone_setup_app_contract = (boolean_t)buf;
6310                         err = 0;
6311                 }
6312                 break;
6313         case ZONE_ATTR_SCHED_FIXEDHI:
6314                 if (bufsize != sizeof (boolean_t)) {
6315                         err = EINVAL;
6316                 } else {
6317                         zone->zone_fixed_hipri = (boolean_t)buf;
6318                         err = 0;
6319                 }
6320                 break;
6321         default:
6322                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
6323                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
6324                 else
6325                         err = EINVAL;
6326         }
6327 
6328 done:
6329         zone_rele(zone);
6330         ASSERT(err != -1);
6331         return (err != 0 ? set_errno(err) : 0);
6332 }
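
/*
 * Illustrative userland usage -- a sketch only.  Most attributes may only be
 * set before boot (ZONE_IS_READY or earlier), and for ZONE_ATTR_HOSTID the
 * kernel copies the value in and insists that bufsize equal
 * sizeof (zone->zone_hostid) (a 32-bit value in this codebase).  This
 * assumes the private libc wrapper zone_setattr() declared for userland in
 * <sys/zone.h>, and zid as obtained in the earlier sketch:
 *
 *	#include <sys/zone.h>
 *	#include <stdio.h>
 *
 *	uint32_t hostid = 0x00869f1c;	(hypothetical hostid value)
 *
 *	if (zone_setattr(zid, ZONE_ATTR_HOSTID, &hostid,
 *	    sizeof (hostid)) != 0)
 *		perror("zone_setattr");
 */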
6333 
6334 /*
6335  * Return zero if the process has at least one vnode mapped into its
6336  * address space which shouldn't be allowed to change zones.
6337  *
6338  * Also return zero if the process has any shared mappings which reserve
6339  * swap.  This is because the counting for zone.max-swap does not allow swap
6340  * reservation to be shared between zones.  A zone's swap reservation is
6341  * counted against zone->zone_max_swap.
6342  */
6343 static int
6344 as_can_change_zones(void)
6345 {
6346         proc_t *pp = curproc;
6347         struct seg *seg;
6348         struct as *as = pp->p_as;
6349         vnode_t *vp;
6350         int allow = 1;
6351 
6352         ASSERT(pp->p_as != &kas);
6353         AS_LOCK_ENTER(as, RW_READER);
6354         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
6355 
6356                 /*
6357                  * Cannot enter zone with shared anon memory which
6358                  * reserves swap.  See comment above.
6359                  */
6360                 if (seg_can_change_zones(seg) == B_FALSE) {
6361                         allow = 0;
6362                         break;
6363                 }
6364                 /*
6365                  * If we can't get a backing vnode for this segment, skip
6366                  * it.
6367                  */
6368                 vp = NULL;
6369                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
6370                         continue;
6371                 if (!vn_can_change_zones(vp)) { /* bail on first match */
6372                         allow = 0;
6373                         break;
6374                 }
6375         }
6376         AS_LOCK_EXIT(as);
6377         return (allow);
6378 }
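
/*
 * Note for readers: as_can_change_zones() is consumed by zone_enter() below,
 * which maps a zero return (a disqualifying mapping was found) to EFAULT.
 */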
6379 
6380 /*
6381  * Count swap reserved by curproc's address space
6382  */
6383 static size_t
6384 as_swresv(void)
6385 {
6386         proc_t *pp = curproc;
6387         struct seg *seg;
6388         struct as *as = pp->p_as;
6389         size_t swap = 0;
6390 
6391         ASSERT(pp->p_as != &kas);
6392         ASSERT(AS_WRITE_HELD(as));
6393         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
6394                 swap += seg_swresv(seg);
6395 
6396         return (swap);
6397 }
6398 
6399 /*
6400  * System call entry point for zone_enter().
6401  *
6402  * The current process is injected into said zone.  In the process
6403  * it will change its project membership, privileges, rootdir/cwd,
6404  * zone-wide rctls, and pool association to match those of the zone.
6405  *
6406  * The first zone_enter() called while the zone is in the ZONE_IS_READY
6407  * state will transition it to ZONE_IS_RUNNING.  Processes may only
6408  * enter a zone that is "ready" or "running".
6409  */
6410 static int
6411 zone_enter(zoneid_t zoneid)
6412 {
6413         zone_t *zone;
6414         vnode_t *vp;
6415         proc_t *pp = curproc;
6416         contract_t *ct;
6417         cont_process_t *ctp;
6418         task_t *tk, *oldtk;
6419         kproject_t *zone_proj0;
6420         cred_t *cr, *newcr;
6421         pool_t *oldpool, *newpool;
6422         sess_t *sp;
6423         uid_t uid;
6424         zone_status_t status;
6425         int err = 0;
6426         rctl_entity_p_t e;
6427         size_t swap;
6428         kthread_id_t t;
6429 
6430         if (secpolicy_zone_config(CRED()) != 0)
6431                 return (set_errno(EPERM));
6432         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
6433                 return (set_errno(EINVAL));
6434 
6435         /*
6436          * Stop all lwps so we don't need to hold a lock to look at
6437          * curproc->p_zone.  This needs to happen before we grab any
6438          * locks to avoid deadlock (another lwp in the process could
6439          * be waiting for the held lock).
6440          */
6441         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
6442                 return (set_errno(EINTR));
6443 
6444         /*
6445          * Make sure we're not changing zones with files open, or mapped
6446          * into our address space, which shouldn't be changing zones.
6447          */
6448         if (!files_can_change_zones()) {
6449                 err = EBADF;
6450                 goto out;
6451         }
6452         if (!as_can_change_zones()) {
6453                 err = EFAULT;
6454                 goto out;
6455         }
6456 
6457         mutex_enter(&zonehash_lock);
6458         if (pp->p_zone != global_zone) {
6459                 mutex_exit(&zonehash_lock);
6460                 err = EINVAL;
6461                 goto out;
6462         }
6463 
6464         zone = zone_find_all_by_id(zoneid);
6465         if (zone == NULL) {
6466                 mutex_exit(&zonehash_lock);
6467                 err = EINVAL;
6468                 goto out;
6469         }
6470 
6471         /*
6472          * To prevent processes in a zone from holding contracts on
6473          * extrazonal resources, and to avoid process contract
6474          * memberships which span zones, contract holders and processes
6475          * which aren't the sole members of their encapsulating process
6476          * contracts are not allowed to zone_enter.
6477          */
6478         ctp = pp->p_ct_process;
6479         ct = &ctp->conp_contract;
6480         mutex_enter(&ct->ct_lock);
6481         mutex_enter(&pp->p_lock);
6482         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
6483                 mutex_exit(&pp->p_lock);
6484                 mutex_exit(&ct->ct_lock);
6485                 mutex_exit(&zonehash_lock);
6486                 err = EINVAL;
6487                 goto out;
6488         }
6489 
6490         /*
6491          * Moreover, we don't allow processes whose encapsulating
6492          * process contracts have inherited extrazonal contracts.
6493          * While it would be easier to eliminate all process contracts
6494          * with inherited contracts, we need to be able to give a
6495          * restarted init (or other zone-penetrating process) its
6496          * predecessor's contracts.
6497          */
6498         if (ctp->conp_ninherited != 0) {
6499                 contract_t *next;
6500                 for (next = list_head(&ctp->conp_inherited); next;
6501                     next = list_next(&ctp->conp_inherited, next)) {
6502                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
6503                                 mutex_exit(&pp->p_lock);
6504                                 mutex_exit(&ct->ct_lock);
6505                                 mutex_exit(&zonehash_lock);
6506                                 err = EINVAL;
6507                                 goto out;
6508                         }
6509                 }
6510         }
6511 
6512         mutex_exit(&pp->p_lock);
6513         mutex_exit(&ct->ct_lock);
6514 
6515         status = zone_status_get(zone);
6516         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6517                 /*
6518                  * Can't join
6519                  */
6520                 mutex_exit(&zonehash_lock);
6521                 err = EINVAL;
6522                 goto out;
6523         }
6524 
6525         /*
6526          * Make sure new priv set is within the permitted set for caller
6527          */
6528         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6529                 mutex_exit(&zonehash_lock);
6530                 err = EPERM;
6531                 goto out;
6532         }
6533         /*
6534          * We want to momentarily drop zonehash_lock while we optimistically
6535          * bind curproc to the pool it should be running in.  This is safe
6536          * since the zone can't disappear (we have a hold on it).
6537          */
6538         zone_hold(zone);
6539         mutex_exit(&zonehash_lock);
6540 
6541         /*
6542          * Grab pool_lock to keep the pools configuration from changing
6543          * and to stop ourselves from getting rebound to another pool
6544          * until we join the zone.
6545          */
6546         if (pool_lock_intr() != 0) {
6547                 zone_rele(zone);
6548                 err = EINTR;
6549                 goto out;
6550         }
6551         ASSERT(secpolicy_pool(CRED()) == 0);
6552         /*
6553          * Bind ourselves to the pool currently associated with the zone.
6554          */
6555         oldpool = curproc->p_pool;
6556         newpool = zone_pool_get(zone);
6557         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6558             (err = pool_do_bind(newpool, P_PID, P_MYID,
6559             POOL_BIND_ALL)) != 0) {
6560                 pool_unlock();
6561                 zone_rele(zone);
6562                 goto out;
6563         }
6564 
6565         /*
6566          * Grab cpu_lock now; we'll need it later when we call
6567          * task_join().
6568          */
6569         mutex_enter(&cpu_lock);
6570         mutex_enter(&zonehash_lock);
6571         /*
6572          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6573          */
6574         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6575                 /*
6576                  * Can't join anymore.
6577                  */
6578                 mutex_exit(&zonehash_lock);
6579                 mutex_exit(&cpu_lock);
6580                 if (pool_state == POOL_ENABLED &&
6581                     newpool != oldpool)
6582                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6583                             POOL_BIND_ALL);
6584                 pool_unlock();
6585                 zone_rele(zone);
6586                 err = EINVAL;
6587                 goto out;
6588         }
6589 
6590         /*
6591          * a_lock must be held while transferring locked memory and swap
6592          * reservation from the global zone to the non-global zone because
6593          * asynchronous faults on the process's address space can lock
6594          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6595          * segments respectively.
6596          */
6597         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6598         swap = as_swresv();
6599         mutex_enter(&pp->p_lock);
6600         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6601         /* verify that we do not exceed any task or lwp limits */
6602         mutex_enter(&zone->zone_nlwps_lock);
6603         /* add new lwps to zone and zone's proj0 */
6604         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6605         zone->zone_nlwps += pp->p_lwpcnt;
6606         /* add 1 task to zone's proj0 */
6607         zone_proj0->kpj_ntasks += 1;
6608 
6609         zone_proj0->kpj_nprocs++;
6610         zone->zone_nprocs++;
6611         mutex_exit(&zone->zone_nlwps_lock);
6612 
6613         mutex_enter(&zone->zone_mem_lock);
6614         zone->zone_locked_mem += pp->p_locked_mem;
6615         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6616         zone->zone_max_swap += swap;
6617         mutex_exit(&zone->zone_mem_lock);
6618 
6619         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6620         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6621         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6622 
6623         /* remove lwps and process from proc's old zone and old project */
6624         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6625         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6626         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6627         pp->p_task->tk_proj->kpj_nprocs--;
6628         pp->p_zone->zone_nprocs--;
6629         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6630 
6631         mutex_enter(&pp->p_zone->zone_mem_lock);
6632         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6633         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6634         pp->p_zone->zone_max_swap -= swap;
6635         mutex_exit(&pp->p_zone->zone_mem_lock);
6636 
6637         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6638         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6639         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6640 
6641         pp->p_flag |= SZONETOP;
6642         pp->p_zone = zone;
6643         mutex_exit(&pp->p_lock);
6644         AS_LOCK_EXIT(pp->p_as);
6645 
6646         /*
6647          * Joining the zone cannot fail from now on.
6648          *
6649          * This means that a lot of the following code can be commonized and
6650          * shared with zsched().
6651          */
6652 
6653         /*
6654          * If the process contract fmri was inherited, we need to
6655          * flag this so that any contract status will not leak
6656          * extra zone information (svc_fmri in this case).
6657          */
6658         if (ctp->conp_svc_ctid != ct->ct_id) {
6659                 mutex_enter(&ct->ct_lock);
6660                 ctp->conp_svc_zone_enter = ct->ct_id;
6661                 mutex_exit(&ct->ct_lock);
6662         }
6663 
6664         /*
6665          * Reset the encapsulating process contract's zone.
6666          */
6667         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6668         contract_setzuniqid(ct, zone->zone_uniqid);
6669 
6670         /*
6671          * Create a new task and associate the process with the project keyed
6672          * by (projid,zoneid).
6673          *
6674          * We might as well be in project 0; the global zone's projid doesn't
6675          * make much sense in a zone anyhow.
6676          *
6677          * This also increments zone_ntasks, and returns with p_lock held.
6678          */
6679         tk = task_create(0, zone);
6680         oldtk = task_join(tk, 0);
6681         mutex_exit(&cpu_lock);
6682 
6683         /*
6684          * call RCTLOP_SET functions on this proc
6685          */
6686         e.rcep_p.zone = zone;
6687         e.rcep_t = RCENTITY_ZONE;
6688         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6689             RCD_CALLBACK);
6690         mutex_exit(&pp->p_lock);
6691 
6692         /*
6693          * We don't need to hold any of zsched's locks here; not only do we know
6694          * the process and zone aren't going away, we know its session isn't
6695          * changing either.
6696          *
6697          * By joining zsched's session here, we mimic the behavior in the
6698          * global zone of init's sid being the pid of sched.  We extend this
6699          * to all zlogin-like zone_enter()'ing processes as well.
6700          */
6701         mutex_enter(&pidlock);
6702         sp = zone->zone_zsched->p_sessp;
6703         sess_hold(zone->zone_zsched);
6704         mutex_enter(&pp->p_lock);
6705         pgexit(pp);
6706         sess_rele(pp->p_sessp, B_TRUE);
6707         pp->p_sessp = sp;
6708         pgjoin(pp, zone->zone_zsched->p_pidp);
6709 
6710         /*
6711          * If any threads are scheduled to be placed on the zone wait queue,
6712          * they should abandon the idea since the wait queue is changing.
6713          * We need to be holding pidlock & p_lock to do this.
6714          */
6715         if ((t = pp->p_tlist) != NULL) {
6716                 do {
6717                         thread_lock(t);
6718                         /*
6719                          * Kick this thread so that it doesn't sit
6720                          * on the wrong wait queue.
6721                          */
6722                         if (ISWAITING(t))
6723                                 setrun_locked(t);
6724 
6725                         if (t->t_schedflag & TS_ANYWAITQ)
6726                                 t->t_schedflag &= ~ TS_ANYWAITQ;
6727 
6728                         thread_unlock(t);
6729                 } while ((t = t->t_forw) != pp->p_tlist);
6730         }
6731 
6732         /*
6733          * If there is a default scheduling class for the zone and it is not
6734          * the class we are currently in, change all of the threads in the
6735          * process to the new class.  We need to be holding pidlock & p_lock
6736          * when we call parmsset so this is a good place to do it.
6737          */
6738         if (zone->zone_defaultcid > 0 &&
6739             zone->zone_defaultcid != curthread->t_cid) {
6740                 pcparms_t pcparms;
6741 
6742                 pcparms.pc_cid = zone->zone_defaultcid;
6743                 pcparms.pc_clparms[0] = 0;
6744 
6745                 /*
6746                  * If setting the class fails, we still want to enter the zone.
6747                  */
6748                 if ((t = pp->p_tlist) != NULL) {
6749                         do {
6750                                 (void) parmsset(&pcparms, t);
6751                         } while ((t = t->t_forw) != pp->p_tlist);
6752                 }
6753         }
6754 
6755         mutex_exit(&pp->p_lock);
6756         mutex_exit(&pidlock);
6757 
6758         mutex_exit(&zonehash_lock);
6759         /*
6760          * We're firmly in the zone; let pools progress.
6761          */
6762         pool_unlock();
6763         task_rele(oldtk);
6764         /*
6765          * We don't need to retain a hold on the zone since we already
6766          * incremented zone_ntasks, so the zone isn't going anywhere.
6767          */
6768         zone_rele(zone);
6769 
6770         /*
6771          * Chroot
6772          */
6773         vp = zone->zone_rootvp;
6774         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6775         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6776 
6777         /*
6778          * Change process credentials
6779          */
6780         newcr = cralloc();
6781         mutex_enter(&pp->p_crlock);
6782         cr = pp->p_cred;
6783         crcopy_to(cr, newcr);
6784         crsetzone(newcr, zone);
6785         pp->p_cred = newcr;
6786 
6787         /*
6788          * Restrict all process privilege sets to zone limit
6789          */
6790         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6791         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6792         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6793         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6794         mutex_exit(&pp->p_crlock);
6795         crset(pp, newcr);
6796 
6797         /*
6798          * Adjust upcount to reflect zone entry.
6799          */
6800         uid = crgetruid(newcr);
6801         mutex_enter(&pidlock);
6802         upcount_dec(uid, GLOBAL_ZONEID);
6803         upcount_inc(uid, zoneid);
6804         mutex_exit(&pidlock);
6805 
6806         /*
6807          * Set up core file path and content.
6808          */
6809         set_core_defaults();
6810 
6811 out:
6812         /*
6813          * Let the other lwps continue.
6814          */
6815         mutex_enter(&pp->p_lock);
6816         if (curthread != pp->p_agenttp)
6817                 continuelwps(pp);
6818         mutex_exit(&pp->p_lock);
6819 
6820         return (err != 0 ? set_errno(err) : 0);
6821 }
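
/*
 * A descriptive note on the locking above: zone_enter() acquires pool_lock
 * before cpu_lock, cpu_lock before zonehash_lock, and zonehash_lock before
 * p_lock and pidlock.  The address-space lock is write-held across the
 * resource-accounting handoff so that asynchronous faults cannot change the
 * process's locked-memory or swap charges mid-transfer.
 */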
6822 
6823 /*
6824  * System call entry point for zone_list(2).
6825  *
6826  * Processes running in a (non-global) zone only see themselves.
6827  * On labeled systems, they see all zones whose label they dominate.
6828  */
6829 static int
6830 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6831 {
6832         zoneid_t *zoneids;
6833         zone_t *zone, *myzone;
6834         uint_t user_nzones, real_nzones;
6835         uint_t domi_nzones;
6836         int error;
6837 
6838         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6839                 return (set_errno(EFAULT));
6840 
6841         myzone = curproc->p_zone;
6842         if (myzone != global_zone) {
6843                 bslabel_t *mybslab;
6844 
6845                 if (!is_system_labeled()) {
6846                         /* just return current zone */
6847                         real_nzones = domi_nzones = 1;
6848                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6849                         zoneids[0] = myzone->zone_id;
6850                 } else {
6851                         /* return all zones that are dominated */
6852                         mutex_enter(&zonehash_lock);
6853                         real_nzones = zonecount;
6854                         domi_nzones = 0;
6855                         if (real_nzones > 0) {
6856                                 zoneids = kmem_alloc(real_nzones *
6857                                     sizeof (zoneid_t), KM_SLEEP);
6858                                 mybslab = label2bslabel(myzone->zone_slabel);
6859                                 for (zone = list_head(&zone_active);
6860                                     zone != NULL;
6861                                     zone = list_next(&zone_active, zone)) {
6862                                         if (zone->zone_id == GLOBAL_ZONEID)
6863                                                 continue;
6864                                         if (zone != myzone &&
6865                                             (zone->zone_flags & ZF_IS_SCRATCH))
6866                                                 continue;
6867                                         /*
6868                                          * Note that a label always dominates
6869                                          * itself, so myzone is always included
6870                                          * in the list.
6871                                          */
6872                                         if (bldominates(mybslab,
6873                                             label2bslabel(zone->zone_slabel))) {
6874                                                 zoneids[domi_nzones++] =
6875                                                     zone->zone_id;
6876                                         }
6877                                 }
6878                         }
6879                         mutex_exit(&zonehash_lock);
6880                 }
6881         } else {
6882                 mutex_enter(&zonehash_lock);
6883                 real_nzones = zonecount;
6884                 domi_nzones = 0;
6885                 if (real_nzones > 0) {
6886                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6887                             KM_SLEEP);
6888                         for (zone = list_head(&zone_active); zone != NULL;
6889                             zone = list_next(&zone_active, zone))
6890                                 zoneids[domi_nzones++] = zone->zone_id;
6891                         ASSERT(domi_nzones == real_nzones);
6892                 }
6893                 mutex_exit(&zonehash_lock);
6894         }
6895 
6896         /*
6897          * If the user has allocated space for fewer entries than we found,
6898          * then return only up to that limit.  Either way, tell the caller
6899          * exactly how many we found.
6900          */
6901         if (domi_nzones < user_nzones)
6902                 user_nzones = domi_nzones;
6903         error = 0;
6904         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6905                 error = EFAULT;
6906         } else if (zoneidlist != NULL && user_nzones != 0) {
6907                 if (copyout(zoneids, zoneidlist,
6908                     user_nzones * sizeof (zoneid_t)) != 0)
6909                         error = EFAULT;
6910         }
6911 
6912         if (real_nzones > 0)
6913                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6914 
6915         if (error != 0)
6916                 return (set_errno(error));
6917         else
6918                 return (0);
6919 }
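
/*
 * Illustrative userland usage -- a sketch only, assuming the private libc
 * wrapper zone_list() declared for userland in <sys/zone.h>.  As with
 * zone_getattr(), the caller can learn the real count first and then retry
 * with a buffer (the count may change between calls, so real consumers
 * loop):
 *
 *	#include <sys/zone.h>
 *	#include <stdlib.h>
 *
 *	uint_t nzones = 0;
 *	zoneid_t *ids = NULL;
 *
 *	if (zone_list(NULL, &nzones) == 0 && nzones > 0) {
 *		ids = calloc(nzones, sizeof (zoneid_t));
 *		if (ids != NULL && zone_list(ids, &nzones) != 0) {
 *			free(ids);
 *			ids = NULL;
 *		}
 *	}
 */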
6920 
6921 /*
6922  * System call entry point for zone_lookup(2).
6923  *
6924  * Non-global zones are only able to see themselves and (on labeled systems)
6925  * the zones they dominate.
6926  */
6927 static zoneid_t
6928 zone_lookup(const char *zone_name)
6929 {
6930         char *kname;
6931         zone_t *zone;
6932         zoneid_t zoneid;
6933         int err;
6934 
6935         if (zone_name == NULL) {
6936                 /* return caller's zone id */
6937                 return (getzoneid());
6938         }
6939 
6940         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6941         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6942                 kmem_free(kname, ZONENAME_MAX);
6943                 return (set_errno(err));
6944         }
6945 
6946         mutex_enter(&zonehash_lock);
6947         zone = zone_find_all_by_name(kname);
6948         kmem_free(kname, ZONENAME_MAX);
6949         /*
6950          * In a non-global zone, only the global and the zone's own name can
6951          * be looked up.  In Trusted Extensions, zone label dominance rules apply.
6952          */
6953         if (zone == NULL ||
6954             zone_status_get(zone) < ZONE_IS_READY ||
6955             !zone_list_access(zone)) {
6956                 mutex_exit(&zonehash_lock);
6957                 return (set_errno(EINVAL));
6958         } else {
6959                 zoneid = zone->zone_id;
6960                 mutex_exit(&zonehash_lock);
6961                 return (zoneid);
6962         }
6963 }
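
/*
 * Illustrative userland usage -- a sketch only.  getzoneidbyname(3C) is the
 * public libc interface corresponding to this lookup (and getzoneid(3C) to
 * the NULL-name case):
 *
 *	#include <zone.h>
 *
 *	zoneid_t zid = getzoneidbyname("myzone");	(hypothetical name)
 *
 *	if (zid == -1)
 *		... no such zone, not visible to us, or not yet ready ...
 */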
6964 
6965 static int
6966 zone_version(int *version_arg)
6967 {
6968         int version = ZONE_SYSCALL_API_VERSION;
6969 
6970         if (copyout(&version, version_arg, sizeof (int)) != 0)
6971                 return (set_errno(EFAULT));
6972         return (0);
6973 }
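
/*
 * Userland can gate feature use on the syscall API version -- a sketch,
 * assuming the private zone_version() wrapper and a hypothetical minimum
 * version of 5:
 *
 *	int v;
 *
 *	if (zone_version(&v) == 0 && v >= 5)
 *		... newer attributes are safe to use ...
 */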
6974 
6975 /* ARGSUSED */
6976 long
6977 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6978 {
6979         zone_def zs;
6980         int err;
6981 
6982         switch (cmd) {
6983         case ZONE_CREATE:
6984                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6985                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6986                                 return (set_errno(EFAULT));
6987                         }
6988                 } else {
6989 #ifdef _SYSCALL32_IMPL
6990                         zone_def32 zs32;
6991 
6992                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6993                                 return (set_errno(EFAULT));
6994                         }
6995                         zs.zone_name =
6996                             (const char *)(unsigned long)zs32.zone_name;
6997                         zs.zone_root =
6998                             (const char *)(unsigned long)zs32.zone_root;
6999                         zs.zone_privs =
7000                             (const struct priv_set *)
7001                             (unsigned long)zs32.zone_privs;
7002                         zs.zone_privssz = zs32.zone_privssz;
7003                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
7004                         zs.rctlbufsz = zs32.rctlbufsz;
7005                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
7006                         zs.zfsbufsz = zs32.zfsbufsz;
7007                         zs.extended_error =
7008                             (int *)(unsigned long)zs32.extended_error;
7009                         zs.match = zs32.match;
7010                         zs.doi = zs32.doi;
7011                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
7012                         zs.flags = zs32.flags;
7013                         zs.zoneid = zs32.zoneid;
7014 #else
7015                         panic("get_udatamodel() returned bogus result\n");
7016 #endif
7017                 }
7018 
7019                 return (zone_create(zs.zone_name, zs.zone_root,
7020                     zs.zone_privs, zs.zone_privssz,
7021                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
7022                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
7023                     zs.extended_error, zs.match, zs.doi,
7024                     zs.label, zs.flags, zs.zoneid));
7025         case ZONE_BOOT:
7026                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
7027         case ZONE_DESTROY:
7028                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
7029         case ZONE_GETATTR:
7030                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
7031                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7032         case ZONE_SETATTR:
7033                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
7034                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7035         case ZONE_ENTER:
7036                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
7037         case ZONE_LIST:
7038                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
7039         case ZONE_SHUTDOWN:
7040                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
7041         case ZONE_LOOKUP:
7042                 return (zone_lookup((const char *)arg1));
7043         case ZONE_VERSION:
7044                 return (zone_version((int *)arg1));
7045         case ZONE_ADD_DATALINK:
7046                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
7047                     (datalink_id_t)(uintptr_t)arg2));
7048         case ZONE_DEL_DATALINK:
7049                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
7050                     (datalink_id_t)(uintptr_t)arg2));
7051         case ZONE_CHECK_DATALINK: {
7052                 zoneid_t        zoneid;
7053                 boolean_t       need_copyout;
7054 
7055                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
7056                         return (set_errno(EFAULT));
7057                 need_copyout = (zoneid == ALL_ZONES);
7058                 err = zone_check_datalink(&zoneid,
7059                     (datalink_id_t)(uintptr_t)arg2);
7060                 if (err == 0 && need_copyout) {
7061                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
7062                                 err = EFAULT;
7063                 }
7064                 return (err == 0 ? 0 : set_errno(err));
7065         }
7066         case ZONE_LIST_DATALINK:
7067                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
7068                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
7069         default:
7070                 return (set_errno(EINVAL));
7071         }
7072 }
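
/*
 * For reference, the userland wrappers all funnel through the single entry
 * point above.  A hypothetical sketch of one such wrapper (not the libc
 * implementation itself):
 *
 *	#include <sys/syscall.h>
 *	#include <sys/zone.h>
 *	#include <unistd.h>
 *
 *	int
 *	my_zone_boot(zoneid_t zoneid)
 *	{
 *		return (syscall(SYS_zone, ZONE_BOOT, zoneid));
 *	}
 */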
7073 
7074 struct zarg {
7075         zone_t *zone;
7076         zone_cmd_arg_t arg;
7077 };
7078 
7079 static int
7080 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
7081 {
7082         char *buf;
7083         size_t buflen;
7084         int error;
7085 
7086         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
7087         buf = kmem_alloc(buflen, KM_SLEEP);
7088         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
7089         error = door_ki_open(buf, doorp);
7090         kmem_free(buf, buflen);
7091         return (error);
7092 }
7093 
7094 static void
7095 zone_release_door(door_handle_t *doorp)
7096 {
7097         door_ki_rele(*doorp);
7098         *doorp = NULL;
7099 }
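
/*
 * Usage sketch for the pair above, mirroring zone_ki_call_zoneadmd() below:
 * the door is looked up by zone name on demand, and the handle is released
 * (and possibly looked up again) when an upcall fails with EBADF:
 *
 *	door_handle_t door = NULL;
 *
 *	if (zone_lookup_door(zone_name, &door) == 0) {
 *		(make door_ki_upcall_limited() calls with door)
 *		zone_release_door(&door);	(door is now NULL)
 *	}
 */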
7100 
7101 static void
7102 zone_ki_call_zoneadmd(struct zarg *zargp)
7103 {
7104         door_handle_t door = NULL;
7105         door_arg_t darg, save_arg;
7106         char *zone_name;
7107         size_t zone_namelen;
7108         zoneid_t zoneid;
7109         zone_t *zone;
7110         zone_cmd_arg_t arg;
7111         uint64_t uniqid;
7112         size_t size;
7113         int error;
7114         int retry;
7115 
7116         zone = zargp->zone;
7117         arg = zargp->arg;
7118         kmem_free(zargp, sizeof (*zargp));
7119 
7120         zone_namelen = strlen(zone->zone_name) + 1;
7121         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
7122         bcopy(zone->zone_name, zone_name, zone_namelen);
7123         zoneid = zone->zone_id;
7124         uniqid = zone->zone_uniqid;
7125         arg.status = zone->zone_init_status;
7126         /*
7127          * zoneadmd may be down, but at least we can empty out the zone.
7128          * We can ignore the return value of zone_empty() since we're called
7129          * from a kernel thread and know we won't be delivered any signals.
7130          */
7131         ASSERT(curproc == &p0);
7132         (void) zone_empty(zone);
7133         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
7134         zone_rele(zone);
7135 
7136         size = sizeof (arg);
7137         darg.rbuf = (char *)&arg;
7138         darg.data_ptr = (char *)&arg;
7139         darg.rsize = size;
7140         darg.data_size = size;
7141         darg.desc_ptr = NULL;
7142         darg.desc_num = 0;
7143 
7144         save_arg = darg;
7145         /*
7146          * Since we're not holding a reference to the zone, any number of
7147          * things can go wrong, including the zone disappearing before we get a
7148          * chance to talk to zoneadmd.
7149          */
7150         for (retry = 0; /* forever */; retry++) {
7151                 if (door == NULL &&
7152                     (error = zone_lookup_door(zone_name, &door)) != 0) {
7153                         goto next;
7154                 }
7155                 ASSERT(door != NULL);
7156 
7157                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
7158                     SIZE_MAX, 0)) == 0) {
7159                         break;
7160                 }
7161                 switch (error) {
7162                 case EINTR:
7163                         /* FALLTHROUGH */
7164                 case EAGAIN:    /* process may be forking */
7165                         /*
7166                          * Back off for a bit
7167                          */
7168                         break;
7169                 case EBADF:
7170                         zone_release_door(&door);
7171                         if (zone_lookup_door(zone_name, &door) != 0) {
7172                                 /*
7173                                  * zoneadmd may be dead, but it may come back to
7174                                  * life later.
7175                                  */
7176                                 break;
7177                         }
7178                         break;
7179                 default:
7180                         cmn_err(CE_WARN,
7181                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
7182                             error);
7183                         goto out;
7184                 }
7185 next:
7186                 /*
7187                  * If this isn't the same zone_t that we originally had in mind,
7188                  * then this is the same as if two kadmin requests come in at
7189                  * the same time: the first one wins.  This means we lose, so we
7190                  * bail.
7191                  */
7192                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
7193                         /*
7194                          * Problem is solved.
7195                          */
7196                         break;
7197                 }
7198                 if (zone->zone_uniqid != uniqid) {
7199                         /*
7200                          * zoneid recycled
7201                          */
7202                         zone_rele(zone);
7203                         break;
7204                 }
7205                 /*
7206                  * We could zone_status_timedwait(), but there doesn't seem to
7207                  * be much point in doing that (plus, it would mean that
7208                  * zone_free() isn't called until this thread exits).
7209                  */
7210                 zone_rele(zone);
7211                 delay(hz);
7212                 darg = save_arg;
7213         }
7214 out:
7215         if (door != NULL) {
7216                 zone_release_door(&door);
7217         }
7218         kmem_free(zone_name, zone_namelen);
7219         thread_exit();
7220 }
7221 
7222 /*
7223  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
7224  * kadmin().  The caller is a process in the zone.
7225  *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job of
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
7232  */
7233 int
7234 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
7235 {
7236         struct zarg *zargp;
7237         zone_cmd_t zcmd;
7238         zone_t *zone;
7239 
7240         zone = curproc->p_zone;
7241         ASSERT(getzoneid() != GLOBAL_ZONEID);
7242 
7243         switch (cmd) {
7244         case A_SHUTDOWN:
7245                 switch (fcn) {
7246                 case AD_HALT:
7247                 case AD_POWEROFF:
7248                         zcmd = Z_HALT;
7249                         break;
7250                 case AD_BOOT:
7251                         zcmd = Z_REBOOT;
7252                         break;
7253                 case AD_IBOOT:
7254                 case AD_SBOOT:
7255                 case AD_SIBOOT:
7256                 case AD_NOSYNC:
7257                         return (ENOTSUP);
7258                 default:
7259                         return (EINVAL);
7260                 }
7261                 break;
7262         case A_REBOOT:
7263                 zcmd = Z_REBOOT;
7264                 break;
7265         case A_FTRACE:
7266         case A_REMOUNT:
7267         case A_FREEZE:
7268         case A_DUMP:
7269         case A_CONFIG:
7270                 return (ENOTSUP);
7271         default:
7272                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
7273                 return (EINVAL);
7274         }
7275 
7276         if (secpolicy_zone_admin(credp, B_FALSE))
7277                 return (EPERM);
7278         mutex_enter(&zone_status_lock);
7279 
7280         /*
7281          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
7282          * is in the zone.
7283          */
7284         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7285         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7286                 /*
7287                  * This zone is already on its way down.
7288                  */
7289                 mutex_exit(&zone_status_lock);
7290                 return (0);
7291         }
7292         /*
7293          * Prevent future zone_enter()s
7294          */
7295         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7296         mutex_exit(&zone_status_lock);
7297 
7298         /*
7299          * Kill everyone now and call zoneadmd later.
7300          * zone_ki_call_zoneadmd() will do a more thorough job of this
7301          * later.
7302          */
7303         killall(zone->zone_id, B_FALSE);
7304         /*
7305          * Now, create the thread to contact zoneadmd and do the rest of the
7306          * work.  This thread can't be created in our zone otherwise
7307          * zone_destroy() would deadlock.
7308          */
7309         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7310         zargp->arg.cmd = zcmd;
7311         zargp->arg.uniqid = zone->zone_uniqid;
7312         zargp->zone = zone;
7313         (void) strcpy(zargp->arg.locale, "C");
7314         /* mdep was already copied in for us by uadmin */
7315         if (mdep != NULL)
7316                 (void) strlcpy(zargp->arg.bootbuf, mdep,
7317                     sizeof (zargp->arg.bootbuf));
7318         zone_hold(zone);
7319 
7320         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7321             TS_RUN, minclsyspri);
7322         exit(CLD_EXITED, 0);
7323 
7324         return (EINVAL);
7325 }
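
/*
 * For example, a suitably privileged process inside a zone can request a
 * zone reboot through the standard uadmin(2) interface (an illustrative
 * sketch; error handling omitted):
 *
 *	#include <sys/uadmin.h>
 *
 *	(void) uadmin(A_REBOOT, AD_BOOT, NULL);
 *
 * This lands in zone_kadmin() above and is mapped to the Z_REBOOT door
 * command for zoneadmd.
 */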
7326 
7327 /*
7328  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
7329  * status to ZONE_IS_SHUTTING_DOWN.
7330  *
7331  * This function also shuts down all running zones to ensure that they won't
7332  * fork new processes.
7333  */
7334 void
7335 zone_shutdown_global(void)
7336 {
7337         zone_t *current_zonep;
7338 
7339         ASSERT(INGLOBALZONE(curproc));
7340         mutex_enter(&zonehash_lock);
7341         mutex_enter(&zone_status_lock);
7342 
7343         /* Modify the global zone's status first. */
7344         ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
7345         zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
7346 
7347         /*
7348          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
7349          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
7350          * could cause assertions to fail (e.g., assertions about a zone's
7351          * state during initialization, readying, or booting) or produce races.
7352          * We'll let threads continue to initialize and ready new zones: they'll
7353          * fail to boot the new zones when they see that the global zone is
7354          * shutting down.
7355          */
7356         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
7357             current_zonep = list_next(&zone_active, current_zonep)) {
7358                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
7359                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
7360         }
7361         mutex_exit(&zone_status_lock);
7362         mutex_exit(&zonehash_lock);
7363 }
7364 
/*
 * Returns true if the named dataset is visible in the specified zone.
 * If 'write' is non-NULL, *write is set to 1 if the dataset is also
 * writable, and to 0 if it is visible but read-only.
 */
7369 int
7370 zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
7371 {
7372         static int zfstype = -1;
7373         zone_dataset_t *zd;
7374         size_t len;
7375         const char *name = NULL;
7376         vfs_t *vfsp = NULL;
7377 
7378         if (dataset[0] == '\0')
7379                 return (0);
7380 
7381         /*
7382          * Walk the list once, looking for datasets which match exactly, or
7383          * specify a dataset underneath an exported dataset.  If found, return
7384          * true and note that it is writable.
7385          */
7386         for (zd = list_head(&zone->zone_datasets); zd != NULL;
7387             zd = list_next(&zone->zone_datasets, zd)) {
7388 
7389                 len = strlen(zd->zd_dataset);
7390                 if (strlen(dataset) >= len &&
7391                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
7392                     (dataset[len] == '\0' || dataset[len] == '/' ||
7393                     dataset[len] == '@')) {
7394                         if (write)
7395                                 *write = 1;
7396                         return (1);
7397                 }
7398         }
7399 
7400         /*
7401          * Walk the list a second time, searching for datasets which are parents
7402          * of exported datasets.  These should be visible, but read-only.
7403          *
7404          * Note that we also have to support forms such as 'pool/dataset/', with
7405          * a trailing slash.
7406          */
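	/*
	 * For example, if "rpool/zones/z1" is delegated to the zone, then
	 * "rpool/zones/z1" and "rpool/zones/z1/data" match the first pass
	 * above (visible and writable), while "rpool" and "rpool/zones"
	 * match this pass (visible but read-only).
	 */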
7407         for (zd = list_head(&zone->zone_datasets); zd != NULL;
7408             zd = list_next(&zone->zone_datasets, zd)) {
7409 
7410                 len = strlen(dataset);
7411                 if (dataset[len - 1] == '/')
7412                         len--;  /* Ignore trailing slash */
7413                 if (len < strlen(zd->zd_dataset) &&
7414                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
7415                     zd->zd_dataset[len] == '/') {
7416                         if (write)
7417                                 *write = 0;
7418                         return (1);
7419                 }
7420         }
7421 
	/*
	 * We reach here if the given dataset is not found in the zone_dataset
	 * list.  Check if this dataset was added as a filesystem (i.e.,
	 * "add fs") instead of by delegation.  For this we search for the
	 * dataset in the zone_vfslist of this zone.  If found, return true
	 * and note that it is not writable.
	 */
7429 
	/*
	 * Initialize zfstype (the index of the "zfs" entry in the vfssw
	 * table) if it is not initialized yet.
	 */
7433         if (zfstype == -1) {
7434                 struct vfssw *vswp = vfs_getvfssw("zfs");
7435                 zfstype = vswp - vfssw;
7436                 vfs_unrefvfssw(vswp);
7437         }
7438 
7439         vfs_list_read_lock();
7440         vfsp = zone->zone_vfslist;
7441         do {
7442                 if (vfsp == NULL)
7443                         break;
7444                 if (vfsp->vfs_fstype == zfstype) {
7445                         name = refstr_value(vfsp->vfs_resource);
7446 
7447                         /*
7448                          * Check if we have an exact match.
7449                          */
7450                         if (strcmp(dataset, name) == 0) {
7451                                 vfs_list_unlock();
7452                                 if (write)
7453                                         *write = 0;
7454                                 return (1);
7455                         }
7456                         /*
7457                          * We need to check if we are looking for parents of
7458                          * a dataset. These should be visible, but read-only.
7459                          */
7460                         len = strlen(dataset);
7461                         if (dataset[len - 1] == '/')
7462                                 len--;
7463 
7464                         if (len < strlen(name) &&
7465                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
7466                                 vfs_list_unlock();
7467                                 if (write)
7468                                         *write = 0;
7469                                 return (1);
7470                         }
7471                 }
7472                 vfsp = vfsp->vfs_zone_next;
7473         } while (vfsp != zone->zone_vfslist);
7474 
7475         vfs_list_unlock();
7476         return (0);
7477 }
7478 
/*
 * Returns true if the named dataset is visible in the current zone.
 * If 'write' is non-NULL, *write is set to 1 if the dataset is also
 * writable, and to 0 if it is visible but read-only.
 */
7483 int
7484 zone_dataset_visible(const char *dataset, int *write)
7485 {
7486         zone_t *zone = curproc->p_zone;
7487 
7488         return (zone_dataset_visible_inzone(zone, dataset, write));
7489 }
7490 
7491 /*
7492  * zone_find_by_any_path() -
7493  *
 * kernel-private routine similar to zone_find_by_path(), but which
 * effectively compares against zone paths rather than zonerootpaths
 * (i.e., the last component of zonerootpaths, which should be "root/",
 * is not compared).  This is done in order to accurately identify all
 * paths, whether zone-visible or not, including those which are parallel
 * to /root/, such as /dev/, /home/, etc...
 *
 * If the specified path does not fall under any zone path then the
 * global zone is returned.
7503  *
7504  * The treat_abs parameter indicates whether the path should be treated as
7505  * an absolute path although it does not begin with "/".  (This supports
7506  * nfs mount syntax such as host:any/path.)
7507  *
7508  * The caller is responsible for zone_rele of the returned zone.
7509  */
7510 zone_t *
7511 zone_find_by_any_path(const char *path, boolean_t treat_abs)
7512 {
7513         zone_t *zone;
7514         int path_offset = 0;
7515 
7516         if (path == NULL) {
7517                 zone_hold(global_zone);
7518                 return (global_zone);
7519         }
7520 
7521         if (*path != '/') {
7522                 ASSERT(treat_abs);
7523                 path_offset = 1;
7524         }
7525 
7526         mutex_enter(&zonehash_lock);
7527         for (zone = list_head(&zone_active); zone != NULL;
7528             zone = list_next(&zone_active, zone)) {
7529                 char    *c;
7530                 size_t  pathlen;
7531                 char *rootpath_start;
7532 
7533                 if (zone == global_zone)        /* skip global zone */
7534                         continue;
7535 
7536                 /* scan backwards to find start of last component */
7537                 c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7538                 do {
7539                         c--;
7540                 } while (*c != '/');
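		/*
		 * E.g., a zone_rootpath of "/zones/z1/root/" leaves c at the
		 * '/' that precedes "root/", so the prefix compared below is
		 * "/zones/z1/".
		 */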
7541 
7542                 pathlen = c - zone->zone_rootpath + 1 - path_offset;
7543                 rootpath_start = (zone->zone_rootpath + path_offset);
7544                 if (strncmp(path, rootpath_start, pathlen) == 0)
7545                         break;
7546         }
7547         if (zone == NULL)
7548                 zone = global_zone;
7549         zone_hold(zone);
7550         mutex_exit(&zonehash_lock);
7551         return (zone);
7552 }
7553 
7554 /*
7555  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7556  * zone_dl_t pointer if found, and NULL otherwise.
7557  */
7558 static zone_dl_t *
7559 zone_find_dl(zone_t *zone, datalink_id_t linkid)
7560 {
7561         zone_dl_t *zdl;
7562 
7563         ASSERT(mutex_owned(&zone->zone_lock));
7564         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7565             zdl = list_next(&zone->zone_dl_list, zdl)) {
7566                 if (zdl->zdl_id == linkid)
7567                         break;
7568         }
7569         return (zdl);
7570 }
7571 
7572 static boolean_t
7573 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7574 {
7575         boolean_t exists;
7576 
7577         mutex_enter(&zone->zone_lock);
7578         exists = (zone_find_dl(zone, linkid) != NULL);
7579         mutex_exit(&zone->zone_lock);
7580         return (exists);
7581 }
7582 
/*
 * Add a datalink name to the zone.
 */
7586 static int
7587 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7588 {
7589         zone_dl_t *zdl;
7590         zone_t *zone;
7591         zone_t *thiszone;
7592 
7593         if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7594                 return (set_errno(ENXIO));
7595 
7596         /* Verify that the datalink ID doesn't already belong to a zone. */
7597         mutex_enter(&zonehash_lock);
7598         for (zone = list_head(&zone_active); zone != NULL;
7599             zone = list_next(&zone_active, zone)) {
7600                 if (zone_dl_exists(zone, linkid)) {
7601                         mutex_exit(&zonehash_lock);
7602                         zone_rele(thiszone);
7603                         return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7604                 }
7605         }
7606 
7607         zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7608         zdl->zdl_id = linkid;
7609         zdl->zdl_net = NULL;
7610         mutex_enter(&thiszone->zone_lock);
7611         list_insert_head(&thiszone->zone_dl_list, zdl);
7612         mutex_exit(&thiszone->zone_lock);
7613         mutex_exit(&zonehash_lock);
7614         zone_rele(thiszone);
7615         return (0);
7616 }
7617 
7618 static int
7619 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7620 {
7621         zone_dl_t *zdl;
7622         zone_t *zone;
7623         int err = 0;
7624 
7625         if ((zone = zone_find_by_id(zoneid)) == NULL)
7626                 return (set_errno(EINVAL));
7627 
7628         mutex_enter(&zone->zone_lock);
7629         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7630                 err = ENXIO;
7631         } else {
7632                 list_remove(&zone->zone_dl_list, zdl);
7633                 nvlist_free(zdl->zdl_net);
7634                 kmem_free(zdl, sizeof (zone_dl_t));
7635         }
7636         mutex_exit(&zone->zone_lock);
7637         zone_rele(zone);
7638         return (err == 0 ? 0 : set_errno(err));
7639 }
7640 
/*
 * If *zoneidp is ALL_ZONES, look up which zone has been assigned the
 * linkid and return that zone's ID in *zoneidp.  Otherwise just check
 * whether the zone named by *zoneidp has been assigned the supplied
 * linkid.
 */
7646 int
7647 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7648 {
7649         zone_t *zone;
7650         int err = ENXIO;
7651 
7652         if (*zoneidp != ALL_ZONES) {
7653                 if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7654                         if (zone_dl_exists(zone, linkid))
7655                                 err = 0;
7656                         zone_rele(zone);
7657                 }
7658                 return (err);
7659         }
7660 
7661         mutex_enter(&zonehash_lock);
7662         for (zone = list_head(&zone_active); zone != NULL;
7663             zone = list_next(&zone_active, zone)) {
7664                 if (zone_dl_exists(zone, linkid)) {
7665                         *zoneidp = zone->zone_id;
7666                         err = 0;
7667                         break;
7668                 }
7669         }
7670         mutex_exit(&zonehash_lock);
7671         return (err);
7672 }
7673 
7674 /*
7675  * Get the list of datalink IDs assigned to a zone.
7676  *
7677  * On input, *nump is the number of datalink IDs that can fit in the supplied
7678  * idarray.  Upon return, *nump is either set to the number of datalink IDs
7679  * that were placed in the array if the array was large enough, or to the
7680  * number of datalink IDs that the function needs to place in the array if the
7681  * array is too small.
7682  */
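/*
 * A hypothetical user-level consumer of the ZONE_LIST_DATALINK subcommand
 * (see the zone() syscall dispatch above) would typically call twice:
 * once to learn the count, then again with a large-enough array:
 *
 *	int num = 0;
 *	datalink_id_t *ids;
 *
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zoneid, &num, NULL);
 *	ids = malloc(num * sizeof (datalink_id_t));
 *	(void) syscall(SYS_zone, ZONE_LIST_DATALINK, zoneid, &num, ids);
 */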
7683 static int
7684 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7685 {
7686         uint_t num, dlcount;
7687         zone_t *zone;
7688         zone_dl_t *zdl;
7689         datalink_id_t *idptr = idarray;
7690 
7691         if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7692                 return (set_errno(EFAULT));
7693         if ((zone = zone_find_by_id(zoneid)) == NULL)
7694                 return (set_errno(ENXIO));
7695 
7696         num = 0;
7697         mutex_enter(&zone->zone_lock);
7698         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7699             zdl = list_next(&zone->zone_dl_list, zdl)) {
7700                 /*
7701                  * If the list is bigger than what the caller supplied, just
7702                  * count, don't do copyout.
7703                  */
7704                 if (++num > dlcount)
7705                         continue;
7706                 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7707                         mutex_exit(&zone->zone_lock);
7708                         zone_rele(zone);
7709                         return (set_errno(EFAULT));
7710                 }
7711                 idptr++;
7712         }
7713         mutex_exit(&zone->zone_lock);
7714         zone_rele(zone);
7715 
	/* If the count differs from what the caller supplied, notify it. */
7717         if (num != dlcount) {
7718                 if (copyout(&num, nump, sizeof (num)) != 0)
7719                         return (set_errno(EFAULT));
7720         }
7721         return (0);
7722 }
7723 
/*
 * Public interface for looking up a zone by zoneid.  This is a customized
 * version for netstack_zone_create(), and may only be called from the ZSD
 * create callbacks: it does not take a reference on the zone structure,
 * so if it were called elsewhere the zone could disappear after
 * zonehash_lock is dropped.
 *
 * Furthermore it
 * 1. Doesn't check the status of the zone.
 * 2. Will be called even before zone_init is called; in that case the
 *    address of zone0 is returned directly, and netstack_zone_create()
 *    will only assign a value to zone0.zone_netstack, which won't break
 *    anything.
 * 3. Returns without the zone being held.
 */
7738 zone_t *
7739 zone_find_by_id_nolock(zoneid_t zoneid)
7740 {
7741         zone_t *zone;
7742 
7743         mutex_enter(&zonehash_lock);
7744         if (zonehashbyid == NULL)
7745                 zone = &zone0;
7746         else
7747                 zone = zone_find_all_by_id(zoneid);
7748         mutex_exit(&zonehash_lock);
7749         return (zone);
7750 }
7751 
7752 /*
7753  * Walk the datalinks for a given zone
7754  */
7755 int
7756 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7757     void *data)
7758 {
7759         zone_t          *zone;
7760         zone_dl_t       *zdl;
7761         datalink_id_t   *idarray;
7762         uint_t          idcount = 0;
7763         int             i, ret = 0;
7764 
7765         if ((zone = zone_find_by_id(zoneid)) == NULL)
7766                 return (ENOENT);
7767 
	/*
	 * We first build an array of linkids so that we can walk these and
	 * execute the callback with the zone_lock dropped.
	 */
7772         mutex_enter(&zone->zone_lock);
7773         for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7774             zdl = list_next(&zone->zone_dl_list, zdl)) {
7775                 idcount++;
7776         }
7777 
7778         if (idcount == 0) {
7779                 mutex_exit(&zone->zone_lock);
7780                 zone_rele(zone);
7781                 return (0);
7782         }
7783 
7784         idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7785         if (idarray == NULL) {
7786                 mutex_exit(&zone->zone_lock);
7787                 zone_rele(zone);
7788                 return (ENOMEM);
7789         }
7790 
7791         for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7792             i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7793                 idarray[i] = zdl->zdl_id;
7794         }
7795 
7796         mutex_exit(&zone->zone_lock);
7797 
7798         for (i = 0; i < idcount && ret == 0; i++) {
7799                 if ((ret = (*cb)(idarray[i], data)) != 0)
7800                         break;
7801         }
7802 
7803         zone_rele(zone);
7804         kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7805         return (ret);
7806 }
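/*
 * For example, a hypothetical in-kernel consumer could count a zone's
 * datalinks with a trivial callback:
 *
 *	static int
 *	count_cb(datalink_id_t linkid, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t n = 0;
 *	(void) zone_datalink_walk(zoneid, count_cb, &n);
 */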
7807 
7808 static char *
7809 zone_net_type2name(int type)
7810 {
7811         switch (type) {
7812         case ZONE_NETWORK_ADDRESS:
7813                 return (ZONE_NET_ADDRNAME);
7814         case ZONE_NETWORK_DEFROUTER:
7815                 return (ZONE_NET_RTRNAME);
7816         default:
7817                 return (NULL);
7818         }
7819 }
7820 
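/*
 * Record a piece of network configuration (an address or default router,
 * per zone_net_type2name()) for the given datalink, stored as a uint8
 * array in the zone_dl_t's nvlist.  Fails with ENXIO if the link isn't
 * assigned to the zone, and with EINVAL if the attribute is already set.
 */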
7821 static int
7822 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7823 {
7824         zone_t *zone;
7825         zone_dl_t *zdl;
7826         nvlist_t *nvl;
7827         int err = 0;
7828         uint8_t *new = NULL;
7829         char *nvname;
7830         int bufsize;
7831         datalink_id_t linkid = znbuf->zn_linkid;
7832 
7833         if (secpolicy_zone_config(CRED()) != 0)
7834                 return (set_errno(EPERM));
7835 
7836         if (zoneid == GLOBAL_ZONEID)
7837                 return (set_errno(EINVAL));
7838 
7839         nvname = zone_net_type2name(znbuf->zn_type);
7840         bufsize = znbuf->zn_len;
7841         new = znbuf->zn_val;
7842         if (nvname == NULL)
7843                 return (set_errno(EINVAL));
7844 
7845         if ((zone = zone_find_by_id(zoneid)) == NULL) {
7846                 return (set_errno(EINVAL));
7847         }
7848 
7849         mutex_enter(&zone->zone_lock);
7850         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7851                 err = ENXIO;
7852                 goto done;
7853         }
7854         if ((nvl = zdl->zdl_net) == NULL) {
7855                 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7856                         err = ENOMEM;
7857                         goto done;
7858                 } else {
7859                         zdl->zdl_net = nvl;
7860                 }
7861         }
7862         if (nvlist_exists(nvl, nvname)) {
7863                 err = EINVAL;
7864                 goto done;
7865         }
7866         err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7867         ASSERT(err == 0);
7868 done:
7869         mutex_exit(&zone->zone_lock);
7870         zone_rele(zone);
7871         if (err != 0)
7872                 return (set_errno(err));
7873         else
7874                 return (0);
7875 }
7876 
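/*
 * Look up a piece of network configuration previously recorded by
 * zone_set_network().  On success the value is copied into znbuf->zn_val
 * and zn_len is updated to its actual size; ENOBUFS is returned if the
 * caller's buffer is too small.
 */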
7877 static int
7878 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7879 {
7880         zone_t *zone;
7881         zone_dl_t *zdl;
7882         nvlist_t *nvl;
7883         uint8_t *ptr;
7884         uint_t psize;
7885         int err = 0;
7886         char *nvname;
7887         int bufsize;
7888         void *buf;
7889         datalink_id_t linkid = znbuf->zn_linkid;
7890 
7891         if (zoneid == GLOBAL_ZONEID)
7892                 return (set_errno(EINVAL));
7893 
7894         nvname = zone_net_type2name(znbuf->zn_type);
7895         bufsize = znbuf->zn_len;
7896         buf = znbuf->zn_val;
7897 
7898         if (nvname == NULL)
7899                 return (set_errno(EINVAL));
7900         if ((zone = zone_find_by_id(zoneid)) == NULL)
7901                 return (set_errno(EINVAL));
7902 
7903         mutex_enter(&zone->zone_lock);
7904         if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7905                 err = ENXIO;
7906                 goto done;
7907         }
7908         if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7909                 err = ENOENT;
7910                 goto done;
7911         }
7912         err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7913         ASSERT(err == 0);
7914 
7915         if (psize > bufsize) {
7916                 err = ENOBUFS;
7917                 goto done;
7918         }
7919         znbuf->zn_len = psize;
7920         bcopy(ptr, buf, psize);
7921 done:
7922         mutex_exit(&zone->zone_lock);
7923         zone_rele(zone);
7924         if (err != 0)
7925                 return (set_errno(err));
7926         else
7927                 return (0);
7928 }