/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2016, Joyent Inc.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states in which a zone may be and the transitions between them
 *   are as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 *   callbacks have not yet run. It is not possible to enter the zone, but
 *   attributes can be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 *   the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; the zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
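 *   For example, a kernel thread that must not proceed until a zone has
 *   finished booting might wait as follows (a minimal sketch;
 *   zone_find_by_id() returns the zone held, per the data structure
 *   notes below, and zone_status_wait() handles its own locking):
 *
 *      zone_t *zone = zone_find_by_id(zoneid);
 *      if (zone != NULL) {
 *              zone_status_wait(zone, ZONE_IS_RUNNING);
 *              zone_rele(zone);
 *      }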
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
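 *
 *   A subsystem typically registers once, e.g. at module load time, and
 *   unregisters with zone_key_delete() at unload time.  A minimal sketch
 *   (my_zone_init and my_zone_fini are hypothetical callbacks):
 *
 *      static zone_key_t my_zone_key;
 *
 *      zone_key_create(&my_zone_key, my_zone_init, NULL, my_zone_fini);
 *
 *   The per-zone data is then available via
 *   zone_getspecific(my_zone_key, zone).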
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
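 *
 *   A typical consumer therefore pairs every lookup with a release
 *   (a minimal sketch):
 *
 *      zone_t *zone = zone_find_by_name(name);
 *      if (zone != NULL) {
 *              ... use zone; it cannot go away while held ...
 *              zone_rele(zone);
 *      }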
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just the zone.max-lofi rctl.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
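 *
 *   For example, code adjusting a zone's LWP accounting honors the
 *   second ordering above by nesting zone_nlwps_lock inside p_lock
 *   (a sketch, consistent with zone_lwps_usage() below):
 *
 *      mutex_enter(&p->p_lock);
 *      mutex_enter(&p->p_zone->zone_nlwps_lock);
 *      ... adjust zone_nlwps ...
 *      mutex_exit(&p->p_zone->zone_nlwps_lock);
 *      mutex_exit(&p->p_lock);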
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up a zone id based on its name
 *   - zone_shutdown: initiates the shutdown process (see states above)
 *   - zone_destroy: completes the shutdown process (see states above)
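 *
 *   User-level consumers generally reach these through libc wrappers
 *   rather than issuing the raw system call, e.g. (a sketch;
 *   getzoneidbyname(3C) is backed by the zone_lookup subcode):
 *
 *      zoneid_t zid = getzoneidbyname("myzone");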
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
#include <sys/rt.h>
#include <sys/fx.h>

/*
 * This constant specifies the number of seconds that threads waiting for
 * subsystems to release a zone's general-purpose references will wait before
 * they log the zone's reference counts.  The constant's value shouldn't
 * be so small that reference counts are unnecessarily reported for zones
 * whose references are slowly released.  On the other hand, it shouldn't be so
 * large that users reboot their systems out of frustration over hung zones
 * before the system logs the zones' reference counts.
 */
#define ZONE_DESTROY_TIMEOUT_SECS       60

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
        datalink_id_t   zdl_id;
        nvlist_t        *zdl_net;
        list_node_t     zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;   /* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;     /* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* The number of zones is limited by the virtual interface limit in IP. */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char *zone_status_table[] = {
        ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
        ZONE_EVENT_INITIALIZED,         /* initialized */
        ZONE_EVENT_READY,               /* ready */
        ZONE_EVENT_READY,               /* booting */
        ZONE_EVENT_RUNNING,             /* running */
        ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
        ZONE_EVENT_SHUTTING_DOWN,       /* empty */
        ZONE_EVENT_SHUTTING_DOWN,       /* down */
        ZONE_EVENT_SHUTTING_DOWN,       /* dying */
        ZONE_EVENT_UNINITIALIZED,       /* dead */
};

/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).
 */
static char *zone_ref_subsys_names[] = {
        "NFS",          /* ZONE_REF_NFS */
        "NFSv4",        /* ZONE_REF_NFSV4 */
        "SMBFS",        /* ZONE_REF_SMBFS */
        "MNTFS",        /* ZONE_REF_MNTFS */
        "LOFI",         /* ZONE_REF_LOFI */
        "VFS",          /* ZONE_REF_VFS */
        "IPC"           /* ZONE_REF_IPC */
};

/*
 * These aren't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created/destroyed such
 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 * it gets added to the list of mounted zones, it ends up on the wrong zone's
 * mount list. Since a zone can't reside on an NFS file system, we don't
 * have to worry about the zonepath itself.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone state transitions and new
 * mounts within a zone. This synchronization is on a per-zone basis, so
 * activity for one zone will not interfere with activity for another zone.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone state transitions, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone halt may starve if
 * there is a rapid succession of new mounts coming in to the zone.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(zone_t *zp)
{
        int retval = 0;

        /*
         * Since it may block for a long time, block_mounts() shouldn't be
         * called with zonehash_lock held.
         */
        ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
        mutex_enter(&zp->zone_mount_lock);
        while (zp->zone_mounts_in_progress > 0) {
                if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
                        goto signaled;
        }
        /*
         * A negative value of mounts_in_progress indicates that mounts
         * have been blocked by (-mounts_in_progress) different callers
         * (remotely possible if two threads enter zone_shutdown at the same
         * time).
         */
        zp->zone_mounts_in_progress--;
        retval = 1;
signaled:
        mutex_exit(&zp->zone_mount_lock);
        return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(zone_t *zp)
{
        mutex_enter(&zp->zone_mount_lock);
        if (++zp->zone_mounts_in_progress == 0)
                cv_broadcast(&zp->zone_mount_cv);
        mutex_exit(&zp->zone_mount_lock);
}
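
/*
 * A sketch of how the zone side is expected to use the pair above when
 * transitioning state (block_mounts() returns 0 if interrupted by a
 * signal):
 *
 *      if (block_mounts(zone) == 0)
 *              return (set_errno(EINTR));
 *      ... move the zone to ZONE_IS_SHUTTING_DOWN ...
 *      resume_mounts(zone);
 */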
 499 
 500 /*
 501  * The VFS layer is busy with a mount; this zone should wait until all
 502  * of its mounts are completed to progress.
 503  */
 504 void
 505 mount_in_progress(zone_t *zp)
 506 {
 507         mutex_enter(&zp->zone_mount_lock);
 508         while (zp->zone_mounts_in_progress < 0)
 509                 cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 510         zp->zone_mounts_in_progress++;
 511         mutex_exit(&zp->zone_mount_lock);
 512 }
 513 
 514 /*
 515  * VFS is done with one mount; wake up any waiting block_mounts()
 516  * callers if this is the last mount.
 517  */
 518 void
 519 mount_completed(zone_t *zp)
 520 {
 521         mutex_enter(&zp->zone_mount_lock);
 522         if (--zp->zone_mounts_in_progress == 0)
 523                 cv_broadcast(&zp->zone_mount_cv);
 524         mutex_exit(&zp->zone_mount_lock);
 525 }
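
/*
 * A sketch of the matching VFS-side usage: a mount path brackets its
 * call to VFS_MOUNT() with the pair above (not the actual domount()
 * code):
 *
 *      mount_in_progress(zp);
 *      error = VFS_MOUNT(vfsp, mvp, uap, cr);
 *      mount_completed(zp);
 */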

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shut down, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The global
 * list is updated first (under zsd_key_lock) to make sure that newly created
 * zones use the most recent list of keys.  Then, under zonehash_lock, we walk
 * the existing zones, mark them ZSD_CREATE_NEEDED, and add a copy of the ZSD
 * entry to each per-zone zone_zsd list (protected by zone_lock).  Similar
 * locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock.  The zsd_flags are used to ensure that the operations
 * have completed, so that by the time zone_key_create (and zone_create)
 * returns, as well as zone_key_delete (and zone_destroy), all the
 * necessary callbacks have been run.
 *
 * When new zones are created, constructor callbacks for all registered ZSD
 * entries will be called. That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        return (zsd);
                }
        }
        return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list. Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
        struct zsd_entry *zsd;

        for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
                if (zsd->zsd_key == key) {
                        /*
                         * Move to head of list to keep list in MRU order.
                         */
                        if (zsd != list_head(l)) {
                                list_remove(l, zsd);
                                list_insert_head(l, zsd);
                        }
                        return (zsd);
                }
        }
        return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
        struct zsd_entry *zsdp;
        struct zsd_entry *t;
        struct zone *zone;
        zone_key_t key;

        zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
        zsdp->zsd_data = NULL;
        zsdp->zsd_create = create;
        zsdp->zsd_shutdown = shutdown;
        zsdp->zsd_destroy = destroy;

        /*
         * Insert in global list of callbacks. Makes future zone creations
         * see it.
         */
        mutex_enter(&zsd_key_lock);
        key = zsdp->zsd_key = ++zsd_keyval;
        ASSERT(zsd_keyval != 0);
        list_insert_tail(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        /*
         * Insert for all existing zones and mark them as needing
         * a create callback.
         */
        mutex_enter(&zonehash_lock);        /* stop the world */
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                zone_status_t status;

                mutex_enter(&zone->zone_lock);

                /* Skip zones that are on the way down or not yet up */
                status = zone_status_get(zone);
                if (status >= ZONE_IS_DOWN ||
                    status == ZONE_IS_UNINITIALIZED) {
                        mutex_exit(&zone->zone_lock);
                        continue;
                }

                t = zsd_find_mru(&zone->zone_zsd, key);
                if (t != NULL) {
                        /*
                         * A zone_zsd_configure() already inserted it after
                         * we dropped zsd_key_lock above.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                t = kmem_zalloc(sizeof (*t), KM_SLEEP);
                t->zsd_key = key;
                t->zsd_create = create;
                t->zsd_shutdown = shutdown;
                t->zsd_destroy = destroy;
                if (create != NULL) {
                        t->zsd_flags = ZSD_CREATE_NEEDED;
                        DTRACE_PROBE2(zsd__create__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                list_insert_tail(&zone->zone_zsd, t);
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        if (create != NULL) {
                /* Now call the create callback for this key */
                zsd_apply_all_zones(zsd_apply_create, key);
        }
        /*
         * It is safe for consumers to use the key now, make it
         * globally visible. Specifically zone_getspecific() will
         * always successfully return the zone specific data associated
         * with the key.
         */
        *keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock. Then call the functions without
 * holding any locks. Finally free up the zone_zsd entries. (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
        struct zsd_entry *zsdp = NULL;
        zone_t *zone;

        mutex_enter(&zsd_key_lock);
        zsdp = zsd_find_mru(&zsd_registered_keys, key);
        if (zsdp == NULL) {
                mutex_exit(&zsd_key_lock);
                return (-1);
        }
        list_remove(&zsd_registered_keys, zsdp);
        mutex_exit(&zsd_key_lock);

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find_mru(&zone->zone_zsd, key);
                if (del == NULL) {
                        /*
                         * Somebody else got here first, e.g. the zone going
                         * away.
                         */
                        mutex_exit(&zone->zone_lock);
                        continue;
                }
                ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
                ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
                if (del->zsd_shutdown != NULL &&
                    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
                        del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
                        DTRACE_PROBE2(zsd__shutdown__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                if (del->zsd_destroy != NULL &&
                    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
                        del->zsd_flags |= ZSD_DESTROY_NEEDED;
                        DTRACE_PROBE2(zsd__destroy__needed,
                            zone_t *, zone, zone_key_t, key);
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);
        kmem_free(zsdp, sizeof (*zsdp));

        /* Now call the shutdown and destroy callbacks for this key */
        zsd_apply_all_zones(zsd_apply_shutdown, key);
        zsd_apply_all_zones(zsd_apply_destroy, key);

        /* Now we can free up the zsdp structures in each zone */
        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                struct zsd_entry *del;

                mutex_enter(&zone->zone_lock);
                del = zsd_find(&zone->zone_zsd, key);
                if (del != NULL) {
                        list_remove(&zone->zone_zsd, del);
                        ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
                        kmem_free(del, sizeof (*del));
                }
                mutex_exit(&zone->zone_lock);
        }
        mutex_exit(&zonehash_lock);

        return (0);
}

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Returns an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
        struct zsd_entry *t;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        if (t != NULL) {
                /*
                 * Replace old value with new
                 */
                t->zsd_data = (void *)data;
                mutex_exit(&zone->zone_lock);
                return (0);
        }
        mutex_exit(&zone->zone_lock);
        return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
        struct zsd_entry *t;
        void *data;

        mutex_enter(&zone->zone_lock);
        t = zsd_find_mru(&zone->zone_zsd, key);
        data = (t == NULL ? NULL : t->zsd_data);
        mutex_exit(&zone->zone_lock);
        return (data);
}
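
/*
 * For example, a subsystem holding the hypothetical my_zone_key from the
 * sketch in the header comment would stash and fetch its per-zone state
 * as follows; callers needing atomic test-and-set semantics must supply
 * their own locking, as noted above:
 *
 *      (void) zone_setspecific(my_zone_key, zone, data);
 *      ...
 *      data = zone_getspecific(my_zone_key, zone);
 */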
 817 
 818 /*
 819  * Function used to initialize a zone's list of ZSD callbacks and data
 820  * when the zone is being created.  The callbacks are initialized from
 821  * the template list (zsd_registered_keys). The constructor callback is
 822  * executed later (once the zone exists and with locks dropped).
 823  */
 824 static void
 825 zone_zsd_configure(zone_t *zone)
 826 {
 827         struct zsd_entry *zsdp;
 828         struct zsd_entry *t;
 829 
 830         ASSERT(MUTEX_HELD(&zonehash_lock));
 831         ASSERT(list_head(&zone->zone_zsd) == NULL);
 832         mutex_enter(&zone->zone_lock);
 833         mutex_enter(&zsd_key_lock);
 834         for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 835             zsdp = list_next(&zsd_registered_keys, zsdp)) {
 836                 /*
 837                  * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 838                  * should not have added anything to it.
 839                  */
 840                 ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 841 
 842                 t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 843                 t->zsd_key = zsdp->zsd_key;
 844                 t->zsd_create = zsdp->zsd_create;
 845                 t->zsd_shutdown = zsdp->zsd_shutdown;
 846                 t->zsd_destroy = zsdp->zsd_destroy;
 847                 if (zsdp->zsd_create != NULL) {
 848                         t->zsd_flags = ZSD_CREATE_NEEDED;
 849                         DTRACE_PROBE2(zsd__create__needed,
 850                             zone_t *, zone, zone_key_t, zsdp->zsd_key);
 851                 }
 852                 list_insert_tail(&zone->zone_zsd, t);
 853         }
 854         mutex_exit(&zsd_key_lock);
 855         mutex_exit(&zone->zone_lock);
 856 }
 857 
 858 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 859 
 860 /*
 861  * Helper function to execute shutdown or destructor callbacks.
 862  */
 863 static void
 864 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 865 {
 866         struct zsd_entry *t;
 867 
 868         ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 869         ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 870         ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 871 
 872         /*
 873          * Run the callback solely based on what is registered for the zone
 874          * in zone_zsd. The global list can change independently of this
 875          * as keys are registered and unregistered and we don't register new
 876          * callbacks for a zone that is in the process of going away.
 877          */
 878         mutex_enter(&zone->zone_lock);
 879         for (t = list_head(&zone->zone_zsd); t != NULL;
 880             t = list_next(&zone->zone_zsd, t)) {
 881                 zone_key_t key = t->zsd_key;
 882 
 883                 /* Skip if no callbacks registered */
 884 
 885                 if (ct == ZSD_SHUTDOWN) {
 886                         if (t->zsd_shutdown != NULL &&
 887                             (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 888                                 t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 889                                 DTRACE_PROBE2(zsd__shutdown__needed,
 890                                     zone_t *, zone, zone_key_t, key);
 891                         }
 892                 } else {
 893                         if (t->zsd_destroy != NULL &&
 894                             (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 895                                 t->zsd_flags |= ZSD_DESTROY_NEEDED;
 896                                 DTRACE_PROBE2(zsd__destroy__needed,
 897                                     zone_t *, zone, zone_key_t, key);
 898                         }
 899                 }
 900         }
 901         mutex_exit(&zone->zone_lock);
 902 
 903         /* Now call the shutdown and destroy callback for this key */
 904         zsd_apply_all_keys(zsd_apply_shutdown, zone);
 905         zsd_apply_all_keys(zsd_apply_destroy, zone);
 906 
 907 }
 908 
 909 /*
 910  * Called when the zone is going away; free ZSD-related memory, and
 911  * destroy the zone_zsd list.
 912  */
 913 static void
 914 zone_free_zsd(zone_t *zone)
 915 {
 916         struct zsd_entry *t, *next;
 917 
 918         /*
 919          * Free all the zsd_entry's we had on this zone.
 920          */
 921         mutex_enter(&zone->zone_lock);
 922         for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 923                 next = list_next(&zone->zone_zsd, t);
 924                 list_remove(&zone->zone_zsd, t);
 925                 ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 926                 kmem_free(t, sizeof (*t));
 927         }
 928         list_destroy(&zone->zone_zsd);
 929         mutex_exit(&zone->zone_lock);
 930 
 931 }
 932 
 933 /*
 934  * Apply a function to all zones for particular key value.
 935  *
 936  * The applyfn has to drop zonehash_lock if it does some work, and
 937  * then reacquire it before it returns.
 938  * When the lock is dropped we don't follow list_next even
 939  * if it is possible to do so without any hazards. This is
 940  * because we want the design to allow for the list of zones
 941  * to change in any arbitrary way during the time the
 942  * lock was dropped.
 943  *
 944  * It is safe to restart the loop at list_head since the applyfn
 945  * changes the zsd_flags as it does work, so a subsequent
 946  * pass through will have no effect in applyfn, hence the loop will terminate
 947  * in at worst O(N^2).
 948  */
 949 static void
 950 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 951 {
 952         zone_t *zone;
 953 
 954         mutex_enter(&zonehash_lock);
 955         zone = list_head(&zone_active);
 956         while (zone != NULL) {
 957                 if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 958                         /* Lock dropped - restart at head */
 959                         zone = list_head(&zone_active);
 960                 } else {
 961                         zone = list_next(&zone_active, zone);
 962                 }
 963         }
 964         mutex_exit(&zonehash_lock);
 965 }
 966 
 967 /*
 968  * Apply a function to all keys for a particular zone.
 969  *
 970  * The applyfn has to drop zonehash_lock if it does some work, and
 971  * then reacquire it before it returns.
 972  * When the lock is dropped we don't follow list_next even
 973  * if it is possible to do so without any hazards. This is
 974  * because we want the design to allow for the list of zsd callbacks
 975  * to change in any arbitrary way during the time the
 976  * lock was dropped.
 977  *
 978  * It is safe to restart the loop at list_head since the applyfn
 979  * changes the zsd_flags as it does work, so a subsequent
 980  * pass through will have no effect in applyfn, hence the loop will terminate
 981  * in at worst O(N^2).
 982  */
 983 static void
 984 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 985 {
 986         struct zsd_entry *t;
 987 
 988         mutex_enter(&zone->zone_lock);
 989         t = list_head(&zone->zone_zsd);
 990         while (t != NULL) {
 991                 if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 992                         /* Lock dropped - restart at head */
 993                         t = list_head(&zone->zone_zsd);
 994                 } else {
 995                         t = list_next(&zone->zone_zsd, t);
 996                 }
 997         }
 998         mutex_exit(&zone->zone_lock);
 999 }
1000 
1001 /*
1002  * Call the create function for the zone and key if CREATE_NEEDED
1003  * is set.
1004  * If some other thread gets here first and sets CREATE_INPROGRESS, then
1005  * we wait for that thread to complete so that we can ensure that
1006  * all the callbacks are done when we've looped over all zones/keys.
1007  *
1008  * When we call the create function, we drop the global held by the
1009  * caller, and return true to tell the caller it needs to re-evalute the
1010  * state.
1011  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1012  * remains held on exit.
1013  */
1014 static boolean_t
1015 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1016     zone_t *zone, zone_key_t key)
1017 {
1018         void *result;
1019         struct zsd_entry *t;
1020         boolean_t dropped;
1021 
1022         if (lockp != NULL) {
1023                 ASSERT(MUTEX_HELD(lockp));
1024         }
1025         if (zone_lock_held) {
1026                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1027         } else {
1028                 mutex_enter(&zone->zone_lock);
1029         }
1030 
1031         t = zsd_find(&zone->zone_zsd, key);
1032         if (t == NULL) {
1033                 /*
1034                  * Somebody else got here first e.g the zone going
1035                  * away.
1036                  */
1037                 if (!zone_lock_held)
1038                         mutex_exit(&zone->zone_lock);
1039                 return (B_FALSE);
1040         }
1041         dropped = B_FALSE;
1042         if (zsd_wait_for_inprogress(zone, t, lockp))
1043                 dropped = B_TRUE;
1044 
1045         if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1046                 t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1047                 t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1048                 DTRACE_PROBE2(zsd__create__inprogress,
1049                     zone_t *, zone, zone_key_t, key);
1050                 mutex_exit(&zone->zone_lock);
1051                 if (lockp != NULL)
1052                         mutex_exit(lockp);
1053 
1054                 dropped = B_TRUE;
1055                 ASSERT(t->zsd_create != NULL);
1056                 DTRACE_PROBE2(zsd__create__start,
1057                     zone_t *, zone, zone_key_t, key);
1058 
1059                 result = (*t->zsd_create)(zone->zone_id);
1060 
1061                 DTRACE_PROBE2(zsd__create__end,
1062                     zone_t *, zone, voidn *, result);
1063 
1064                 ASSERT(result != NULL);
1065                 if (lockp != NULL)
1066                         mutex_enter(lockp);
1067                 mutex_enter(&zone->zone_lock);
1068                 t->zsd_data = result;
1069                 t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1070                 t->zsd_flags |= ZSD_CREATE_COMPLETED;
1071                 cv_broadcast(&t->zsd_cv);
1072                 DTRACE_PROBE2(zsd__create__completed,
1073                     zone_t *, zone, zone_key_t, key);
1074         }
1075         if (!zone_lock_held)
1076                 mutex_exit(&zone->zone_lock);
1077         return (dropped);
1078 }
1079 
1080 /*
1081  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1082  * is set.
1083  * If some other thread gets here first and sets *_INPROGRESS, then
1084  * we wait for that thread to complete so that we can ensure that
1085  * all the callbacks are done when we've looped over all zones/keys.
1086  *
1087  * When we call the shutdown function, we drop the global held by the
1088  * caller, and return true to tell the caller it needs to re-evalute the
1089  * state.
1090  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1091  * remains held on exit.
1092  */
1093 static boolean_t
1094 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1095     zone_t *zone, zone_key_t key)
1096 {
1097         struct zsd_entry *t;
1098         void *data;
1099         boolean_t dropped;
1100 
1101         if (lockp != NULL) {
1102                 ASSERT(MUTEX_HELD(lockp));
1103         }
1104         if (zone_lock_held) {
1105                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1106         } else {
1107                 mutex_enter(&zone->zone_lock);
1108         }
1109 
1110         t = zsd_find(&zone->zone_zsd, key);
1111         if (t == NULL) {
1112                 /*
1113                  * Somebody else got here first e.g the zone going
1114                  * away.
1115                  */
1116                 if (!zone_lock_held)
1117                         mutex_exit(&zone->zone_lock);
1118                 return (B_FALSE);
1119         }
1120         dropped = B_FALSE;
1121         if (zsd_wait_for_creator(zone, t, lockp))
1122                 dropped = B_TRUE;
1123 
1124         if (zsd_wait_for_inprogress(zone, t, lockp))
1125                 dropped = B_TRUE;
1126 
1127         if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1128                 t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1129                 t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1130                 DTRACE_PROBE2(zsd__shutdown__inprogress,
1131                     zone_t *, zone, zone_key_t, key);
1132                 mutex_exit(&zone->zone_lock);
1133                 if (lockp != NULL)
1134                         mutex_exit(lockp);
1135                 dropped = B_TRUE;
1136 
1137                 ASSERT(t->zsd_shutdown != NULL);
1138                 data = t->zsd_data;
1139 
1140                 DTRACE_PROBE2(zsd__shutdown__start,
1141                     zone_t *, zone, zone_key_t, key);
1142 
1143                 (t->zsd_shutdown)(zone->zone_id, data);
1144                 DTRACE_PROBE2(zsd__shutdown__end,
1145                     zone_t *, zone, zone_key_t, key);
1146 
1147                 if (lockp != NULL)
1148                         mutex_enter(lockp);
1149                 mutex_enter(&zone->zone_lock);
1150                 t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1151                 t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1152                 cv_broadcast(&t->zsd_cv);
1153                 DTRACE_PROBE2(zsd__shutdown__completed,
1154                     zone_t *, zone, zone_key_t, key);
1155         }
1156         if (!zone_lock_held)
1157                 mutex_exit(&zone->zone_lock);
1158         return (dropped);
1159 }
1160 
1161 /*
1162  * Call the destroy function for the zone and key if DESTROY_NEEDED
1163  * is set.
1164  * If some other thread gets here first and sets *_INPROGRESS, then
1165  * we wait for that thread to complete so that we can ensure that
1166  * all the callbacks are done when we've looped over all zones/keys.
1167  *
1168  * When we call the destroy function, we drop the global held by the
1169  * caller, and return true to tell the caller it needs to re-evalute the
1170  * state.
1171  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1172  * remains held on exit.
1173  */
1174 static boolean_t
1175 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1176     zone_t *zone, zone_key_t key)
1177 {
1178         struct zsd_entry *t;
1179         void *data;
1180         boolean_t dropped;
1181 
1182         if (lockp != NULL) {
1183                 ASSERT(MUTEX_HELD(lockp));
1184         }
1185         if (zone_lock_held) {
1186                 ASSERT(MUTEX_HELD(&zone->zone_lock));
1187         } else {
1188                 mutex_enter(&zone->zone_lock);
1189         }
1190 
1191         t = zsd_find(&zone->zone_zsd, key);
1192         if (t == NULL) {
1193                 /*
1194                  * Somebody else got here first e.g the zone going
1195                  * away.
1196                  */
1197                 if (!zone_lock_held)
1198                         mutex_exit(&zone->zone_lock);
1199                 return (B_FALSE);
1200         }
1201         dropped = B_FALSE;
1202         if (zsd_wait_for_creator(zone, t, lockp))
1203                 dropped = B_TRUE;
1204 
1205         if (zsd_wait_for_inprogress(zone, t, lockp))
1206                 dropped = B_TRUE;
1207 
1208         if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1209                 t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1210                 t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1211                 DTRACE_PROBE2(zsd__destroy__inprogress,
1212                     zone_t *, zone, zone_key_t, key);
1213                 mutex_exit(&zone->zone_lock);
1214                 if (lockp != NULL)
1215                         mutex_exit(lockp);
1216                 dropped = B_TRUE;
1217 
1218                 ASSERT(t->zsd_destroy != NULL);
1219                 data = t->zsd_data;
1220                 DTRACE_PROBE2(zsd__destroy__start,
1221                     zone_t *, zone, zone_key_t, key);
1222 
1223                 (t->zsd_destroy)(zone->zone_id, data);
1224                 DTRACE_PROBE2(zsd__destroy__end,
1225                     zone_t *, zone, zone_key_t, key);
1226 
1227                 if (lockp != NULL)
1228                         mutex_enter(lockp);
1229                 mutex_enter(&zone->zone_lock);
1230                 t->zsd_data = NULL;
1231                 t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1232                 t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1233                 cv_broadcast(&t->zsd_cv);
1234                 DTRACE_PROBE2(zsd__destroy__completed,
1235                     zone_t *, zone, zone_key_t, key);
1236         }
1237         if (!zone_lock_held)
1238                 mutex_exit(&zone->zone_lock);
1239         return (dropped);
1240 }
1241 
1242 /*
1243  * Wait for any CREATE_NEEDED flag to be cleared.
1244  * Returns true if lockp was temporarily dropped while waiting.
1245  */
1246 static boolean_t
1247 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1248 {
1249         boolean_t dropped = B_FALSE;
1250 
1251         while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1252                 DTRACE_PROBE2(zsd__wait__for__creator,
1253                     zone_t *, zone, struct zsd_entry *, t);
1254                 if (lockp != NULL) {
1255                         dropped = B_TRUE;
1256                         mutex_exit(lockp);
1257                 }
1258                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1259                 if (lockp != NULL) {
1260                         /* First drop zone_lock to preserve order */
1261                         mutex_exit(&zone->zone_lock);
1262                         mutex_enter(lockp);
1263                         mutex_enter(&zone->zone_lock);
1264                 }
1265         }
1266         return (dropped);
1267 }
1268 
1269 /*
1270  * Wait for any INPROGRESS flag to be cleared.
1271  * Returns true if lockp was temporarily dropped while waiting.
1272  */
1273 static boolean_t
1274 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1275 {
1276         boolean_t dropped = B_FALSE;
1277 
1278         while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1279                 DTRACE_PROBE2(zsd__wait__for__inprogress,
1280                     zone_t *, zone, struct zsd_entry *, t);
1281                 if (lockp != NULL) {
1282                         dropped = B_TRUE;
1283                         mutex_exit(lockp);
1284                 }
1285                 cv_wait(&t->zsd_cv, &zone->zone_lock);
1286                 if (lockp != NULL) {
1287                         /* First drop zone_lock to preserve order */
1288                         mutex_exit(&zone->zone_lock);
1289                         mutex_enter(lockp);
1290                         mutex_enter(&zone->zone_lock);
1291                 }
1292         }
1293         return (dropped);
1294 }
1295 
1296 /*
1297  * Frees memory associated with the zone dataset list.
1298  */
1299 static void
1300 zone_free_datasets(zone_t *zone)
1301 {
1302         zone_dataset_t *t, *next;
1303 
1304         for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1305                 next = list_next(&zone->zone_datasets, t);
1306                 list_remove(&zone->zone_datasets, t);
1307                 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1308                 kmem_free(t, sizeof (*t));
1309         }
1310         list_destroy(&zone->zone_datasets);
1311 }
1312 
1313 /*
1314  * zone.cpu-shares resource control support.
1315  */
1316 /*ARGSUSED*/
1317 static rctl_qty_t
1318 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1319 {
1320         ASSERT(MUTEX_HELD(&p->p_lock));
1321         return (p->p_zone->zone_shares);
1322 }
1323 
1324 /*ARGSUSED*/
1325 static int
1326 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1327     rctl_qty_t nv)
1328 {
1329         ASSERT(MUTEX_HELD(&p->p_lock));
1330         ASSERT(e->rcep_t == RCENTITY_ZONE);
1331         if (e->rcep_p.zone == NULL)
1332                 return (0);
1333 
1334         e->rcep_p.zone->zone_shares = nv;
1335         return (0);
1336 }
1337 
1338 static rctl_ops_t zone_cpu_shares_ops = {
1339         rcop_no_action,
1340         zone_cpu_shares_usage,
1341         zone_cpu_shares_set,
1342         rcop_no_test
1343 };
1344 
1345 /*
1346  * zone.cpu-cap resource control support.
1347  */
1348 /*ARGSUSED*/
1349 static rctl_qty_t
1350 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1351 {
1352         ASSERT(MUTEX_HELD(&p->p_lock));
1353         return (cpucaps_zone_get(p->p_zone));
1354 }
1355 
1356 /*ARGSUSED*/
1357 static int
1358 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1359     rctl_qty_t nv)
1360 {
1361         zone_t *zone = e->rcep_p.zone;
1362 
1363         ASSERT(MUTEX_HELD(&p->p_lock));
1364         ASSERT(e->rcep_t == RCENTITY_ZONE);
1365 
1366         if (zone == NULL)
1367                 return (0);
1368 
1369         /*
1370          * set cap to the new value.
1371          */
1372         return (cpucaps_zone_set(zone, nv));
1373 }
1374 
1375 static rctl_ops_t zone_cpu_cap_ops = {
1376         rcop_no_action,
1377         zone_cpu_cap_get,
1378         zone_cpu_cap_set,
1379         rcop_no_test
1380 };
1381 
1382 /*
1383  * zone.zfs-io-pri resource control support (IO priority).
1384  */
1385 /*ARGSUSED*/
1386 static rctl_qty_t
1387 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1388 {
1389         ASSERT(MUTEX_HELD(&p->p_lock));
1390         return (p->p_zone->zone_zfs_io_pri);
1391 }
1392 
1393 /*ARGSUSED*/
1394 static int
1395 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1396     rctl_qty_t nv)
1397 {
1398         zone_t *zone = e->rcep_p.zone;
1399 
1400         ASSERT(MUTEX_HELD(&p->p_lock));
1401         ASSERT(e->rcep_t == RCENTITY_ZONE);
1402 
1403         if (zone == NULL)
1404                 return (0);
1405 
1406         /*
1407          * Set the ZFS I/O priority to the new value.
1408          */
1409         zone->zone_zfs_io_pri = nv;
1410         return (0);
1411 }
1412 
1413 static rctl_ops_t zone_zfs_io_pri_ops = {
1414         rcop_no_action,
1415         zone_zfs_io_pri_get,
1416         zone_zfs_io_pri_set,
1417         rcop_no_test
1418 };
1419 
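     /*
      * zone.max-lwps resource control support.  Usage is the zone's LWP
      * count, sampled under zone_nlwps_lock; the test callback runs with
      * that lock already held and checks whether the proposed increment
      * would exceed the control value.
      */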
1420 /*ARGSUSED*/
1421 static rctl_qty_t
1422 zone_lwps_usage(rctl_t *r, proc_t *p)
1423 {
1424         rctl_qty_t nlwps;
1425         zone_t *zone = p->p_zone;
1426 
1427         ASSERT(MUTEX_HELD(&p->p_lock));
1428 
1429         mutex_enter(&zone->zone_nlwps_lock);
1430         nlwps = zone->zone_nlwps;
1431         mutex_exit(&zone->zone_nlwps_lock);
1432 
1433         return (nlwps);
1434 }
1435 
1436 /*ARGSUSED*/
1437 static int
1438 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1439     rctl_qty_t incr, uint_t flags)
1440 {
1441         rctl_qty_t nlwps;
1442 
1443         ASSERT(MUTEX_HELD(&p->p_lock));
1444         ASSERT(e->rcep_t == RCENTITY_ZONE);
1445         if (e->rcep_p.zone == NULL)
1446                 return (0);
1447         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1448         nlwps = e->rcep_p.zone->zone_nlwps;
1449 
1450         if (nlwps + incr > rcntl->rcv_value)
1451                 return (1);
1452 
1453         return (0);
1454 }
1455 
1456 /*ARGSUSED*/
1457 static int
1458 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1459 {
1460         ASSERT(MUTEX_HELD(&p->p_lock));
1461         ASSERT(e->rcep_t == RCENTITY_ZONE);
1462         if (e->rcep_p.zone == NULL)
1463                 return (0);
1464         e->rcep_p.zone->zone_nlwps_ctl = nv;
1465         return (0);
1466 }
1467 
1468 static rctl_ops_t zone_lwps_ops = {
1469         rcop_no_action,
1470         zone_lwps_usage,
1471         zone_lwps_set,
1472         zone_lwps_test,
1473 };
1474 
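     /*
      * zone.max-processes resource control support.  Process counts are
      * protected by the same zone_nlwps_lock used for LWP accounting.
      */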
1475 /*ARGSUSED*/
1476 static rctl_qty_t
1477 zone_procs_usage(rctl_t *r, proc_t *p)
1478 {
1479         rctl_qty_t nprocs;
1480         zone_t *zone = p->p_zone;
1481 
1482         ASSERT(MUTEX_HELD(&p->p_lock));
1483 
1484         mutex_enter(&zone->zone_nlwps_lock);
1485         nprocs = zone->zone_nprocs;
1486         mutex_exit(&zone->zone_nlwps_lock);
1487 
1488         return (nprocs);
1489 }
1490 
1491 /*ARGSUSED*/
1492 static int
1493 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1494     rctl_qty_t incr, uint_t flags)
1495 {
1496         rctl_qty_t nprocs;
1497 
1498         ASSERT(MUTEX_HELD(&p->p_lock));
1499         ASSERT(e->rcep_t == RCENTITY_ZONE);
1500         if (e->rcep_p.zone == NULL)
1501                 return (0);
1502         ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1503         nprocs = e->rcep_p.zone->zone_nprocs;
1504 
1505         if (nprocs + incr > rcntl->rcv_value)
1506                 return (1);
1507 
1508         return (0);
1509 }
1510 
1511 /*ARGSUSED*/
1512 static int
1513 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1514 {
1515         ASSERT(MUTEX_HELD(&p->p_lock));
1516         ASSERT(e->rcep_t == RCENTITY_ZONE);
1517         if (e->rcep_p.zone == NULL)
1518                 return (0);
1519         e->rcep_p.zone->zone_nprocs_ctl = nv;
1520         return (0);
1521 }
1522 
1523 static rctl_ops_t zone_procs_ops = {
1524         rcop_no_action,
1525         zone_procs_usage,
1526         zone_procs_set,
1527         zone_procs_test,
1528 };
1529 
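     /*
      * System V IPC resource control support: zone.max-shm-memory and the
      * zone.max-{shm,sem,msg}-ids controls below.  These controls cannot be
      * set (rcop_no_set); the IPC subsystem maintains the usage counters,
      * and each test callback simply compares usage plus the requested
      * increment against the control value.
      */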
1530 /*ARGSUSED*/
1531 static rctl_qty_t
1532 zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1533 {
1534         ASSERT(MUTEX_HELD(&p->p_lock));
1535         return (p->p_zone->zone_shmmax);
1536 }
1537 
1538 /*ARGSUSED*/
1539 static int
1540 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1541     rctl_qty_t incr, uint_t flags)
1542 {
1543         rctl_qty_t v;
1544         ASSERT(MUTEX_HELD(&p->p_lock));
1545         ASSERT(e->rcep_t == RCENTITY_ZONE);
1546         v = e->rcep_p.zone->zone_shmmax + incr;
1547         if (v > rval->rcv_value)
1548                 return (1);
1549         return (0);
1550 }
1551 
1552 static rctl_ops_t zone_shmmax_ops = {
1553         rcop_no_action,
1554         zone_shmmax_usage,
1555         rcop_no_set,
1556         zone_shmmax_test
1557 };
1558 
1559 /*ARGSUSED*/
1560 static rctl_qty_t
1561 zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1562 {
1563         ASSERT(MUTEX_HELD(&p->p_lock));
1564         return (p->p_zone->zone_ipc.ipcq_shmmni);
1565 }
1566 
1567 /*ARGSUSED*/
1568 static int
1569 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1570     rctl_qty_t incr, uint_t flags)
1571 {
1572         rctl_qty_t v;
1573         ASSERT(MUTEX_HELD(&p->p_lock));
1574         ASSERT(e->rcep_t == RCENTITY_ZONE);
1575         v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1576         if (v > rval->rcv_value)
1577                 return (1);
1578         return (0);
1579 }
1580 
1581 static rctl_ops_t zone_shmmni_ops = {
1582         rcop_no_action,
1583         zone_shmmni_usage,
1584         rcop_no_set,
1585         zone_shmmni_test
1586 };
1587 
1588 /*ARGSUSED*/
1589 static rctl_qty_t
1590 zone_semmni_usage(rctl_t *rctl, struct proc *p)
1591 {
1592         ASSERT(MUTEX_HELD(&p->p_lock));
1593         return (p->p_zone->zone_ipc.ipcq_semmni);
1594 }
1595 
1596 /*ARGSUSED*/
1597 static int
1598 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1599     rctl_qty_t incr, uint_t flags)
1600 {
1601         rctl_qty_t v;
1602         ASSERT(MUTEX_HELD(&p->p_lock));
1603         ASSERT(e->rcep_t == RCENTITY_ZONE);
1604         v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1605         if (v > rval->rcv_value)
1606                 return (1);
1607         return (0);
1608 }
1609 
1610 static rctl_ops_t zone_semmni_ops = {
1611         rcop_no_action,
1612         zone_semmni_usage,
1613         rcop_no_set,
1614         zone_semmni_test
1615 };
1616 
1617 /*ARGSUSED*/
1618 static rctl_qty_t
1619 zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1620 {
1621         ASSERT(MUTEX_HELD(&p->p_lock));
1622         return (p->p_zone->zone_ipc.ipcq_msgmni);
1623 }
1624 
1625 /*ARGSUSED*/
1626 static int
1627 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1628     rctl_qty_t incr, uint_t flags)
1629 {
1630         rctl_qty_t v;
1631         ASSERT(MUTEX_HELD(&p->p_lock));
1632         ASSERT(e->rcep_t == RCENTITY_ZONE);
1633         v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1634         if (v > rval->rcv_value)
1635                 return (1);
1636         return (0);
1637 }
1638 
1639 static rctl_ops_t zone_msgmni_ops = {
1640         rcop_no_action,
1641         zone_msgmni_usage,
1642         rcop_no_set,
1643         zone_msgmni_test
1644 };
1645 
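     /*
      * zone.max-locked-memory resource control support.  Usage is sampled
      * under zone_mem_lock; the test callback is invoked with that lock
      * already held.
      */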
1646 /*ARGSUSED*/
1647 static rctl_qty_t
1648 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1649 {
1650         rctl_qty_t q;
1651         ASSERT(MUTEX_HELD(&p->p_lock));
1652         mutex_enter(&p->p_zone->zone_mem_lock);
1653         q = p->p_zone->zone_locked_mem;
1654         mutex_exit(&p->p_zone->zone_mem_lock);
1655         return (q);
1656 }
1657 
1658 /*ARGSUSED*/
1659 static int
1660 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1661     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1662 {
1663         rctl_qty_t q;
1664         zone_t *z;
1665 
1666         z = e->rcep_p.zone;
1667         ASSERT(MUTEX_HELD(&p->p_lock));
1668         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1669         q = z->zone_locked_mem;
1670         if (q + incr > rcntl->rcv_value)
1671                 return (1);
1672         return (0);
1673 }
1674 
1675 /*ARGSUSED*/
1676 static int
1677 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1678     rctl_qty_t nv)
1679 {
1680         ASSERT(MUTEX_HELD(&p->p_lock));
1681         ASSERT(e->rcep_t == RCENTITY_ZONE);
1682         if (e->rcep_p.zone == NULL)
1683                 return (0);
1684         e->rcep_p.zone->zone_locked_mem_ctl = nv;
1685         return (0);
1686 }
1687 
1688 static rctl_ops_t zone_locked_mem_ops = {
1689         rcop_no_action,
1690         zone_locked_mem_usage,
1691         zone_locked_mem_set,
1692         zone_locked_mem_test
1693 };
1694 
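     /*
      * zone.max-swap resource control support.  As with the locked-memory
      * control above, usage is protected by zone_mem_lock.
      */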
1695 /*ARGSUSED*/
1696 static rctl_qty_t
1697 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1698 {
1699         rctl_qty_t q;
1700         zone_t *z = p->p_zone;
1701 
1702         ASSERT(MUTEX_HELD(&p->p_lock));
1703         mutex_enter(&z->zone_mem_lock);
1704         q = z->zone_max_swap;
1705         mutex_exit(&z->zone_mem_lock);
1706         return (q);
1707 }
1708 
1709 /*ARGSUSED*/
1710 static int
1711 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1712     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1713 {
1714         rctl_qty_t q;
1715         zone_t *z;
1716 
1717         z = e->rcep_p.zone;
1718         ASSERT(MUTEX_HELD(&p->p_lock));
1719         ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1720         q = z->zone_max_swap;
1721         if (q + incr > rcntl->rcv_value)
1722                 return (1);
1723         return (0);
1724 }
1725 
1726 /*ARGSUSED*/
1727 static int
1728 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1729     rctl_qty_t nv)
1730 {
1731         ASSERT(MUTEX_HELD(&p->p_lock));
1732         ASSERT(e->rcep_t == RCENTITY_ZONE);
1733         if (e->rcep_p.zone == NULL)
1734                 return (0);
1735         e->rcep_p.zone->zone_max_swap_ctl = nv;
1736         return (0);
1737 }
1738 
1739 static rctl_ops_t zone_max_swap_ops = {
1740         rcop_no_action,
1741         zone_max_swap_usage,
1742         zone_max_swap_set,
1743         zone_max_swap_test
1744 };
1745 
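     /*
      * zone.max-physical-memory resource control support.  This control is
      * advisory: it is not enforced in the kernel (hence rcop_no_test), and
      * the cap is acted on by the userland memory capping code.
      */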
1746 /*ARGSUSED*/
1747 static rctl_qty_t
1748 zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1749 {
1750         rctl_qty_t q;
1751         zone_t *z = p->p_zone;
1752 
1753         ASSERT(MUTEX_HELD(&p->p_lock));
1754         /* No lock needed; this rctl is not enforced in the kernel */
1755         q = z->zone_phys_mem;
1756         return (q);
1757 }
1758 
1759 /*ARGSUSED*/
1760 static int
1761 zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1762     rctl_qty_t nv)
1763 {
1764         ASSERT(MUTEX_HELD(&p->p_lock));
1765         ASSERT(e->rcep_t == RCENTITY_ZONE);
1766         if (e->rcep_p.zone == NULL)
1767                 return (0);
1768         e->rcep_p.zone->zone_phys_mem_ctl = nv;
1769         return (0);
1770 }
1771 
1772 static rctl_ops_t zone_phys_mem_ops = {
1773         rcop_no_action,
1774         zone_phys_mem_usage,
1775         zone_phys_mem_set,
1776         rcop_no_test
1777 };
1778 
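     /*
      * zone.max-lofi resource control support (number of lofi devices).
      * Usage is sampled under zone_rctl_lock; the test callback is invoked
      * with that lock already held.
      */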
1779 /*ARGSUSED*/
1780 static rctl_qty_t
1781 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1782 {
1783         rctl_qty_t q;
1784         zone_t *z = p->p_zone;
1785 
1786         ASSERT(MUTEX_HELD(&p->p_lock));
1787         mutex_enter(&z->zone_rctl_lock);
1788         q = z->zone_max_lofi;
1789         mutex_exit(&z->zone_rctl_lock);
1790         return (q);
1791 }
1792 
1793 /*ARGSUSED*/
1794 static int
1795 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1796     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1797 {
1798         rctl_qty_t q;
1799         zone_t *z;
1800 
1801         z = e->rcep_p.zone;
1802         ASSERT(MUTEX_HELD(&p->p_lock));
1803         ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1804         q = z->zone_max_lofi;
1805         if (q + incr > rcntl->rcv_value)
1806                 return (1);
1807         return (0);
1808 }
1809 
1810 /*ARGSUSED*/
1811 static int
1812 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1813     rctl_qty_t nv)
1814 {
1815         ASSERT(MUTEX_HELD(&p->p_lock));
1816         ASSERT(e->rcep_t == RCENTITY_ZONE);
1817         if (e->rcep_p.zone == NULL)
1818                 return (0);
1819         e->rcep_p.zone->zone_max_lofi_ctl = nv;
1820         return (0);
1821 }
1822 
1823 static rctl_ops_t zone_max_lofi_ops = {
1824         rcop_no_action,
1825         zone_max_lofi_usage,
1826         zone_max_lofi_set,
1827         zone_max_lofi_test
1828 };
1829 
1830 /*
1831  * Helper function to stamp the zone with a unique ID.
1832  */
1833 static void
1834 zone_uniqid(zone_t *zone)
1835 {
1836         static uint64_t uniqid = 0;
1837 
1838         ASSERT(MUTEX_HELD(&zonehash_lock));
1839         zone->zone_uniqid = uniqid++;
1840 }
1841 
1842 /*
1843  * Returns a held pointer to the "kcred" for the specified zone.
1844  */
1845 struct cred *
1846 zone_get_kcred(zoneid_t zoneid)
1847 {
1848         zone_t *zone;
1849         cred_t *cr;
1850 
1851         if ((zone = zone_find_by_id(zoneid)) == NULL)
1852                 return (NULL);
1853         cr = zone->zone_kcred;
1854         crhold(cr);
1855         zone_rele(zone);
1856         return (cr);
1857 }
1858 
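     /*
      * kstat update callbacks for the per-rctl kstats created by
      * zone_kstat_create_common() below.  Each exports the current usage
      * and control value; the kstats are read-only, so KSTAT_WRITE is
      * rejected with EACCES.
      */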
1859 static int
1860 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1861 {
1862         zone_t *zone = ksp->ks_private;
1863         zone_kstat_t *zk = ksp->ks_data;
1864 
1865         if (rw == KSTAT_WRITE)
1866                 return (EACCES);
1867 
1868         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1869         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1870         return (0);
1871 }
1872 
1873 static int
1874 zone_physmem_kstat_update(kstat_t *ksp, int rw)
1875 {
1876         zone_t *zone = ksp->ks_private;
1877         zone_kstat_t *zk = ksp->ks_data;
1878 
1879         if (rw == KSTAT_WRITE)
1880                 return (EACCES);
1881 
1882         zk->zk_usage.value.ui64 = zone->zone_phys_mem;
1883         zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
1884         return (0);
1885 }
1886 
1887 static int
1888 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1889 {
1890         zone_t *zone = ksp->ks_private;
1891         zone_kstat_t *zk = ksp->ks_data;
1892 
1893         if (rw == KSTAT_WRITE)
1894                 return (EACCES);
1895 
1896         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1897         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1898         return (0);
1899 }
1900 
1901 static int
1902 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1903 {
1904         zone_t *zone = ksp->ks_private;
1905         zone_kstat_t *zk = ksp->ks_data;
1906 
1907         if (rw == KSTAT_WRITE)
1908                 return (EACCES);
1909 
1910         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1911         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1912         return (0);
1913 }
1914 
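     /*
      * Create one of the "lockedmem", "swapresv", "physicalmem" or "nprocs"
      * kstats: a virtual named kstat with zonename/usage/value entries,
      * filled in by the supplied update function.
      */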
1915 static kstat_t *
1916 zone_kstat_create_common(zone_t *zone, char *name,
1917     int (*updatefunc) (kstat_t *, int))
1918 {
1919         kstat_t *ksp;
1920         zone_kstat_t *zk;
1921 
1922         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1923             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1924             KSTAT_FLAG_VIRTUAL);
1925 
1926         if (ksp == NULL)
1927                 return (NULL);
1928 
1929         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1930         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1931         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1932         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1933         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1934         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1935         ksp->ks_update = updatefunc;
1936         ksp->ks_private = zone;
1937         kstat_install(ksp);
1938         return (ksp);
1939 }
1940 
1941 static int
1942 zone_vfs_kstat_update(kstat_t *ksp, int rw)
1943 {
1944         zone_t *zone = ksp->ks_private;
1945         zone_vfs_kstat_t *zvp = ksp->ks_data;
1946         kstat_io_t *kiop = &zone->zone_vfs_rwstats;
1947 
1948         if (rw == KSTAT_WRITE)
1949                 return (EACCES);
1950 
1951         /*
1952          * Extract the VFS statistics from the kstat_io_t structure used by
1953          * kstat_runq_enter() and related functions.  Since the slow ops
1954          * counters are updated directly by the VFS layer, there's no need to
1955          * copy those statistics here.
1956          *
1957          * Note that kstat_runq_enter() and the related functions use
1958          * gethrtime_unscaled(), so scale the time here.
1959          */
1960         zvp->zv_nread.value.ui64 = kiop->nread;
1961         zvp->zv_reads.value.ui64 = kiop->reads;
1962         zvp->zv_rtime.value.ui64 = kiop->rtime;
1963         zvp->zv_rcnt.value.ui64 = kiop->rcnt;
1964         zvp->zv_rlentime.value.ui64 = kiop->rlentime;
1965         zvp->zv_nwritten.value.ui64 = kiop->nwritten;
1966         zvp->zv_writes.value.ui64 = kiop->writes;
1967         zvp->zv_wtime.value.ui64 = kiop->wtime;
1968         zvp->zv_wcnt.value.ui64 = kiop->wcnt;
1969         zvp->zv_wlentime.value.ui64 = kiop->wlentime;
1970 
1971         scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
1972         scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
1973         scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
1974         scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
1975 
1976         return (0);
1977 }
1978 
1979 static kstat_t *
1980 zone_vfs_kstat_create(zone_t *zone)
1981 {
1982         kstat_t *ksp;
1983         zone_vfs_kstat_t *zvp;
1984 
1985         if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
1986             zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
1987             sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
1988             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1989                 return (NULL);
1990 
1991         if (zone->zone_id != GLOBAL_ZONEID)
1992                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1993 
1994         zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
1995         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1996         ksp->ks_lock = &zone->zone_vfs_lock;
1997         zone->zone_vfs_stats = zvp;
1998 
1999         /* The kstat "name" field is not large enough for a full zonename */
2000         kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
2001         kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
2002         kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
2003         kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
2004         kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
2005         kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
2006         kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
2007         kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
2008         kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
2009         kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
2010         kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
2011         kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
2012         kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
2013         kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
2014         kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
2015         kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
2016         kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
2017         kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
2018 
2019         ksp->ks_update = zone_vfs_kstat_update;
2020         ksp->ks_private = zone;
2021 
2022         kstat_install(ksp);
2023         return (ksp);
2024 }
2025 
2026 static int
2027 zone_zfs_kstat_update(kstat_t *ksp, int rw)
2028 {
2029         zone_t *zone = ksp->ks_private;
2030         zone_zfs_kstat_t *zzp = ksp->ks_data;
2031         kstat_io_t *kiop = &zone->zone_zfs_rwstats;
2032 
2033         if (rw == KSTAT_WRITE)
2034                 return (EACCES);
2035 
2036         /*
2037          * Extract the ZFS statistics from the kstat_io_t structure used by
2038          * kstat_runq_enter() and related functions.  Since the I/O throttle
2039          * counters are updated directly by the ZFS layer, there's no need to
2040          * copy those statistics here.
2041          *
2042          * Note that kstat_runq_enter() and the related functions use
2043          * gethrtime_unscaled(), so scale the time here.
2044          */
2045         zzp->zz_nread.value.ui64 = kiop->nread;
2046         zzp->zz_reads.value.ui64 = kiop->reads;
2047         zzp->zz_rtime.value.ui64 = kiop->rtime;
2048         zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2049         zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2050         zzp->zz_writes.value.ui64 = kiop->writes;
2051 
2052         scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2053         scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2054 
2055         return (0);
2056 }
2057 
2058 static kstat_t *
2059 zone_zfs_kstat_create(zone_t *zone)
2060 {
2061         kstat_t *ksp;
2062         zone_zfs_kstat_t *zzp;
2063 
2064         if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
2065             zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
2066             sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
2067             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2068                 return (NULL);
2069 
2070         if (zone->zone_id != GLOBAL_ZONEID)
2071                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2072 
2073         zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
2074         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2075         ksp->ks_lock = &zone->zone_zfs_lock;
2076         zone->zone_zfs_stats = zzp;
2077 
2078         /* The kstat "name" field is not large enough for a full zonename */
2079         kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
2080         kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
2081         kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
2082         kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
2083         kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
2084         kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
2085         kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
2086         kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
2087         kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
2088 
2089         ksp->ks_update = zone_zfs_kstat_update;
2090         ksp->ks_private = zone;
2091 
2092         kstat_install(ksp);
2093         return (ksp);
2094 }
2095 
2096 static int
2097 zone_mcap_kstat_update(kstat_t *ksp, int rw)
2098 {
2099         zone_t *zone = ksp->ks_private;
2100         zone_mcap_kstat_t *zmp = ksp->ks_data;
2101 
2102         if (rw == KSTAT_WRITE)
2103                 return (EACCES);
2104 
2105         zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
2106         zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
2107         zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2108         zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2109         zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
2110         zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
2111         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2112         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2113         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2114         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2115         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2116         zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
2117         zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
2118 
2119         return (0);
2120 }
2121 
2122 static kstat_t *
2123 zone_mcap_kstat_create(zone_t *zone)
2124 {
2125         kstat_t *ksp;
2126         zone_mcap_kstat_t *zmp;
2127 
2128         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2129             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2130             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2131             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2132                 return (NULL);
2133 
2134         if (zone->zone_id != GLOBAL_ZONEID)
2135                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2136 
2137         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2138         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2139         ksp->ks_lock = &zone->zone_mcap_lock;
2140         zone->zone_mcap_stats = zmp;
2141 
2142         /* The kstat "name" field is not large enough for a full zonename */
2143         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2144         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2146         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2147         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2148         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2149         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2150         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2151         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2152         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2153         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2154         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2155         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2156         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2157             KSTAT_DATA_UINT64);
2158         kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2159             KSTAT_DATA_UINT64);
2160         kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2161             KSTAT_DATA_UINT64);
2162 
2163         ksp->ks_update = zone_mcap_kstat_update;
2164         ksp->ks_private = zone;
2165 
2166         kstat_install(ksp);
2167         return (ksp);
2168 }
2169 
2170 static int
2171 zone_misc_kstat_update(kstat_t *ksp, int rw)
2172 {
2173         zone_t *zone = ksp->ks_private;
2174         zone_misc_kstat_t *zmp = ksp->ks_data;
2175         hrtime_t tmp;
2176 
2177         if (rw == KSTAT_WRITE)
2178                 return (EACCES);
2179 
2180         tmp = zone->zone_utime;
2181         scalehrtime(&tmp);
2182         zmp->zm_utime.value.ui64 = tmp;
2183         tmp = zone->zone_stime;
2184         scalehrtime(&tmp);
2185         zmp->zm_stime.value.ui64 = tmp;
2186         tmp = zone->zone_wtime;
2187         scalehrtime(&tmp);
2188         zmp->zm_wtime.value.ui64 = tmp;
2189 
2190         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2191         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2192         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2193 
2194         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2195         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2196         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2197         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2198 
2199         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2200 
2201         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2202         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2203 
2204         return (0);
2205 }
2206 
2207 static kstat_t *
2208 zone_misc_kstat_create(zone_t *zone)
2209 {
2210         kstat_t *ksp;
2211         zone_misc_kstat_t *zmp;
2212 
2213         if ((ksp = kstat_create_zone("zones", zone->zone_id,
2214             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2215             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2216             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2217                 return (NULL);
2218 
2219         if (zone->zone_id != GLOBAL_ZONEID)
2220                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2221 
2222         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2223         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2224         ksp->ks_lock = &zone->zone_misc_lock;
2225         zone->zone_misc_stats = zmp;
2226 
2227         /* The kstat "name" field is not large enough for a full zonename */
2228         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2229         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2230         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2231         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2232         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2233         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2234         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2235         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2236             KSTAT_DATA_UINT32);
2237         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2238         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2239             KSTAT_DATA_UINT32);
2240         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2241         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2242         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2243             KSTAT_DATA_UINT32);
2244         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2245         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2246 
2247         ksp->ks_update = zone_misc_kstat_update;
2248         ksp->ks_private = zone;
2249 
2250         kstat_install(ksp);
2251         return (ksp);
2252 }
2253 
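     /*
      * Create all of the zone's kstats.  If the vfs, mcap or misc kstat
      * cannot be created, fall back to a plain zeroed stats buffer so that
      * the rest of the kernel can update the counters unconditionally.
      */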
2254 static void
2255 zone_kstat_create(zone_t *zone)
2256 {
2257         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2258             "lockedmem", zone_lockedmem_kstat_update);
2259         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2260             "swapresv", zone_swapresv_kstat_update);
2261         zone->zone_physmem_kstat = zone_kstat_create_common(zone,
2262             "physicalmem", zone_physmem_kstat_update);
2263         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2264             "nprocs", zone_nprocs_kstat_update);
2265 
2266         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2267                 zone->zone_vfs_stats = kmem_zalloc(
2268                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
2269         }
2270 
2271         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2272                 zone->zone_mcap_stats = kmem_zalloc(
2273                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2274         }
2275 
2276         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2277                 zone->zone_misc_stats = kmem_zalloc(
2278                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2279         }
2281 }
2282 
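     /*
      * Delete a kstat created above, freeing the ks_data buffer that was
      * allocated on its behalf.
      */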
2283 static void
2284 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2285 {
2286         void *data;
2287 
2288         if (*pkstat != NULL) {
2289                 data = (*pkstat)->ks_data;
2290                 kstat_delete(*pkstat);
2291                 kmem_free(data, datasz);
2292                 *pkstat = NULL;
2293         }
2294 }
2295 
2296 static void
2297 zone_kstat_delete(zone_t *zone)
2298 {
2299         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2300             sizeof (zone_kstat_t));
2301         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2302             sizeof (zone_kstat_t));
2303         zone_kstat_delete_common(&zone->zone_physmem_kstat,
2304             sizeof (zone_kstat_t));
2305         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2306             sizeof (zone_kstat_t));
2307 
2308         zone_kstat_delete_common(&zone->zone_vfs_ksp,
2309             sizeof (zone_vfs_kstat_t));
2310         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2311             sizeof (zone_mcap_kstat_t));
2312         zone_kstat_delete_common(&zone->zone_misc_ksp,
2313             sizeof (zone_misc_kstat_t));
2315 }
2316 
2317 /*
2318  * Called very early on in boot to initialize the ZSD list so that
2319  * zone_key_create() can be called before zone_init().  It also initializes
2320  * portions of zone0 which may be used before zone_init() is called.  The
2321  * variable "global_zone" will be set when zone0 is fully initialized by
2322  * zone_init().
2323  */
2324 void
2325 zone_zsd_init(void)
2326 {
2327         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2328         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2329         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2330             offsetof(struct zsd_entry, zsd_linkage));
2331         list_create(&zone_active, sizeof (zone_t),
2332             offsetof(zone_t, zone_linkage));
2333         list_create(&zone_deathrow, sizeof (zone_t),
2334             offsetof(zone_t, zone_linkage));
2335 
2336         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2337         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2338         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2339         zone0.zone_shares = 1;
2340         zone0.zone_nlwps = 0;
2341         zone0.zone_nlwps_ctl = INT_MAX;
2342         zone0.zone_nprocs = 0;
2343         zone0.zone_nprocs_ctl = INT_MAX;
2344         zone0.zone_locked_mem = 0;
2345         zone0.zone_locked_mem_ctl = UINT64_MAX;
2346         ASSERT(zone0.zone_max_swap == 0);
2347         zone0.zone_max_swap_ctl = UINT64_MAX;
2348         zone0.zone_phys_mem = 0;
2349         zone0.zone_phys_mem_ctl = UINT64_MAX;
2350         zone0.zone_max_lofi = 0;
2351         zone0.zone_max_lofi_ctl = UINT64_MAX;
2352         zone0.zone_shmmax = 0;
2353         zone0.zone_ipc.ipcq_shmmni = 0;
2354         zone0.zone_ipc.ipcq_semmni = 0;
2355         zone0.zone_ipc.ipcq_msgmni = 0;
2356         zone0.zone_name = GLOBAL_ZONENAME;
2357         zone0.zone_nodename = utsname.nodename;
2358         zone0.zone_domain = srpc_domain;
2359         zone0.zone_hostid = HW_INVALID_HOSTID;
2360         zone0.zone_fs_allowed = NULL;
2361         zone0.zone_ref = 1;
2362         zone0.zone_id = GLOBAL_ZONEID;
2363         zone0.zone_status = ZONE_IS_RUNNING;
2364         zone0.zone_rootpath = "/";
2365         zone0.zone_rootpathlen = 2;
2366         zone0.zone_psetid = ZONE_PS_INVAL;
2367         zone0.zone_ncpus = 0;
2368         zone0.zone_ncpus_online = 0;
2369         zone0.zone_proc_initpid = 1;
2370         zone0.zone_initname = initname;
2371         zone0.zone_lockedmem_kstat = NULL;
2372         zone0.zone_swapresv_kstat = NULL;
2373         zone0.zone_physmem_kstat = NULL;
2374         zone0.zone_nprocs_kstat = NULL;
2375         zone0.zone_zfs_io_pri = 1;
2376 
2377         zone0.zone_stime = 0;
2378         zone0.zone_utime = 0;
2379         zone0.zone_wtime = 0;
2380 
2381         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2382             offsetof(zone_ref_t, zref_linkage));
2383         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2384             offsetof(struct zsd_entry, zsd_linkage));
2385         list_insert_head(&zone_active, &zone0);
2386 
2387         /*
2388          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2389          * to anything meaningful.  It is assigned to be 'rootdir' in
2390          * vfs_mountroot().
2391          */
2392         zone0.zone_rootvp = NULL;
2393         zone0.zone_vfslist = NULL;
2394         zone0.zone_bootargs = initargs;
2395         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2396         /*
2397          * The global zone has all privileges
2398          */
2399         priv_fillset(zone0.zone_privset);
2400         /*
2401          * Add p0 to the global zone
2402          */
2403         zone0.zone_zsched = &p0;
2404         p0.p_zone = &zone0;
2405 }
2406 
2407 /*
2408  * Compute a hash value based on the contents of the label and the DOI.  The
2409  * hash algorithm is somewhat arbitrary, but is based on the observation that
2410  * humans will likely pick labels that differ by amounts that work out to be
2411  * multiples of the number of hash chains, and thus stirring in some primes
2412  * should help.
2413  */
2414 static uint_t
2415 hash_bylabel(void *hdata, mod_hash_key_t key)
2416 {
2417         const ts_label_t *lab = (ts_label_t *)key;
2418         const uint32_t *up, *ue;
2419         uint_t hash;
2420         int i;
2421 
2422         _NOTE(ARGUNUSED(hdata));
2423 
2424         hash = lab->tsl_doi + (lab->tsl_doi << 1);
2425         /* we depend on alignment of label, but not representation */
2426         up = (const uint32_t *)&lab->tsl_label;
2427         ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2428         i = 1;
2429         while (up < ue) {
2430                 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2431                 hash += *up + (*up << ((i % 16) + 1));
2432                 up++;
2433                 i++;
2434         }
2435         return (hash);
2436 }
2437 
2438 /*
2439  * All that mod_hash cares about here is zero (equal) versus non-zero (not
2440  * equal).  This may need to be changed if less than / greater than is ever
2441  * needed.
2442  */
2443 static int
2444 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2445 {
2446         ts_label_t *lab1 = (ts_label_t *)key1;
2447         ts_label_t *lab2 = (ts_label_t *)key2;
2448 
2449         return (label_equal(lab1, lab2) ? 0 : 1);
2450 }
2451 
2452 /*
2453  * Called by main() to initialize the zones framework.
2454  */
2455 void
2456 zone_init(void)
2457 {
2458         rctl_dict_entry_t *rde;
2459         rctl_val_t *dval;
2460         rctl_set_t *set;
2461         rctl_alloc_gp_t *gp;
2462         rctl_entity_p_t e;
2463         int res;
2464 
2465         ASSERT(curproc == &p0);
2466 
2467         /*
2468          * Create ID space for zone IDs.  ID 0 is reserved for the
2469          * global zone.
2470          */
2471         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2472 
2473         /*
2474          * Initialize generic zone resource controls, if any.
2475          */
2476         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2477             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2478             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2479             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2480 
2481         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2482             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2483             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2484             RCTL_GLOBAL_INFINITE,
2485             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2486 
2487         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2488             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2489             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2490             16384, 16384, &zone_zfs_io_pri_ops);
2491 
2492         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2493             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2494             INT_MAX, INT_MAX, &zone_lwps_ops);
2495 
2496         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2497             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2498             INT_MAX, INT_MAX, &zone_procs_ops);
2499 
2500         /*
2501          * System V IPC resource controls
2502          */
2503         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2504             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2505             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2506 
2507         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2508             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2509             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2510 
2511         rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2512             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2513             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2514 
2515         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2516             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2517             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2518 
2519         /*
2520          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2521          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2522          */
2523         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2524         bzero(dval, sizeof (rctl_val_t));
2525         dval->rcv_value = 1;
2526         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2527         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2528         dval->rcv_action_recip_pid = -1;
2529 
2530         rde = rctl_dict_lookup("zone.cpu-shares");
2531         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2532 
2533         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2534             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2535             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2536             &zone_locked_mem_ops);
2537 
2538         rc_zone_max_swap = rctl_register("zone.max-swap",
2539             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2540             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2541             &zone_max_swap_ops);
2542 
2543         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2544             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2545             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2546             &zone_phys_mem_ops);
2547 
2548         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2549             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2550             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2551             &zone_max_lofi_ops);
2552 
2553         /*
2554          * Initialize the ``global zone''.
2555          */
2556         set = rctl_set_create();
2557         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2558         mutex_enter(&p0.p_lock);
2559         e.rcep_p.zone = &zone0;
2560         e.rcep_t = RCENTITY_ZONE;
2561         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);
2563 
2564         zone0.zone_nlwps = p0.p_lwpcnt;
2565         zone0.zone_nprocs = 1;
2566         zone0.zone_ntasks = 1;
2567         mutex_exit(&p0.p_lock);
2568         zone0.zone_restart_init = B_TRUE;
2569         zone0.zone_reboot_on_init_exit = B_FALSE;
2570         zone0.zone_init_status = -1;
2571         zone0.zone_brand = &native_brand;
2572         rctl_prealloc_destroy(gp);
2573         /*
2574          * pool_default hasn't been initialized yet, so we let pool_init()
2575          * take care of making sure the global zone is in the default pool.
2576          */
2577 
2578         /*
2579          * Initialize global zone kstats
2580          */
2581         zone_kstat_create(&zone0);
2582 
2583         /*
2584          * Initialize zone label.
2585  * MLPs are initialized when tnzonecfg is loaded.
2586          */
2587         zone0.zone_slabel = l_admin_low;
2588         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2589         label_hold(l_admin_low);
2590 
2591         /*
2592  * Initialize the lock for the database structure used by mntfs.
2593          */
2594         rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2595 
2596         mutex_enter(&zonehash_lock);
2597         zone_uniqid(&zone0);
2598         ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2599 
2600         zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2601             mod_hash_null_valdtor);
2602         zonehashbyname = mod_hash_create_strhash("zone_by_name",
2603             zone_hash_size, mod_hash_null_valdtor);
2604         /*
2605          * maintain zonehashbylabel only for labeled systems
2606          */
2607         if (is_system_labeled())
2608                 zonehashbylabel = mod_hash_create_extended("zone_by_label",
2609                     zone_hash_size, mod_hash_null_keydtor,
2610                     mod_hash_null_valdtor, hash_bylabel, NULL,
2611                     hash_labelkey_cmp, KM_SLEEP);
2612         zonecount = 1;
2613 
2614         (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2615             (mod_hash_val_t)&zone0);
2616         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2617             (mod_hash_val_t)&zone0);
2618         if (is_system_labeled()) {
2619                 zone0.zone_flags |= ZF_HASHED_LABEL;
2620                 (void) mod_hash_insert(zonehashbylabel,
2621                     (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2622         }
2623         mutex_exit(&zonehash_lock);
2624 
2625         /*
2626          * We avoid setting zone_kcred until now, since kcred is initialized
2627          * sometime after zone_zsd_init() and before zone_init().
2628          */
2629         zone0.zone_kcred = kcred;
2630         /*
2631          * The global zone is fully initialized (except for zone_rootvp which
2632          * will be set when the root filesystem is mounted).
2633          */
2634         global_zone = &zone0;
2635 
2636         /*
2637  * Set up an event channel to send zone status change notifications on
2638          */
2639         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2640             EVCH_CREAT);
2641 
2642         if (res)
2643                 panic("sysevent_evc_bind failed during zone setup.\n");
2645 }
2646 
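     /*
      * Tear down and free a zone_t once it is no longer referenced.  The
      * ASSERTs below document the state the caller must guarantee: no
      * tasks, LWPs, processes, credential references or zone references
      * remain, and the zone is either dead or was never fully initialized.
      */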
2647 static void
2648 zone_free(zone_t *zone)
2649 {
2650         zone_dl_t *zdl;
2651 
2652         ASSERT(zone != global_zone);
2653         ASSERT(zone->zone_ntasks == 0);
2654         ASSERT(zone->zone_nlwps == 0);
2655         ASSERT(zone->zone_nprocs == 0);
2656         ASSERT(zone->zone_cred_ref == 0);
2657         ASSERT(zone->zone_kcred == NULL);
2658         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2659             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2660         ASSERT(list_is_empty(&zone->zone_ref_list));
2661 
2662         /*
2663          * Remove any zone caps.
2664          */
2665         cpucaps_zone_remove(zone);
2666 
2667         ASSERT(zone->zone_cpucap == NULL);
2668 
2669         /* remove from deathrow list */
2670         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2671                 ASSERT(zone->zone_ref == 0);
2672                 mutex_enter(&zone_deathrow_lock);
2673                 list_remove(&zone_deathrow, zone);
2674                 mutex_exit(&zone_deathrow_lock);
2675         }
2676 
2677         list_destroy(&zone->zone_ref_list);
2678         zone_free_zsd(zone);
2679         zone_free_datasets(zone);
2680 
2681         /*
2682          * While dlmgmtd should have removed all of these, it could have left
2683          * something behind or crashed, in which case it's not safe for us to
2684          * assume that the list is empty (list_destroy() ASSERTs that it is).
2685          * We clean up for our userland comrades which may have crashed, or
2686          * worse, been disabled by SMF.
2687          */
2688         while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2689                 if (zdl->zdl_net != NULL)
2690                         nvlist_free(zdl->zdl_net);
2691                 kmem_free(zdl, sizeof (zone_dl_t));
2692         }
2693         list_destroy(&zone->zone_dl_list);
2694 
2695         if (zone->zone_rootvp != NULL)
2696                 VN_RELE(zone->zone_rootvp);
2697         if (zone->zone_rootpath)
2698                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2699         if (zone->zone_name != NULL)
2700                 kmem_free(zone->zone_name, ZONENAME_MAX);
2701         if (zone->zone_slabel != NULL)
2702                 label_rele(zone->zone_slabel);
2703         if (zone->zone_nodename != NULL)
2704                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2705         if (zone->zone_domain != NULL)
2706                 kmem_free(zone->zone_domain, _SYS_NMLN);
2707         if (zone->zone_privset != NULL)
2708                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2709         if (zone->zone_rctls != NULL)
2710                 rctl_set_free(zone->zone_rctls);
2711         if (zone->zone_bootargs != NULL)
2712                 strfree(zone->zone_bootargs);
2713         if (zone->zone_initname != NULL)
2714                 strfree(zone->zone_initname);
2715         if (zone->zone_fs_allowed != NULL)
2716                 strfree(zone->zone_fs_allowed);
2717         if (zone->zone_pfexecd != NULL)
2718                 klpd_freelist(&zone->zone_pfexecd);
2719         id_free(zoneid_space, zone->zone_id);
2720         mutex_destroy(&zone->zone_lock);
2721         cv_destroy(&zone->zone_cv);
2722         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2723         rw_destroy(&zone->zone_mntfs_db_lock);
2724         kmem_free(zone, sizeof (zone_t));
2725 }
2726 
2727 /*
2728  * See block comment at the top of this file for information about zone
2729  * status values.
2730  */
2731 /*
2732  * Convenience function for setting zone status.
2733  */
2734 static void
2735 zone_status_set(zone_t *zone, zone_status_t status)
2736 {
2738         nvlist_t *nvl = NULL;
2739         ASSERT(MUTEX_HELD(&zone_status_lock));
2740         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2741             status >= zone_status_get(zone));
2742 
2743         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2744             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2745             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2746             zone_status_table[status]) ||
2747             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2748             zone_status_table[zone->zone_status]) ||
2749             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2750             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2751             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2752             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2753 #ifdef DEBUG
2754                 (void) printf(
2755                     "Failed to allocate and send zone state change event.\n");
2756 #endif
2757         }
2758         nvlist_free(nvl);
2759 
2760         zone->zone_status = status;
2761 
2762         cv_broadcast(&zone->zone_cv);
2763 }
2764 
2765 /*
2766  * Public function to retrieve the zone status.  The zone status may
2767  * change after it is retrieved.
2768  */
2769 zone_status_t
2770 zone_status_get(zone_t *zone)
2771 {
2772         return (zone->zone_status);
2773 }
2774 
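     /*
      * The zone_set_*() helpers below back the zone attribute setattr
      * path: each copies in a user-supplied value, validates it where
      * necessary, and installs it in the zone_t.
      */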
2775 static int
2776 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2777 {
2778         char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2779         int err = 0;
2780 
2781         ASSERT(zone != global_zone);
2782         if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2783                 goto done;      /* EFAULT or ENAMETOOLONG */
2784 
2785         if (zone->zone_bootargs != NULL)
2786                 strfree(zone->zone_bootargs);
2787 
2788         zone->zone_bootargs = strdup(buf);
2789 
2790 done:
2791         kmem_free(buf, BOOTARGS_MAX);
2792         return (err);
2793 }
2794 
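     /*
      * Set the zone's brand from a user-supplied brand_attr structure.
      * Branding may happen only once, and only before the zone has begun
      * booting.
      */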
2795 static int
2796 zone_set_brand(zone_t *zone, const char *brand)
2797 {
2798         struct brand_attr *attrp;
2799         brand_t *bp;
2800 
2801         attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2802         if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2803                 kmem_free(attrp, sizeof (struct brand_attr));
2804                 return (EFAULT);
2805         }
2806 
2807         bp = brand_register_zone(attrp);
2808         kmem_free(attrp, sizeof (struct brand_attr));
2809         if (bp == NULL)
2810                 return (EINVAL);
2811 
2812         /*
2813          * This is the only place where a zone can change its brand.
2814          * We already need to hold zone_status_lock to check the zone
2815          * status, so we'll just use that lock to serialize zone
2816          * branding requests as well.
2817          */
2818         mutex_enter(&zone_status_lock);
2819 
2820         /* Re-branding is not allowed, and the zone must not have booted yet */
2821         if ((ZONE_IS_BRANDED(zone)) ||
2822             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2823                 mutex_exit(&zone_status_lock);
2824                 brand_unregister_zone(bp);
2825                 return (EINVAL);
2826         }
2827 
2828         /*
2829          * Set up the brand specific data.
2830          * Note that it's possible that the hook has to drop the
2831          * zone_status_lock and reacquire it before returning so we can't
2832          * assume the lock has been held the entire time.
2833          */
2834         zone->zone_brand = bp;
2835         ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
2836 
2837         mutex_exit(&zone_status_lock);
2838         return (0);
2839 }
2840 
2841 static int
2842 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2843 {
2844         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2845         int err = 0;
2846 
2847         ASSERT(zone != global_zone);
2848         if ((err = copyinstr(zone_fs_allowed, buf,
2849             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2850                 goto done;
2851 
2852         if (zone->zone_fs_allowed != NULL)
2853                 strfree(zone->zone_fs_allowed);
2854 
2855         zone->zone_fs_allowed = strdup(buf);
2856 
2857 done:
2858         kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2859         return (err);
2860 }
2861 
2862 static int
2863 zone_set_initname(zone_t *zone, const char *zone_initname)
2864 {
2865         char initname[INITNAME_SZ];
2866         size_t len;
2867         int err = 0;
2868 
2869         ASSERT(zone != global_zone);
2870         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2871                 return (err);   /* EFAULT or ENAMETOOLONG */
2872 
2873         if (zone->zone_initname != NULL)
2874                 strfree(zone->zone_initname);
2875 
2876         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2877         (void) strcpy(zone->zone_initname, initname);
2878         return (0);
2879 }
2880 
2881 /*
2882  * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
2883  * to provide the physical memory capping kstats.  Since physical memory
2884  * capping is currently implemented in userland, that code uses the setattr
2885  * entry point to increment the kstats.  We increment zone_mcap_nover on
2886  * every setattr call, and add the caller-supplied value to
2887  * zone_mcap_pagedout on each call.
2888  */
2889 /*ARGSUSED*/
2890 static int
2891 zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
2892 {
2893         zone->zone_mcap_nover++;
2894 
2895         return (0);
2896 }
2897 
2898 static int
2899 zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
2900 {
2901         uint64_t pageout;
2902         int err;
2903 
2904         if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
2905                 zone->zone_mcap_pagedout += pageout;
2906 
2907         return (err);
2908 }
2909 
2910 /*
2911  * The zone_set_page_fault_delay function is used to set the number of usecs
2912  * to throttle page faults.  This is normally 0 but can be set to a non-0 value
2913  * by the user-land memory capping code when the zone is over its physical
2914  * memory cap.
2915  */
2916 static int
2917 zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
2918 {
2919         uint32_t dusec;
2920         int err;
2921 
2922         if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
2923                 zone->zone_pg_flt_delay = dusec;
2924 
2925         return (err);
2926 }
2927 
2928 /*
2929  * The zone_set_rss function is used to set the zone's RSS when we do the
2930  * fast, approximate calculation in user-land.
2931  */
2932 static int
2933 zone_set_rss(zone_t *zone, const uint64_t *prss)
2934 {
2935         uint64_t rss;
2936         int err;
2937 
2938         if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
2939                 zone->zone_phys_mem = rss;
2940 
2941         return (err);
2942 }
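
/*
 * Illustrative sketch of the userland side: the capper pushes its
 * per-interval observations down with zone_setattr(2).  The attribute
 * names ZONE_ATTR_PMCAP_PAGEOUT and ZONE_ATTR_RSS are assumed here for
 * illustration; each hook above only copies in a single uint64_t.
 *
 *	uint64_t pagedout = npages_written;	(assumed capper statistic)
 *	uint64_t rss = estimated_rss;		(assumed capper estimate)
 *
 *	(void) zone_setattr(zoneid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
 *	    sizeof (pagedout));
 *	(void) zone_setattr(zoneid, ZONE_ATTR_RSS, &rss, sizeof (rss));
 */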
2943 
2944 static int
2945 zone_set_sched_class(zone_t *zone, const char *new_class)
2946 {
2947         char sched_class[PC_CLNMSZ];
2948         id_t classid;
2949         int err;
2950 
2951         ASSERT(zone != global_zone);
2952         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2953                 return (err);   /* EFAULT or ENAMETOOLONG */
2954 
2955         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2956                 return (set_errno(EINVAL));
2957         zone->zone_defaultcid = classid;
2958         ASSERT(zone->zone_defaultcid > 0 &&
2959             zone->zone_defaultcid < loaded_classes);
2960 
2961         return (0);
2962 }
2963 
2964 /*
2965  * Block indefinitely waiting for (zone_status >= status)
2966  */
2967 void
2968 zone_status_wait(zone_t *zone, zone_status_t status)
2969 {
2970         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2971 
2972         mutex_enter(&zone_status_lock);
2973         while (zone->zone_status < status) {
2974                 cv_wait(&zone->zone_cv, &zone_status_lock);
2975         }
2976         mutex_exit(&zone_status_lock);
2977 }
2978 
2979 /*
2980  * Private CPR-safe version of zone_status_wait().
2981  */
2982 static void
2983 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2984 {
2985         callb_cpr_t cprinfo;
2986 
2987         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2988 
2989         CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2990             str);
2991         mutex_enter(&zone_status_lock);
2992         while (zone->zone_status < status) {
2993                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2994                 cv_wait(&zone->zone_cv, &zone_status_lock);
2995                 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2996         }
2997         /*
2998          * zone_status_lock is implicitly released by the following.
2999          */
3000         CALLB_CPR_EXIT(&cprinfo);
3001 }
3002 
3003 /*
3004  * Block until zone enters requested state or signal is received.  Return (0)
3005  * if signaled, non-zero otherwise.
3006  */
3007 int
3008 zone_status_wait_sig(zone_t *zone, zone_status_t status)
3009 {
3010         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3011 
3012         mutex_enter(&zone_status_lock);
3013         while (zone->zone_status < status) {
3014                 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
3015                         mutex_exit(&zone_status_lock);
3016                         return (0);
3017                 }
3018         }
3019         mutex_exit(&zone_status_lock);
3020         return (1);
3021 }
3022 
3023 /*
3024  * Block until the zone enters the requested state or the timeout expires,
3025  * whichever happens first.  Return (-1) if operation timed out, time remaining
3026  * otherwise.
3027  */
3028 clock_t
3029 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
3030 {
3031         clock_t timeleft = 0;
3032 
3033         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3034 
3035         mutex_enter(&zone_status_lock);
3036         while (zone->zone_status < status && timeleft != -1) {
3037                 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
3038         }
3039         mutex_exit(&zone_status_lock);
3040         return (timeleft);
3041 }
3042 
3043 /*
3044  * Block until the zone enters the requested state, the current process is
3045  * signaled, or the timeout expires, whichever happens first.  Return (-1) if
3046  * operation timed out, 0 if signaled, time remaining otherwise.
3047  */
3048 clock_t
3049 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
3050 {
3051         clock_t timeleft = tim - ddi_get_lbolt();
3052 
3053         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3054 
3055         mutex_enter(&zone_status_lock);
3056         while (zone->zone_status < status) {
3057                 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
3058                     tim);
3059                 if (timeleft <= 0)
3060                         break;
3061         }
3062         mutex_exit(&zone_status_lock);
3063         return (timeleft);
3064 }
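
/*
 * Illustrative sketch: a caller prepared to wait a bounded time for a
 * zone to come up can combine a hold with the timed wait above (the
 * 'tim' argument is an absolute lbolt value):
 *
 *	zone_hold(zone);
 *	if (zone_status_timedwait(zone, ddi_get_lbolt() + SEC_TO_TICK(5),
 *	    ZONE_IS_RUNNING) == -1) {
 *		(timed out; the zone never reached ZONE_IS_RUNNING)
 *	}
 *	zone_rele(zone);
 */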
3065 
3066 /*
3067  * Zones have two reference counts: one for references from credential
3068  * structures (zone_cred_ref), and one (zone_ref) for everything else.
3069  * This is so we can allow a zone to be rebooted while there are still
3070  * outstanding cred references, since certain drivers cache dblks (which
3071  * implicitly results in cached creds).  We wait for zone_ref to drop to
3072  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
3073  * later freed when the zone_cred_ref drops to 0, though nothing other
3074  * than the zone id and privilege set should be accessed once the zone
3075  * is "dead".
3076  *
3077  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
3078  * to force halt/reboot to block waiting for the zone_cred_ref to drop
3079  * to 0.  This can be useful to flush out other sources of cached creds
3080  * that may be less innocuous than the driver case.
3081  *
3082  * Zones also provide a tracked reference counting mechanism in which zone
3083  * references are represented by "crumbs" (zone_ref structures).  Crumbs help
3084  * debuggers determine the sources of leaked zone references.  See
3085  * zone_hold_ref() and zone_rele_ref() below for more information.
3086  */
3087 
3088 int zone_wait_for_cred = 0;
3089 
3090 static void
3091 zone_hold_locked(zone_t *z)
3092 {
3093         ASSERT(MUTEX_HELD(&z->zone_lock));
3094         z->zone_ref++;
3095         ASSERT(z->zone_ref != 0);
3096 }
3097 
3098 /*
3099  * Increment the specified zone's reference count.  The zone's zone_t structure
3100  * will not be freed as long as the zone's reference count is nonzero.
3101  * Decrement the zone's reference count via zone_rele().
3102  *
3103  * NOTE: This function should only be used to hold zones for short periods of
3104  * time.  Use zone_hold_ref() if the zone must be held for a long time.
3105  */
3106 void
3107 zone_hold(zone_t *z)
3108 {
3109         mutex_enter(&z->zone_lock);
3110         zone_hold_locked(z);
3111         mutex_exit(&z->zone_lock);
3112 }
3113 
3114 /*
3115  * If the non-cred ref count drops to 1 and either the cred ref count
3116  * is 0 or we aren't waiting for cred references, the zone is ready to
3117  * be destroyed.
3118  */
3119 #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
3120             (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
3121 
3122 /*
3123  * Common zone reference release function invoked by zone_rele() and
3124  * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
3125  * zone's subsystem-specific reference counters are not affected by the
3126  * release.  If ref is not NULL, then the zone_ref_t to which it refers is
3127  * removed from the specified zone's reference list.  ref must be non-NULL iff
3128  * subsys is not ZONE_REF_NUM_SUBSYS.
3129  */
3130 static void
3131 zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3132 {
3133         boolean_t wakeup;
3134 
3135         mutex_enter(&z->zone_lock);
3136         ASSERT(z->zone_ref != 0);
3137         z->zone_ref--;
3138         if (subsys != ZONE_REF_NUM_SUBSYS) {
3139                 ASSERT(z->zone_subsys_ref[subsys] != 0);
3140                 z->zone_subsys_ref[subsys]--;
3141                 list_remove(&z->zone_ref_list, ref);
3142         }
3143         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3144                 /* no more refs, free the structure */
3145                 mutex_exit(&z->zone_lock);
3146                 zone_free(z);
3147                 return;
3148         }
3149         /* signal zone_destroy so the zone can finish halting */
3150         wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
3151         mutex_exit(&z->zone_lock);
3152 
3153         if (wakeup) {
3154                 /*
3155                  * Grabbing zonehash_lock here effectively synchronizes with
3156                  * zone_destroy() to avoid missed signals.
3157                  */
3158                 mutex_enter(&zonehash_lock);
3159                 cv_broadcast(&zone_destroy_cv);
3160                 mutex_exit(&zonehash_lock);
3161         }
3162 }
3163 
3164 /*
3165  * Decrement the specified zone's reference count.  The specified zone will
3166  * cease to exist after this function returns if the reference count drops to
3167  * zero.  This function should be paired with zone_hold().
3168  */
3169 void
3170 zone_rele(zone_t *z)
3171 {
3172         zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
3173 }
3174 
3175 /*
3176  * Initialize a zone reference structure.  This function must be invoked for
3177  * a reference structure before the structure is passed to zone_hold_ref().
3178  */
3179 void
3180 zone_init_ref(zone_ref_t *ref)
3181 {
3182         ref->zref_zone = NULL;
3183         list_link_init(&ref->zref_linkage);
3184 }
3185 
3186 /*
3187  * Acquire a reference to zone z.  The caller must specify the
3188  * zone_ref_subsys_t constant associated with its subsystem.  The specified
3189  * zone_ref_t structure will represent a reference to the specified zone.  Use
3190  * zone_rele_ref() to release the reference.
3191  *
3192  * The referenced zone_t structure will not be freed as long as the zone_t's
3193  * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
3194  * references.
3195  *
3196  * NOTE: The zone_ref_t structure must be initialized before it is used.
3197  * See zone_init_ref() above.
3198  */
3199 void
3200 zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3201 {
3202         ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
3203 
3204         /*
3205          * Prevent consumers from reusing a reference structure before
3206          * releasing it.
3207          */
3208         VERIFY(ref->zref_zone == NULL);
3209 
3210         ref->zref_zone = z;
3211         mutex_enter(&z->zone_lock);
3212         zone_hold_locked(z);
3213         z->zone_subsys_ref[subsys]++;
3214         ASSERT(z->zone_subsys_ref[subsys] != 0);
3215         list_insert_head(&z->zone_ref_list, ref);
3216         mutex_exit(&z->zone_lock);
3217 }
3218 
3219 /*
3220  * Release the zone reference represented by the specified zone_ref_t.
3221  * The reference is invalid after it's released; however, the zone_ref_t
3222  * structure can be reused without having to invoke zone_init_ref().
3223  * subsys should be the same value that was passed to zone_hold_ref()
3224  * when the reference was acquired.
3225  */
3226 void
3227 zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
3228 {
3229         zone_rele_common(ref->zref_zone, ref, subsys);
3230 
3231         /*
3232          * Set the zone_ref_t's zref_zone field to NULL to generate panics
3233          * when consumers dereference the reference.  This helps us catch
3234          * consumers who use released references.  Furthermore, this lets
3235          * consumers reuse the zone_ref_t structure without having to
3236          * invoke zone_init_ref().
3237          */
3238         ref->zref_zone = NULL;
3239 }
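
/*
 * Illustrative sketch of the tracked ("crumb") pattern: a subsystem that
 * holds a zone for a long time embeds a zone_ref_t and brackets its use
 * with zone_hold_ref()/zone_rele_ref().  ZONE_REF_NFS stands in for the
 * caller's own zone_ref_subsys_t constant.
 *
 *	zone_ref_t ref;
 *
 *	zone_init_ref(&ref);
 *	zone_hold_ref(zone, &ref, ZONE_REF_NFS);
 *	(long-lived use of the zone)
 *	zone_rele_ref(&ref, ZONE_REF_NFS);
 */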
3240 
3241 void
3242 zone_cred_hold(zone_t *z)
3243 {
3244         mutex_enter(&z->zone_lock);
3245         z->zone_cred_ref++;
3246         ASSERT(z->zone_cred_ref != 0);
3247         mutex_exit(&z->zone_lock);
3248 }
3249 
3250 void
3251 zone_cred_rele(zone_t *z)
3252 {
3253         boolean_t wakeup;
3254 
3255         mutex_enter(&z->zone_lock);
3256         ASSERT(z->zone_cred_ref != 0);
3257         z->zone_cred_ref--;
3258         if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3259                 /* no more refs, free the structure */
3260                 mutex_exit(&z->zone_lock);
3261                 zone_free(z);
3262                 return;
3263         }
3264         /*
3265          * If zone_destroy is waiting for the cred references to drain
3266          * out, and they have, signal it.
3267          */
3268         wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
3269             zone_status_get(z) >= ZONE_IS_DEAD);
3270         mutex_exit(&z->zone_lock);
3271 
3272         if (wakeup) {
3273                 /*
3274                  * Grabbing zonehash_lock here effectively synchronizes with
3275                  * zone_destroy() to avoid missed signals.
3276                  */
3277                 mutex_enter(&zonehash_lock);
3278                 cv_broadcast(&zone_destroy_cv);
3279                 mutex_exit(&zonehash_lock);
3280         }
3281 }
3282 
3283 void
3284 zone_task_hold(zone_t *z)
3285 {
3286         mutex_enter(&z->zone_lock);
3287         z->zone_ntasks++;
3288         ASSERT(z->zone_ntasks != 0);
3289         mutex_exit(&z->zone_lock);
3290 }
3291 
3292 void
3293 zone_task_rele(zone_t *zone)
3294 {
3295         uint_t refcnt;
3296 
3297         mutex_enter(&zone->zone_lock);
3298         ASSERT(zone->zone_ntasks != 0);
3299         refcnt = --zone->zone_ntasks;
3300         if (refcnt > 1)      {       /* Common case */
3301                 mutex_exit(&zone->zone_lock);
3302                 return;
3303         }
3304         zone_hold_locked(zone); /* so we can use the zone_t later */
3305         mutex_exit(&zone->zone_lock);
3306         if (refcnt == 1) {
3307                 /*
3308                  * See if the zone is shutting down.
3309                  */
3310                 mutex_enter(&zone_status_lock);
3311                 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
3312                         goto out;
3313                 }
3314 
3315                 /*
3316                  * Make sure the ntasks didn't change since we
3317                  * dropped zone_lock.
3318                  */
3319                 mutex_enter(&zone->zone_lock);
3320                 if (refcnt != zone->zone_ntasks) {
3321                         mutex_exit(&zone->zone_lock);
3322                         goto out;
3323                 }
3324                 mutex_exit(&zone->zone_lock);
3325 
3326                 /*
3327                  * No more user processes in the zone.  The zone is empty.
3328                  */
3329                 zone_status_set(zone, ZONE_IS_EMPTY);
3330                 goto out;
3331         }
3332 
3333         ASSERT(refcnt == 0);
3334         /*
3335          * zsched has exited; the zone is dead.
3336          */
3337         zone->zone_zsched = NULL;            /* paranoia */
3338         mutex_enter(&zone_status_lock);
3339         zone_status_set(zone, ZONE_IS_DEAD);
3340 out:
3341         mutex_exit(&zone_status_lock);
3342         zone_rele(zone);
3343 }
3344 
3345 zoneid_t
3346 getzoneid(void)
3347 {
3348         return (curproc->p_zone->zone_id);
3349 }
3350 
3351 /*
3352  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3353  * check the validity of a zone's state.
3354  */
3355 static zone_t *
3356 zone_find_all_by_id(zoneid_t zoneid)
3357 {
3358         mod_hash_val_t hv;
3359         zone_t *zone = NULL;
3360 
3361         ASSERT(MUTEX_HELD(&zonehash_lock));
3362 
3363         if (mod_hash_find(zonehashbyid,
3364             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3365                 zone = (zone_t *)hv;
3366         return (zone);
3367 }
3368 
3369 static zone_t *
3370 zone_find_all_by_label(const ts_label_t *label)
3371 {
3372         mod_hash_val_t hv;
3373         zone_t *zone = NULL;
3374 
3375         ASSERT(MUTEX_HELD(&zonehash_lock));
3376 
3377         /*
3378          * zonehashbylabel is not maintained for unlabeled systems
3379          */
3380         if (!is_system_labeled())
3381                 return (NULL);
3382         if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3383                 zone = (zone_t *)hv;
3384         return (zone);
3385 }
3386 
3387 static zone_t *
3388 zone_find_all_by_name(char *name)
3389 {
3390         mod_hash_val_t hv;
3391         zone_t *zone = NULL;
3392 
3393         ASSERT(MUTEX_HELD(&zonehash_lock));
3394 
3395         if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3396                 zone = (zone_t *)hv;
3397         return (zone);
3398 }
3399 
3400 /*
3401  * Public interface for looking up a zone by zoneid.  Only returns the zone if
3402  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3403  * Caller must call zone_rele() once it is done with the zone.
3404  *
3405  * The zone may begin the zone_destroy() sequence immediately after this
3406  * function returns, but may be safely used until zone_rele() is called.
3407  */
3408 zone_t *
3409 zone_find_by_id(zoneid_t zoneid)
3410 {
3411         zone_t *zone;
3412         zone_status_t status;
3413 
3414         mutex_enter(&zonehash_lock);
3415         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3416                 mutex_exit(&zonehash_lock);
3417                 return (NULL);
3418         }
3419         status = zone_status_get(zone);
3420         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3421                 /*
3422                  * For all practical purposes the zone doesn't exist.
3423                  */
3424                 mutex_exit(&zonehash_lock);
3425                 return (NULL);
3426         }
3427         zone_hold(zone);
3428         mutex_exit(&zonehash_lock);
3429         return (zone);
3430 }
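
/*
 * Illustrative sketch of the usual lookup pattern; the zone cannot be
 * freed between a successful lookup and the matching zone_rele():
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) == NULL)
 *		return (EINVAL);
 *	(use zone)
 *	zone_rele(zone);
 */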
3431 
3432 /*
3433  * Similar to zone_find_by_id, but using zone label as the key.
3434  */
3435 zone_t *
3436 zone_find_by_label(const ts_label_t *label)
3437 {
3438         zone_t *zone;
3439         zone_status_t status;
3440 
3441         mutex_enter(&zonehash_lock);
3442         if ((zone = zone_find_all_by_label(label)) == NULL) {
3443                 mutex_exit(&zonehash_lock);
3444                 return (NULL);
3445         }
3446 
3447         status = zone_status_get(zone);
3448         if (status > ZONE_IS_DOWN) {
3449                 /*
3450                  * For all practical purposes the zone doesn't exist.
3451                  */
3452                 mutex_exit(&zonehash_lock);
3453                 return (NULL);
3454         }
3455         zone_hold(zone);
3456         mutex_exit(&zonehash_lock);
3457         return (zone);
3458 }
3459 
3460 /*
3461  * Similar to zone_find_by_id, but using zone name as the key.
3462  */
3463 zone_t *
3464 zone_find_by_name(char *name)
3465 {
3466         zone_t *zone;
3467         zone_status_t status;
3468 
3469         mutex_enter(&zonehash_lock);
3470         if ((zone = zone_find_all_by_name(name)) == NULL) {
3471                 mutex_exit(&zonehash_lock);
3472                 return (NULL);
3473         }
3474         status = zone_status_get(zone);
3475         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3476                 /*
3477                  * For all practical purposes the zone doesn't exist.
3478                  */
3479                 mutex_exit(&zonehash_lock);
3480                 return (NULL);
3481         }
3482         zone_hold(zone);
3483         mutex_exit(&zonehash_lock);
3484         return (zone);
3485 }
3486 
3487 /*
3488  * Similar to zone_find_by_id(), using the path as a key.  For instance,
3489  * if there is a zone "foo" rooted at /foo/root, and the path argument
3490  * is "/foo/root/proc", it will return the held zone_t corresponding to
3491  * zone "foo".
3492  *
3493  * zone_find_by_path() always returns a non-NULL value, since at the
3494  * very least every path will be contained in the global zone.
3495  *
3496  * As with the other zone_find_by_*() functions, the caller is
3497  * responsible for zone_rele()ing the return value of this function.
3498  */
3499 zone_t *
3500 zone_find_by_path(const char *path)
3501 {
3502         zone_t *zone;
3503         zone_t *zret = NULL;
3504         zone_status_t status;
3505 
3506         if (path == NULL) {
3507                 /*
3508                  * Call from rootconf().
3509                  */
3510                 zone_hold(global_zone);
3511                 return (global_zone);
3512         }
3513         ASSERT(*path == '/');
3514         mutex_enter(&zonehash_lock);
3515         for (zone = list_head(&zone_active); zone != NULL;
3516             zone = list_next(&zone_active, zone)) {
3517                 if (ZONE_PATH_VISIBLE(path, zone))
3518                         zret = zone;
3519         }
3520         ASSERT(zret != NULL);
3521         status = zone_status_get(zret);
3522         if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3523                 /*
3524                  * Zone practically doesn't exist.
3525                  */
3526                 zret = global_zone;
3527         }
3528         zone_hold(zret);
3529         mutex_exit(&zonehash_lock);
3530         return (zret);
3531 }
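
/*
 * Illustrative sketch: since zone_find_by_path() never returns NULL,
 * callers need no error check, only the matching zone_rele().  Reusing
 * the example above:
 *
 *	zone_t *zone = zone_find_by_path("/foo/root/proc");
 *	(use zone; this is global_zone if no non-global zone matches)
 *	zone_rele(zone);
 */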
3532 
3533 /*
3534  * Public interface for updating per-zone load averages.  Called once per
3535  * second.
3536  *
3537  * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3538  */
3539 void
3540 zone_loadavg_update()
3541 {
3542         zone_t *zp;
3543         zone_status_t status;
3544         struct loadavg_s *lavg;
3545         hrtime_t zone_total;
3546         int i;
3547         hrtime_t hr_avg;
3548         int nrun;
3549         static int64_t f[3] = { 135, 27, 9 };
3550         int64_t q, r;
3551 
3552         mutex_enter(&zonehash_lock);
3553         for (zp = list_head(&zone_active); zp != NULL;
3554             zp = list_next(&zone_active, zp)) {
3555                 mutex_enter(&zp->zone_lock);
3556 
3557                 /* Skip zones that are on the way down or not yet up */
3558                 status = zone_status_get(zp);
3559                 if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3560                         /* For all practical purposes the zone doesn't exist. */
3561                         mutex_exit(&zp->zone_lock);
3562                         continue;
3563                 }
3564 
3565                 /*
3566                  * Update the 10 second moving average data in zone_loadavg.
3567                  */
3568                 lavg = &zp->zone_loadavg;
3569 
3570                 zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3571                 scalehrtime(&zone_total);
3572 
3573                 /* The zone_total should always be increasing. */
3574                 lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3575                     zone_total - lavg->lg_total : 0;
3576                 lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3577                 /* lg_total holds the prev. 1 sec. total */
3578                 lavg->lg_total = zone_total;
3579 
3580                 /*
3581                  * To simplify the calculation, we don't calculate the load avg.
3582                  * until the zone has been up for at least 10 seconds and our
3583                  * moving average is thus full.
3584                  */
3585                 if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3586                         lavg->lg_len++;
3587                         mutex_exit(&zp->zone_lock);
3588                         continue;
3589                 }
3590 
3591                 /* Now calculate the 1-, 5- and 15-minute load averages. */
3592                 hr_avg = 0;
3593                 for (i = 0; i < S_LOADAVG_SZ; i++)
3594                         hr_avg += lavg->lg_loads[i];
3595                 hr_avg = hr_avg / S_LOADAVG_SZ;
3596                 nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3597 
3598                 /* Compute load avg. See comment in calcloadavg() */
3599                 for (i = 0; i < 3; i++) {
3600                         q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3601                         r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3602                         zp->zone_hp_avenrun[i] +=
3603                             ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3604 
3605                         /* avenrun[] can only hold 31 bits of load avg. */
3606                         if (zp->zone_hp_avenrun[i] <
3607                             ((uint64_t)1<<(31+16-FSHIFT)))
3608                                 zp->zone_avenrun[i] = (int32_t)
3609                                     (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3610                         else
3611                                 zp->zone_avenrun[i] = 0x7fffffff;
3612                 }
3613 
3614                 mutex_exit(&zp->zone_lock);
3615         }
3616         mutex_exit(&zonehash_lock);
3617 }
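
/*
 * Illustrative sketch: zone_avenrun[] ends up in the same FSHIFT
 * fixed-point format as the system-wide avenrun[], so a consumer that
 * has read the value (e.g. via kstat) converts it to a conventional
 * load figure by dividing by FSCALE:
 *
 *	double one_minute_load = (double)avenrun_1min / FSCALE;
 */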
3618 
3619 /*
3620  * Get the number of cpus visible to this zone.  The system-wide global
3621  * 'ncpus' is returned if pools are disabled, the caller is in the
3622  * global zone, or a NULL zone argument is passed in.
3623  */
3624 int
3625 zone_ncpus_get(zone_t *zone)
3626 {
3627         int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3628 
3629         return (myncpus != 0 ? myncpus : ncpus);
3630 }
3631 
3632 /*
3633  * Get the number of online cpus visible to this zone.  The system-wide
3634  * global 'ncpus_online' is returned if pools are disabled, the caller
3635  * is in the global zone, or a NULL zone argument is passed in.
3636  */
3637 int
3638 zone_ncpus_online_get(zone_t *zone)
3639 {
3640         int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3641 
3642         return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3643 }
3644 
3645 /*
3646  * Return the pool to which the zone is currently bound.
3647  */
3648 pool_t *
3649 zone_pool_get(zone_t *zone)
3650 {
3651         ASSERT(pool_lock_held());
3652 
3653         return (zone->zone_pool);
3654 }
3655 
3656 /*
3657  * Set the zone's pool pointer and update the zone's visibility to match
3658  * the resources in the new pool.
3659  */
3660 void
3661 zone_pool_set(zone_t *zone, pool_t *pool)
3662 {
3663         ASSERT(pool_lock_held());
3664         ASSERT(MUTEX_HELD(&cpu_lock));
3665 
3666         zone->zone_pool = pool;
3667         zone_pset_set(zone, pool->pool_pset->pset_id);
3668 }
3669 
3670 /*
3671  * Return the cached value of the id of the processor set to which the
3672  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3673  * facility is disabled.
3674  */
3675 psetid_t
3676 zone_pset_get(zone_t *zone)
3677 {
3678         ASSERT(MUTEX_HELD(&cpu_lock));
3679 
3680         return (zone->zone_psetid);
3681 }
3682 
3683 /*
3684  * Set the cached value of the id of the processor set to which the zone
3685  * is currently bound.  Also update the zone's visibility to match the
3686  * resources in the new processor set.
3687  */
3688 void
3689 zone_pset_set(zone_t *zone, psetid_t newpsetid)
3690 {
3691         psetid_t oldpsetid;
3692 
3693         ASSERT(MUTEX_HELD(&cpu_lock));
3694         oldpsetid = zone_pset_get(zone);
3695 
3696         if (oldpsetid == newpsetid)
3697                 return;
3698         /*
3699          * Global zone sees all.
3700          */
3701         if (zone != global_zone) {
3702                 zone->zone_psetid = newpsetid;
3703                 if (newpsetid != ZONE_PS_INVAL)
3704                         pool_pset_visibility_add(newpsetid, zone);
3705                 if (oldpsetid != ZONE_PS_INVAL)
3706                         pool_pset_visibility_remove(oldpsetid, zone);
3707         }
3708         /*
3709          * Disabling pools, so we should start using the global values
3710          * for ncpus and ncpus_online.
3711          */
3712         if (newpsetid == ZONE_PS_INVAL) {
3713                 zone->zone_ncpus = 0;
3714                 zone->zone_ncpus_online = 0;
3715         }
3716 }
3717 
3718 /*
3719  * Walk the list of active zones and issue the provided callback for
3720  * each of them.
3721  *
3722  * Caller must not be holding any locks that may be acquired under
3723  * zonehash_lock.  See comment at the beginning of the file for a list of
3724  * common locks and their interactions with zones.
3725  */
3726 int
3727 zone_walk(int (*cb)(zone_t *, void *), void *data)
3728 {
3729         zone_t *zone;
3730         int ret = 0;
3731         zone_status_t status;
3732 
3733         mutex_enter(&zonehash_lock);
3734         for (zone = list_head(&zone_active); zone != NULL;
3735             zone = list_next(&zone_active, zone)) {
3736                 /*
3737                  * Skip zones that shouldn't be externally visible.
3738                  */
3739                 status = zone_status_get(zone);
3740                 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3741                         continue;
3742                 /*
3743                  * Bail immediately if any callback invocation returns a
3744                  * non-zero value.
3745                  */
3746                 ret = (*cb)(zone, data);
3747                 if (ret != 0)
3748                         break;
3749         }
3750         mutex_exit(&zonehash_lock);
3751         return (ret);
3752 }
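
/*
 * Illustrative sketch: a minimal counting callback.  Returning zero
 * continues the walk; any non-zero return stops it and is handed back
 * to the caller of zone_walk().
 *
 *	static int
 *	count_zones(zone_t *zone, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nzones = 0;
 *	(void) zone_walk(count_zones, &nzones);
 */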
3753 
3754 static int
3755 zone_set_root(zone_t *zone, const char *upath)
3756 {
3757         vnode_t *vp;
3758         int trycount;
3759         int error = 0;
3760         char *path;
3761         struct pathname upn, pn;
3762         size_t pathlen;
3763 
3764         if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3765                 return (error);
3766 
3767         pn_alloc(&pn);
3768 
3769         /* prevent infinite loop */
3770         trycount = 10;
3771         for (;;) {
3772                 if (--trycount <= 0) {
3773                         error = ESTALE;
3774                         goto out;
3775                 }
3776 
3777                 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3778                         /*
3779                          * VOP_ACCESS() may cover 'vp' with a new
3780                          * filesystem, if 'vp' is an autoFS vnode.
3781                          * Get the new 'vp' if so.
3782                          */
3783                         if ((error =
3784                             VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3785                             (!vn_ismntpt(vp) ||
3786                             (error = traverse(&vp)) == 0)) {
3787                                 pathlen = pn.pn_pathlen + 2;
3788                                 path = kmem_alloc(pathlen, KM_SLEEP);
3789                                 (void) strncpy(path, pn.pn_path,
3790                                     pn.pn_pathlen + 1);
3791                                 path[pathlen - 2] = '/';
3792                                 path[pathlen - 1] = '\0';
3793                                 pn_free(&pn);
3794                                 pn_free(&upn);
3795 
3796                                 /* Success! */
3797                                 break;
3798                         }
3799                         VN_RELE(vp);
3800                 }
3801                 if (error != ESTALE)
3802                         goto out;
3803         }
3804 
3805         ASSERT(error == 0);
3806         zone->zone_rootvp = vp;              /* we hold a reference to vp */
3807         zone->zone_rootpath = path;
3808         zone->zone_rootpathlen = pathlen;
3809         if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3810                 zone->zone_flags |= ZF_IS_SCRATCH;
3811         return (0);
3812 
3813 out:
3814         pn_free(&pn);
3815         pn_free(&upn);
3816         return (error);
3817 }
3818 
3819 #define isalnum(c)      (((c) >= '0' && (c) <= '9') || \
3820                         ((c) >= 'a' && (c) <= 'z') || \
3821                         ((c) >= 'A' && (c) <= 'Z'))
3822 
3823 static int
3824 zone_set_name(zone_t *zone, const char *uname)
3825 {
3826         char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3827         size_t len;
3828         int i, err;
3829 
3830         if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3831                 kmem_free(kname, ZONENAME_MAX);
3832                 return (err);   /* EFAULT or ENAMETOOLONG */
3833         }
3834 
3835         /* must be less than ZONENAME_MAX */
3836         if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3837                 kmem_free(kname, ZONENAME_MAX);
3838                 return (EINVAL);
3839         }
3840 
3841         /*
3842          * Name must start with an alphanumeric and must contain only
3843          * alphanumerics, '-', '_' and '.'.
3844          */
3845         if (!isalnum(kname[0])) {
3846                 kmem_free(kname, ZONENAME_MAX);
3847                 return (EINVAL);
3848         }
3849         for (i = 1; i < len - 1; i++) {
3850                 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3851                     kname[i] != '.') {
3852                         kmem_free(kname, ZONENAME_MAX);
3853                         return (EINVAL);
3854                 }
3855         }
3856 
3857         zone->zone_name = kname;
3858         return (0);
3859 }
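
/*
 * For example, the rules above accept names such as "web01", "my-zone"
 * and "z1_test.dev", while rejecting "-foo" (leading non-alphanumeric)
 * and "my zone" (embedded space) with EINVAL.
 */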
3860 
3861 /*
3862  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3863  * is NULL or it points to a zone with no hostid emulation, then the machine's
3864  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3865  * zero if neither the zone nor the host machine (global zone) has a hostid.  It
3866  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3867  * hostid and the machine's hostid is invalid.
3868  */
3869 uint32_t
3870 zone_get_hostid(zone_t *zonep)
3871 {
3872         unsigned long machine_hostid;
3873 
3874         if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3875                 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3876                         return (HW_INVALID_HOSTID);
3877                 return ((uint32_t)machine_hostid);
3878         }
3879         return (zonep->zone_hostid);
3880 }
3881 
3882 /*
3883  * Similar to thread_create(), but makes sure the thread is in the appropriate
3884  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3885  */
3886 /*ARGSUSED*/
3887 kthread_t *
3888 zthread_create(
3889     caddr_t stk,
3890     size_t stksize,
3891     void (*proc)(),
3892     void *arg,
3893     size_t len,
3894     pri_t pri)
3895 {
3896         kthread_t *t;
3897         zone_t *zone = curproc->p_zone;
3898         proc_t *pp = zone->zone_zsched;
3899 
3900         zone_hold(zone);        /* Reference to be dropped when thread exits */
3901 
3902         /*
3903          * No-one should be trying to create threads if the zone is shutting
3904          * down and there aren't any kernel threads around.  See comment
3905          * in zthread_exit().
3906          */
3907         ASSERT(!(zone->zone_kthreads == NULL &&
3908             zone_status_get(zone) >= ZONE_IS_EMPTY));
3909         /*
3910          * Create a thread, but don't let it run until we've finished setting
3911          * things up.
3912          */
3913         t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3914         ASSERT(t->t_forw == NULL);
3915         mutex_enter(&zone_status_lock);
3916         if (zone->zone_kthreads == NULL) {
3917                 t->t_forw = t->t_back = t;
3918         } else {
3919                 kthread_t *tx = zone->zone_kthreads;
3920 
3921                 t->t_forw = tx;
3922                 t->t_back = tx->t_back;
3923                 tx->t_back->t_forw = t;
3924                 tx->t_back = t;
3925         }
3926         zone->zone_kthreads = t;
3927         mutex_exit(&zone_status_lock);
3928 
3929         mutex_enter(&pp->p_lock);
3930         t->t_proc_flag |= TP_ZTHREAD;
3931         project_rele(t->t_proj);
3932         t->t_proj = project_hold(pp->p_task->tk_proj);
3933 
3934         /*
3935          * Setup complete, let it run.
3936          */
3937         thread_lock(t);
3938         t->t_schedflag |= TS_ALLSTART;
3939         setrun_locked(t);
3940         thread_unlock(t);
3941 
3942         mutex_exit(&pp->p_lock);
3943 
3944         return (t);
3945 }
3946 
3947 /*
3948  * Similar to thread_exit().  Must be called by threads created via
3949  * zthread_create().
3950  */
3951 void
3952 zthread_exit(void)
3953 {
3954         kthread_t *t = curthread;
3955         proc_t *pp = curproc;
3956         zone_t *zone = pp->p_zone;
3957 
3958         mutex_enter(&zone_status_lock);
3959 
3960         /*
3961          * Reparent to p0
3962          */
3963         kpreempt_disable();
3964         mutex_enter(&pp->p_lock);
3965         t->t_proc_flag &= ~TP_ZTHREAD;
3966         t->t_procp = &p0;
3967         hat_thread_exit(t);
3968         mutex_exit(&pp->p_lock);
3969         kpreempt_enable();
3970 
3971         if (t->t_back == t) {
3972                 ASSERT(t->t_forw == t);
3973                 /*
3974                  * If the zone is empty, once the thread count
3975                  * goes to zero no further kernel threads can be
3976                  * created.  This is because if the creator is a process
3977                  * in the zone, then it must have exited before the zone
3978                  * state could be set to ZONE_IS_EMPTY.
3979                  * Otherwise, if the creator is a kernel thread in the
3980                  * zone, the thread count is non-zero.
3981                  *
3982                  * This really means that non-zone kernel threads should
3983                  * not create zone kernel threads.
3984                  */
3985                 zone->zone_kthreads = NULL;
3986                 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3987                         zone_status_set(zone, ZONE_IS_DOWN);
3988                         /*
3989                          * Remove any CPU caps on this zone.
3990                          */
3991                         cpucaps_zone_remove(zone);
3992                 }
3993         } else {
3994                 t->t_forw->t_back = t->t_back;
3995                 t->t_back->t_forw = t->t_forw;
3996                 if (zone->zone_kthreads == t)
3997                         zone->zone_kthreads = t->t_forw;
3998         }
3999         mutex_exit(&zone_status_lock);
4000         zone_rele(zone);
4001         thread_exit();
4002         /* NOTREACHED */
4003 }
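
/*
 * Illustrative sketch: a per-zone worker thread.  It is created in the
 * zone's zsched process and must terminate via zthread_exit() so the
 * zone's kthread list and reference count stay balanced.
 *
 *	static void
 *	my_zone_worker(void *arg)
 *	{
 *		(do per-zone work)
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, my_zone_worker, arg, 0,
 *	    minclsyspri);
 */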
4004 
4005 static void
4006 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
4007 {
4008         vnode_t *oldvp;
4009 
4010         /* we're going to hold a reference here to the directory */
4011         VN_HOLD(vp);
4012 
4013         /* update abs cwd/root path; see c2/audit.c */
4014         if (AU_AUDITING())
4015                 audit_chdirec(vp, vpp);
4016 
4017         mutex_enter(&pp->p_lock);
4018         oldvp = *vpp;
4019         *vpp = vp;
4020         mutex_exit(&pp->p_lock);
4021         if (oldvp != NULL)
4022                 VN_RELE(oldvp);
4023 }
4024 
4025 /*
4026  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
4027  */
4028 static int
4029 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
4030 {
4031         nvpair_t *nvp = NULL;
4032         boolean_t priv_set = B_FALSE;
4033         boolean_t limit_set = B_FALSE;
4034         boolean_t action_set = B_FALSE;
4035 
4036         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4037                 const char *name;
4038                 uint64_t ui64;
4039 
4040                 name = nvpair_name(nvp);
4041                 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
4042                         return (EINVAL);
4043                 (void) nvpair_value_uint64(nvp, &ui64);
4044                 if (strcmp(name, "privilege") == 0) {
4045                         /*
4046                          * Currently only privileged values are allowed, but
4047                          * this may change in the future.
4048                          */
4049                         if (ui64 != RCPRIV_PRIVILEGED)
4050                                 return (EINVAL);
4051                         rv->rcv_privilege = ui64;
4052                         priv_set = B_TRUE;
4053                 } else if (strcmp(name, "limit") == 0) {
4054                         rv->rcv_value = ui64;
4055                         limit_set = B_TRUE;
4056                 } else if (strcmp(name, "action") == 0) {
4057                         if (ui64 != RCTL_LOCAL_NOACTION &&
4058                             ui64 != RCTL_LOCAL_DENY)
4059                                 return (EINVAL);
4060                         rv->rcv_flagaction = ui64;
4061                         action_set = B_TRUE;
4062                 } else {
4063                         return (EINVAL);
4064                 }
4065         }
4066 
4067         if (!(priv_set && limit_set && action_set))
4068                 return (EINVAL);
4069         rv->rcv_action_signal = 0;
4070         rv->rcv_action_recipient = NULL;
4071         rv->rcv_action_recip_pid = -1;
4072         rv->rcv_firing_time = 0;
4073 
4074         return (0);
4075 }
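
/*
 * Illustrative sketch: an nvlist that nvlist2rctlval() accepts carries
 * exactly the three uint64 pairs checked above, e.g.:
 *
 *	nvlist_t *nvl;
 *
 *	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvl, "privilege", RCPRIV_PRIVILEGED) == 0);
 *	VERIFY(nvlist_add_uint64(nvl, "limit", 100 * 1024 * 1024) == 0);
 *	VERIFY(nvlist_add_uint64(nvl, "action", RCTL_LOCAL_DENY) == 0);
 */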
4076 
4077 /*
4078  * Non-global zone version of start_init.
4079  */
4080 void
4081 zone_start_init(void)
4082 {
4083         proc_t *p = ttoproc(curthread);
4084         zone_t *z = p->p_zone;
4085 
4086         ASSERT(!INGLOBALZONE(curproc));
4087 
4088         /*
4089          * For all purposes (ZONE_ATTR_INITPID and restart_init),
4090          * storing just the pid of init is sufficient.
4091          */
4092         z->zone_proc_initpid = p->p_pid;
4093 
4094         if (z->zone_setup_app_contract == B_TRUE) {
4095                 /*
4096                  * Normally a process cannot modify its own contract, but we're
4097                  * just starting the zone's init process and its contract is
4098                  * always initialized from the sys_process_tmpl template, so
4099                  * this is the simplest way to setup init's contract to kill
4100          * this is the simplest way to set up init's contract to kill
4101                  */
4102                 p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
4103         }
4104 
4105         /*
4106          * We maintain zone_boot_err so that we can return the cause of the
4107          * failure back to the caller of the zone_boot syscall.
4108          */
4109         p->p_zone->zone_boot_err = start_init_common();
4110 
4111         /*
4112          * We will prevent booting zones from becoming running zones if the
4113          * global zone is shutting down.
4114          */
4115         mutex_enter(&zone_status_lock);
4116         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
4117             ZONE_IS_SHUTTING_DOWN) {
4118                 /*
4119                  * Make sure we are still in the booting state-- we could have
4120                  * raced and already be shutting down, or even further along.
4121                  */
4122                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
4123                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
4124                 }
4125                 mutex_exit(&zone_status_lock);
4126                 /* It's gone bad, dispose of the process */
4127                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
4128                         mutex_enter(&p->p_lock);
4129                         ASSERT(p->p_flag & SEXITLWPS);
4130                         lwp_exit();
4131                 }
4132         } else {
4133                 id_t cid = curthread->t_cid;
4134 
4135                 if (zone_status_get(z) == ZONE_IS_BOOTING)
4136                         zone_status_set(z, ZONE_IS_RUNNING);
4137                 mutex_exit(&zone_status_lock);
4138 
4139                 mutex_enter(&class_lock);
4140                 ASSERT(cid < loaded_classes);
4141                 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
4142                     z->zone_fixed_hipri) {
4143                         /*
4144                          * If the zone is using FX then by default all
4145                          * processes start at the lowest priority and stay
4146                          * there. We provide a mechanism for the zone to
4147                          * indicate that it should run at "high priority". In
4148                          * this case we set up init to run at the highest FX
4149                          * priority (which is one level higher than the
4150                          * non-fixed scheduling classes can use).
4151                          */
4152                         pcparms_t pcparms;
4153 
4154                         pcparms.pc_cid = cid;
4155                         ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
4156                         ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
4157                             FXMAXUPRI;
4158                         ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
4159                             FX_DOUPRILIM | FX_DOUPRI;
4160 
4161                         mutex_enter(&pidlock);
4162                         mutex_enter(&curproc->p_lock);
4163 
4164                         (void) parmsset(&pcparms, curthread);
4165 
4166                         mutex_exit(&curproc->p_lock);
4167                         mutex_exit(&pidlock);
4168                 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
4169                         /*
4170                          * zsched always starts the init lwp at priority
4171                          * minclsyspri - 1. This priority gets set in t_pri and
4172                          * is invalid for RT, but RT never uses t_pri. However
4173                          * t_pri is used by procfs, so we always see processes
4174                          * within an RT zone with an invalid priority value.
4175                          * We fix that up now.
4176                          */
4177                         curthread->t_pri = RTGPPRIO0;
4178                 }
4179                 mutex_exit(&class_lock);
4180 
4181                 /* cause the process to return to userland. */
4182                 lwp_rtt();
4183         }
4184 }
4185 
4186 struct zsched_arg {
4187         zone_t *zone;
4188         nvlist_t *nvlist;
4189 };
4190 
4191 /*
4192  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
4193  * anything to do with scheduling, but rather with the fact that
4194  * per-zone kernel threads are parented to zsched, just like regular
4195  * kernel threads are parented to sched (p0).
4196  *
4197  * zsched is also responsible for launching init for the zone.
4198  */
4199 static void
4200 zsched(void *arg)
4201 {
4202         struct zsched_arg *za = arg;
4203         proc_t *pp = curproc;
4204         proc_t *initp = proc_init;
4205         zone_t *zone = za->zone;
4206         cred_t *cr, *oldcred;
4207         rctl_set_t *set;
4208         rctl_alloc_gp_t *gp;
4209         contract_t *ct = NULL;
4210         task_t *tk, *oldtk;
4211         rctl_entity_p_t e;
4212         kproject_t *pj;
4213 
4214         nvlist_t *nvl = za->nvlist;
4215         nvpair_t *nvp = NULL;
4216 
4217         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4218         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4219         PTOU(pp)->u_argc = 0;
4220         PTOU(pp)->u_argv = NULL;
4221         PTOU(pp)->u_envp = NULL;
4222         PTOU(pp)->u_commpagep = NULL;
4223         closeall(P_FINFO(pp));
4224 
4225         /*
4226          * We are this zone's "zsched" process.  As the zone isn't generally
4227          * visible yet we don't need to grab any locks before initializing its
4228          * zone_proc pointer.
4229          */
4230         zone_hold(zone);  /* this hold is released by zone_destroy() */
4231         zone->zone_zsched = pp;
4232         mutex_enter(&pp->p_lock);
4233         pp->p_zone = zone;
4234         mutex_exit(&pp->p_lock);
4235 
4236         /*
4237          * Disassociate process from its 'parent'; parent ourselves to init
4238          * (pid 1) and change other values as needed.
4239          */
4240         sess_create();
4241 
4242         mutex_enter(&pidlock);
4243         proc_detach(pp);
4244         pp->p_ppid = 1;
4245         pp->p_flag |= SZONETOP;
4246         pp->p_ancpid = 1;
4247         pp->p_parent = initp;
4248         pp->p_psibling = NULL;
4249         if (initp->p_child)
4250                 initp->p_child->p_psibling = pp;
4251         pp->p_sibling = initp->p_child;
4252         initp->p_child = pp;
4253 
4254         /* Decrement what newproc() incremented. */
4255         upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
4256         /*
4257          * Our credentials are about to become kcred-like, so we don't care
4258          * about the caller's ruid.
4259          */
4260         upcount_inc(crgetruid(kcred), zone->zone_id);
4261         mutex_exit(&pidlock);
4262 
4263         /*
4264          * getting out of global zone, so decrement lwp and process counts
4265          */
4266         pj = pp->p_task->tk_proj;
4267         mutex_enter(&global_zone->zone_nlwps_lock);
4268         pj->kpj_nlwps -= pp->p_lwpcnt;
4269         global_zone->zone_nlwps -= pp->p_lwpcnt;
4270         pj->kpj_nprocs--;
4271         global_zone->zone_nprocs--;
4272         mutex_exit(&global_zone->zone_nlwps_lock);
4273 
4274         /*
4275          * Decrement locked memory counts on old zone and project.
4276          */
4277         mutex_enter(&global_zone->zone_mem_lock);
4278         global_zone->zone_locked_mem -= pp->p_locked_mem;
4279         pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4280         mutex_exit(&global_zone->zone_mem_lock);
4281 
4282         /*
4283          * Create and join a new task in project '0' of this zone.
4284          *
4285          * We don't need to call holdlwps() since we know we're the only lwp in
4286          * this process.
4287          *
4288          * task_join() returns with p_lock held.
4289          */
4290         tk = task_create(0, zone);
4291         mutex_enter(&cpu_lock);
4292         oldtk = task_join(tk, 0);
4293 
4294         pj = pp->p_task->tk_proj;
4295 
4296         mutex_enter(&zone->zone_mem_lock);
4297         zone->zone_locked_mem += pp->p_locked_mem;
4298         pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4299         mutex_exit(&zone->zone_mem_lock);
4300 
4301         /*
4302          * add lwp and process counts to zsched's zone, and increment
4303          * project's task and process count due to the task created in
4304          * the above task_create.
4305          */
4306         mutex_enter(&zone->zone_nlwps_lock);
4307         pj->kpj_nlwps += pp->p_lwpcnt;
4308         pj->kpj_ntasks += 1;
4309         zone->zone_nlwps += pp->p_lwpcnt;
4310         pj->kpj_nprocs++;
4311         zone->zone_nprocs++;
4312         mutex_exit(&zone->zone_nlwps_lock);
4313 
4314         mutex_exit(&curproc->p_lock);
4315         mutex_exit(&cpu_lock);
4316         task_rele(oldtk);
4317 
4318         /*
4319          * The process was created by a process in the global zone, hence the
4320          * credentials are wrong.  We might as well have kcred-ish credentials.
4321          */
4322         cr = zone->zone_kcred;
4323         crhold(cr);
4324         mutex_enter(&pp->p_crlock);
4325         oldcred = pp->p_cred;
4326         pp->p_cred = cr;
4327         mutex_exit(&pp->p_crlock);
4328         crfree(oldcred);
4329 
4330         /*
4331          * Hold credentials again (for thread)
4332          */
4333         crhold(cr);
4334 
4335         /*
4336          * p_lwpcnt can't change since this is a kernel process.
4337          */
4338         crset(pp, cr);
4339 
4340         /*
4341          * Chroot
4342          */
4343         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
4344         zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
4345 
4346         /*
4347          * Initialize zone's rctl set.
4348          */
4349         set = rctl_set_create();
4350         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
4351         mutex_enter(&pp->p_lock);
4352         e.rcep_p.zone = zone;
4353         e.rcep_t = RCENTITY_ZONE;
4354         zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
4355         mutex_exit(&pp->p_lock);
4356         rctl_prealloc_destroy(gp);
4357 
4358         /*
4359          * Apply the rctls passed in to zone_create().  This is basically a list
4360          * assignment: all of the old values are removed and the new ones
4361          * inserted.  That is, if an empty list is passed in, all values are
4362          * removed.
4363          */
4364         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4365                 rctl_dict_entry_t *rde;
4366                 rctl_hndl_t hndl;
4367                 char *name;
4368                 nvlist_t **nvlarray;
4369                 uint_t i, nelem;
4370                 int error;      /* For ASSERT()s */
4371 
4372                 name = nvpair_name(nvp);
4373                 hndl = rctl_hndl_lookup(name);
4374                 ASSERT(hndl != -1);
4375                 rde = rctl_dict_lookup_hndl(hndl);
4376                 ASSERT(rde != NULL);
4377 
4378                 for (; /* ever */; ) {
4379                         rctl_val_t oval;
4380 
4381                         mutex_enter(&pp->p_lock);
4382                         error = rctl_local_get(hndl, NULL, &oval, pp);
4383                         mutex_exit(&pp->p_lock);
4384                         ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4385                         ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4386                         if (oval.rcv_privilege == RCPRIV_SYSTEM)
4387                                 break;
4388                         mutex_enter(&pp->p_lock);
4389                         error = rctl_local_delete(hndl, &oval, pp);
4390                         mutex_exit(&pp->p_lock);
4391                         ASSERT(error == 0);
4392                 }
4393                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4394                 ASSERT(error == 0);
4395                 for (i = 0; i < nelem; i++) {
4396                         rctl_val_t *nvalp;
4397 
4398                         nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4399                         error = nvlist2rctlval(nvlarray[i], nvalp);
4400                         ASSERT(error == 0);
4401                         /*
4402                          * rctl_local_insert can fail if the value being
4403                          * inserted is a duplicate; this is OK.
4404                          */
4405                         mutex_enter(&pp->p_lock);
4406                         if (rctl_local_insert(hndl, nvalp, pp) != 0)
4407                                 kmem_cache_free(rctl_val_cache, nvalp);
4408                         mutex_exit(&pp->p_lock);
4409                 }
4410         }
4411         /*
4412          * Tell the world that we're done setting up.
4413          *
4414          * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4415          * and atomically set the zone's processor set visibility.  Once
4416          * we drop pool_lock() this zone will automatically get updated
4417          * to reflect any future changes to the pools configuration.
4418          *
4419          * Note that after we drop the locks below (zonehash_lock in
4420          * particular) other operations such as a zone_getattr call can
4421          * now proceed and observe the zone. That is the reason for doing a
4422          * state transition to the INITIALIZED state.
4423          */
4424         pool_lock();
4425         mutex_enter(&cpu_lock);
4426         mutex_enter(&zonehash_lock);
4427         zone_uniqid(zone);
4428         zone_zsd_configure(zone);
4429         if (pool_state == POOL_ENABLED)
4430                 zone_pset_set(zone, pool_default->pool_pset->pset_id);
4431         mutex_enter(&zone_status_lock);
4432         ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4433         zone_status_set(zone, ZONE_IS_INITIALIZED);
4434         mutex_exit(&zone_status_lock);
4435         mutex_exit(&zonehash_lock);
4436         mutex_exit(&cpu_lock);
4437         pool_unlock();
4438 
4439         /* Now call the create callback for this key */
4440         zsd_apply_all_keys(zsd_apply_create, zone);
4441 
4442         /* The callbacks are complete. Mark ZONE_IS_READY */
4443         mutex_enter(&zone_status_lock);
4444         ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4445         zone_status_set(zone, ZONE_IS_READY);
4446         mutex_exit(&zone_status_lock);
4447 
4448         /*
4449          * Once we see the zone transition to the ZONE_IS_BOOTING state,
4450          * we launch init, and set the state to running.
4451          */
4452         zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4453 
4454         if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4455                 id_t cid;
4456 
4457                 /*
4458                  * Ok, this is a little complicated.  We need to grab the
4459                  * zone's pool's scheduling class ID; note that by now, we
4460                  * are already bound to a pool if we need to be (zoneadmd
4461                  * will have done that to us while we're in the READY
4462                  * state).  *But* the scheduling class for the zone's 'init'
4463                  * must be explicitly passed to newproc, which doesn't
4464                  * respect pool bindings.
4465                  *
4466                  * We hold the pool_lock across the call to newproc() to
4467                  * close the obvious race: the pool's scheduling class
4468                  * could change before we manage to create the LWP with
4469                  * classid 'cid'.
4470                  */
4471                 pool_lock();
4472                 if (zone->zone_defaultcid > 0)
4473                         cid = zone->zone_defaultcid;
4474                 else
4475                         cid = pool_get_class(zone->zone_pool);
4476                 if (cid == -1)
4477                         cid = defaultcid;
4478 
4479                 /*
4480                  * If this fails, zone_boot will ultimately fail.  The
4481                  * state of the zone will be set to SHUTTING_DOWN; userland
4482                  * will have to tear down the zone and either fail or try again.
4483                  */
4484                 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4485                     minclsyspri - 1, &ct, 0)) != 0) {
4486                         mutex_enter(&zone_status_lock);
4487                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4488                         mutex_exit(&zone_status_lock);
4489                 } else {
4490                         zone->zone_boot_time = gethrestime_sec();
4491                 }
4492 
4493                 pool_unlock();
4494         }
4495 
4496         /*
4497          * Wait for zone_destroy() to be called.  This is what we spend
4498          * most of our life doing.
4499          */
4500         zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4501 
4502         if (ct)
4503                 /*
4504                  * At this point the process contract should be empty.
4505                  * (Though if it isn't, it's not the end of the world.)
4506                  */
4507                 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4508 
4509         /*
4510          * Allow kcred to be freed when all referring processes
4511          * (including this one) go away.  We can't just do this in
4512          * zone_free because we need to wait for the zone_cred_ref to
4513          * drop to 0 before calling zone_free, and the existence of
4514          * zone_kcred will prevent that.  Thus, we call crfree here to
4515          * balance the crdup in zone_create.  The crhold calls earlier
4516          * in zsched will be dropped when the thread and process exit.
4517          */
4518         crfree(zone->zone_kcred);
4519         zone->zone_kcred = NULL;
4520 
4521         exit(CLD_EXITED, 0);
4522 }
4523 
4524 /*
4525  * Helper function to determine if there are any submounts of the
4526  * provided path.  Used to make sure the zone doesn't "inherit" any
4527  * mounts from before it is created.
4528  */
4529 static uint_t
4530 zone_mount_count(const char *rootpath)
4531 {
4532         vfs_t *vfsp;
4533         uint_t count = 0;
4534         size_t rootpathlen = strlen(rootpath);
4535 
4536         /*
4537          * Holding zonehash_lock prevents race conditions with
4538          * vfs_list_add()/vfs_list_remove() since we serialize with
4539          * zone_find_by_path().
4540          */
4541         ASSERT(MUTEX_HELD(&zonehash_lock));
4542         /*
4543          * The rootpath must end with a '/'
4544          */
4545         ASSERT(rootpath[rootpathlen - 1] == '/');
4546 
4547         /*
4548          * This intentionally does not count the rootpath itself if that
4549          * happens to be a mount point.
4550          */
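             /*
              * For example (hypothetical paths): with rootpath
              * "/zones/z1/root/", a mount at "/zones/z1/root/proc" matches
              * the strncmp() below and is counted, but a mount at
              * "/zones/z1/root" itself does not match, since its mountpoint
              * lacks the trailing '/' and so differs within rootpathlen
              * bytes.
              */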
4551         vfs_list_read_lock();
4552         vfsp = rootvfs;
4553         do {
4554                 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4555                     rootpathlen) == 0)
4556                         count++;
4557                 vfsp = vfsp->vfs_next;
4558         } while (vfsp != rootvfs);
4559         vfs_list_unlock();
4560         return (count);
4561 }
4562 
4563 /*
4564  * Helper function to make sure that a zone created on 'rootpath'
4565  * wouldn't end up containing other zones' rootpaths.
4566  */
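     /*
      * For example (hypothetical paths), a create rooted at
      * "/zones/a/root/sub/" fails if a zone rooted at "/zones/a/root/"
      * already exists, and vice versa: the strncmp() over
      * MIN(rootpathlen, len) matches whichever path is a prefix of the
      * other.
      */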
4567 static boolean_t
4568 zone_is_nested(const char *rootpath)
4569 {
4570         zone_t *zone;
4571         size_t rootpathlen = strlen(rootpath);
4572         size_t len;
4573 
4574         ASSERT(MUTEX_HELD(&zonehash_lock));
4575 
4576         /*
4577          * zone_set_root() appended '/' and '\0' at the end of rootpath
4578          */
4579         if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4580             (rootpath[1] == '/') && (rootpath[2] == '\0'))
4581                 return (B_TRUE);
4582 
4583         for (zone = list_head(&zone_active); zone != NULL;
4584             zone = list_next(&zone_active, zone)) {
4585                 if (zone == global_zone)
4586                         continue;
4587                 len = strlen(zone->zone_rootpath);
4588                 if (strncmp(rootpath, zone->zone_rootpath,
4589                     MIN(rootpathlen, len)) == 0)
4590                         return (B_TRUE);
4591         }
4592         return (B_FALSE);
4593 }
4594 
4595 static int
4596 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4597     size_t zone_privssz)
4598 {
4599         priv_set_t *privs;
4600 
4601         if (zone_privssz < sizeof (priv_set_t))
4602                 return (ENOMEM);
4603 
4604         privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4605 
4606         if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4607                 kmem_free(privs, sizeof (priv_set_t));
4608                 return (EFAULT);
4609         }
4610 
4611         zone->zone_privset = privs;
4612         return (0);
4613 }
4614 
4615 /*
4616  * We make creative use of nvlists to pass in rctls from userland.  The list is
4617  * a list of the following structures:
4618  *
4619  * (name = rctl_name, value = nvpair_list_array)
4620  *
4621  * Where each element of the nvpair_list_array is of the form:
4622  *
4623  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4624  *      (name = "limit", value = uint64_t),
4625  *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4626  */
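     /*
      * As an illustrative sketch (not the authoritative zoneadmd code;
      * error handling omitted, and the limit value is hypothetical),
      * userland could build and pack such a list for the real
      * zone.max-lwps rctl like this:
      *
      *      nvlist_t *nvl, *rv;
      *      char *buf = NULL;
      *      size_t buflen = 0;
      *
      *      (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
      *      (void) nvlist_alloc(&rv, NV_UNIQUE_NAME, 0);
      *      (void) nvlist_add_uint64(rv, "privilege", RCPRIV_PRIVILEGED);
      *      (void) nvlist_add_uint64(rv, "limit", 1000);
      *      (void) nvlist_add_uint64(rv, "action", RCTL_LOCAL_DENY);
      *      (void) nvlist_add_nvlist_array(nvl, "zone.max-lwps", &rv, 1);
      *      (void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
      *
      * The packed (buf, buflen) pair is what reaches this function as
      * (ubuf, buflen).
      */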
4627 static int
4628 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4629 {
4630         nvpair_t *nvp = NULL;
4631         nvlist_t *nvl = NULL;
4632         char *kbuf;
4633         int error;
4634         rctl_val_t rv;
4635 
4636         *nvlp = NULL;
4637 
4638         if (buflen == 0)
4639                 return (0);
4640 
4641         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4642                 return (ENOMEM);
4643         if (copyin(ubuf, kbuf, buflen)) {
4644                 error = EFAULT;
4645                 goto out;
4646         }
4647         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4648                 /*
4649                  * nvl may have been allocated and freed by nvlist_unpack(),
4650                  * with the pointer left set to non-NULL, so we reset it here.
4651                  */
4652                 nvl = NULL;
4653                 error = EINVAL;
4654                 goto out;
4655         }
4656         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4657                 rctl_dict_entry_t *rde;
4658                 rctl_hndl_t hndl;
4659                 nvlist_t **nvlarray;
4660                 uint_t i, nelem;
4661                 char *name;
4662 
4663                 error = EINVAL;
4664                 name = nvpair_name(nvp);
4665                 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4666                     strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4667                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4668                         goto out;
4669                 }
4670                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4671                         goto out;
4672                 }
4673                 rde = rctl_dict_lookup_hndl(hndl);
4674                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4675                 ASSERT(error == 0);
4676                 for (i = 0; i < nelem; i++) {
4677                         if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
4678                                 goto out;
4679                         if (rctl_invalid_value(rde, &rv)) {
4680                                 error = EINVAL;
4681                                 goto out;
4682                         }
4683                 }
4684         }
4685         error = 0;
4686         *nvlp = nvl;
4687 out:
4688         kmem_free(kbuf, buflen);
4689         if (error && nvl != NULL)
4690                 nvlist_free(nvl);
4691         return (error);
4692 }
4693 
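     /*
      * Convey an error to zone_create()'s caller: if the caller supplied a
      * buffer for extended error information, copy out the extended code
      * (e.g. ZE_CHROOTED or ZE_AREMOUNTS, as used below) before failing
      * with the primary errno.
      */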
4694 int
4695 zone_create_error(int er_error, int er_ext, int *er_out)
     {
4696         if (er_out != NULL) {
4697                 if (copyout(&er_ext, er_out, sizeof (int))) {
4698                         return (set_errno(EFAULT));
4699                 }
4700         }
4701         return (set_errno(er_error));
4702 }
4703 
4704 static int
4705 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4706 {
4707         ts_label_t *tsl;
4708         bslabel_t blab;
4709 
4710         /* Get label from user */
4711         if (copyin(lab, &blab, sizeof (blab)) != 0)
4712                 return (EFAULT);
4713         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4714         if (tsl == NULL)
4715                 return (ENOMEM);
4716 
4717         zone->zone_slabel = tsl;
4718         return (0);
4719 }
4720 
4721 /*
4722  * Parses a comma-separated list of ZFS datasets into the per-zone dataset list.
4723  */
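     /*
      * For example (hypothetical dataset names), the buffer
      * "tank/zone1,tank/shared" yields two zone_dataset_t entries,
      * "tank/zone1" and "tank/shared", on the zone_datasets list.
      */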
4724 static int
4725 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4726 {
4727         char *kbuf;
4728         char *dataset, *next;
4729         zone_dataset_t *zd;
4730         size_t len;
4731 
4732         if (ubuf == NULL || buflen == 0)
4733                 return (0);
4734 
4735         if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4736                 return (ENOMEM);
4737 
4738         if (copyin(ubuf, kbuf, buflen) != 0) {
4739                 kmem_free(kbuf, buflen);
4740                 return (EFAULT);
4741         }
4742 
4743         dataset = next = kbuf;
4744         for (;;) {
4745                 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4746 
4747                 next = strchr(dataset, ',');
4748 
4749                 if (next == NULL)
4750                         len = strlen(dataset);
4751                 else
4752                         len = next - dataset;
4753 
4754                 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4755                 bcopy(dataset, zd->zd_dataset, len);
4756                 zd->zd_dataset[len] = '\0';
4757 
4758                 list_insert_head(&zone->zone_datasets, zd);
4759 
4760                 if (next == NULL)
4761                         break;
4762 
4763                 dataset = next + 1;
4764         }
4765 
4766         kmem_free(kbuf, buflen);
4767         return (0);
4768 }
4769 
4770 /*
4771  * System call to create/initialize a new zone named 'zone_name', rooted
4772  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4773  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4774  * with labeling set by 'match', 'doi', and 'label'.
4775  *
4776  * If extended error is non-null, we may use it to return more detailed
4777  * error information.
4778  */
4779 static zoneid_t
4780 zone_create(const char *zone_name, const char *zone_root,
4781     const priv_set_t *zone_privs, size_t zone_privssz,
4782     caddr_t rctlbuf, size_t rctlbufsz,
4783     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4784     int match, uint32_t doi, const bslabel_t *label,
4785     int flags)
4786 {
4787         struct zsched_arg zarg;
4788         nvlist_t *rctls = NULL;
4789         proc_t *pp = curproc;
4790         zone_t *zone, *ztmp;
4791         zoneid_t zoneid;
4792         int error;
4793         int error2 = 0;
4794         char *str;
4795         cred_t *zkcr;
4796         boolean_t insert_label_hash;
4797 
4798         if (secpolicy_zone_config(CRED()) != 0)
4799                 return (set_errno(EPERM));
4800 
4801         /* can't create a zone from within a chroot environment */
4802         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4803                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4804                     extended_error));
4805 
4806         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4807         zoneid = zone->zone_id = id_alloc(zoneid_space);
4808         zone->zone_status = ZONE_IS_UNINITIALIZED;
4809         zone->zone_pool = pool_default;
4810         zone->zone_pool_mod = gethrtime();
4811         zone->zone_psetid = ZONE_PS_INVAL;
4812         zone->zone_ncpus = 0;
4813         zone->zone_ncpus_online = 0;
4814         zone->zone_restart_init = B_TRUE;
4815         zone->zone_reboot_on_init_exit = B_FALSE;
4816         zone->zone_init_status = -1;
4817         zone->zone_brand = &native_brand;
4818         zone->zone_initname = NULL;
4819         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4820         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4821         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4822         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4823         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4824             offsetof(zone_ref_t, zref_linkage));
4825         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4826             offsetof(struct zsd_entry, zsd_linkage));
4827         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4828             offsetof(zone_dataset_t, zd_linkage));
4829         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4830             offsetof(zone_dl_t, zdl_linkage));
4831         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4832         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4833 
4834         if (flags & ZCF_NET_EXCL) {
4835                 zone->zone_flags |= ZF_NET_EXCL;
4836         }
4837 
4838         if ((error = zone_set_name(zone, zone_name)) != 0) {
4839                 zone_free(zone);
4840                 return (zone_create_error(error, 0, extended_error));
4841         }
4842 
4843         if ((error = zone_set_root(zone, zone_root)) != 0) {
4844                 zone_free(zone);
4845                 return (zone_create_error(error, 0, extended_error));
4846         }
4847         if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4848                 zone_free(zone);
4849                 return (zone_create_error(error, 0, extended_error));
4850         }
4851 
4852         /* initialize node name to be the same as zone name */
4853         zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4854         (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4855         zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4856 
4857         zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4858         zone->zone_domain[0] = '\0';
4859         zone->zone_hostid = HW_INVALID_HOSTID;
4860         zone->zone_shares = 1;
4861         zone->zone_shmmax = 0;
4862         zone->zone_ipc.ipcq_shmmni = 0;
4863         zone->zone_ipc.ipcq_semmni = 0;
4864         zone->zone_ipc.ipcq_msgmni = 0;
4865         zone->zone_bootargs = NULL;
4866         zone->zone_fs_allowed = NULL;
4867         zone->zone_initname =
4868             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4869         (void) strcpy(zone->zone_initname, zone_default_initname);
4870         zone->zone_nlwps = 0;
4871         zone->zone_nlwps_ctl = INT_MAX;
4872         zone->zone_nprocs = 0;
4873         zone->zone_nprocs_ctl = INT_MAX;
4874         zone->zone_locked_mem = 0;
4875         zone->zone_locked_mem_ctl = UINT64_MAX;
4876         zone->zone_max_swap = 0;
4877         zone->zone_max_swap_ctl = UINT64_MAX;
4878         zone->zone_phys_mem = 0;
4879         zone->zone_phys_mem_ctl = UINT64_MAX;
4880         zone->zone_max_lofi = 0;
4881         zone->zone_max_lofi_ctl = UINT64_MAX;
4882         zone->zone_lockedmem_kstat = NULL;
4883         zone->zone_swapresv_kstat = NULL;
4884         zone->zone_physmem_kstat = NULL;
4885         zone->zone_zfs_io_pri = 1;
4886 
4887         /*
4888          * Zsched initializes the rctls.
4889          */
4890         zone->zone_rctls = NULL;
4891 
4892         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4893                 zone_free(zone);
4894                 return (zone_create_error(error, 0, extended_error));
4895         }
4896 
4897         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4898                 zone_free(zone);
4899                 return (set_errno(error));
4900         }
4901 
4902         /*
4903          * Read in the trusted system parameters:
4904          * match flag and sensitivity label.
4905          */
4906         zone->zone_match = match;
4907         if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4908                 /* Fail if requested to set doi to anything but system's doi */
4909                 if (doi != 0 && doi != default_doi) {
4910                         zone_free(zone);
4911                         return (set_errno(EINVAL));
4912                 }
4913                 /* Always apply system's doi to the zone */
4914                 error = zone_set_label(zone, label, default_doi);
4915                 if (error != 0) {
4916                         zone_free(zone);
4917                         return (set_errno(error));
4918                 }
4919                 insert_label_hash = B_TRUE;
4920         } else {
4921                 /* all zones get an admin_low label if system is not labeled */
4922                 zone->zone_slabel = l_admin_low;
4923                 label_hold(l_admin_low);
4924                 insert_label_hash = B_FALSE;
4925         }
4926 
4927         /*
4928          * Stop all lwps since that's what normally happens as part of fork().
4929          * This needs to happen before we grab any locks to avoid deadlock
4930          * (another lwp in the process could be waiting for the held lock).
4931          */
4932         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4933                 zone_free(zone);
4934                 nvlist_free(rctls);
                     /* "error" is 0 (stale) here; fail as fork() does */
4935                 return (zone_create_error(EINTR, 0, extended_error));
4936         }
4937 
4938         if (block_mounts(zone) == 0) {
4939                 mutex_enter(&pp->p_lock);
4940                 if (curthread != pp->p_agenttp)
4941                         continuelwps(pp);
4942                 mutex_exit(&pp->p_lock);
4943                 zone_free(zone);
4944                 nvlist_free(rctls);
                     /* a failed block_mounts() means we were signaled */
4945                 return (zone_create_error(EINTR, 0, extended_error));
4946         }
4947 
4948         /*
4949          * Set up credential for kernel access.  After this, any errors
4950          * should go through the dance in errout rather than calling
4951          * zone_free directly.
4952          */
4953         zone->zone_kcred = crdup(kcred);
4954         crsetzone(zone->zone_kcred, zone);
4955         priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4956         priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4957         priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4958         priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4959 
4960         mutex_enter(&zonehash_lock);
4961         /*
4962          * Make sure zone doesn't already exist.
4963          *
4964          * If the system and zone are labeled,
4965          * make sure no other zone exists that has the same label.
4966          */
4967         if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4968             (insert_label_hash &&
4969             (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4970                 zone_status_t status;
4971 
4972                 status = zone_status_get(ztmp);
4973                 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4974                         error = EEXIST;
4975                 else
4976                         error = EBUSY;
4977 
4978                 if (insert_label_hash)
4979                         error2 = ZE_LABELINUSE;
4980 
4981                 goto errout;
4982         }
4983 
4984         /*
4985          * Don't allow zone creations which would cause one zone's rootpath to
4986          * be accessible from that of another (non-global) zone.
4987          */
4988         if (zone_is_nested(zone->zone_rootpath)) {
4989                 error = EBUSY;
4990                 goto errout;
4991         }
4992 
4993         ASSERT(zonecount != 0);         /* check for leaks */
4994         if (zonecount + 1 > maxzones) {
4995                 error = ENOMEM;
4996                 goto errout;
4997         }
4998 
4999         if (zone_mount_count(zone->zone_rootpath) != 0) {
5000                 error = EBUSY;
5001                 error2 = ZE_AREMOUNTS;
5002                 goto errout;
5003         }
5004 
5005         /*
5006          * Zone is still incomplete, but we need to drop all locks while
5007          * zsched() initializes this zone's kernel process.  We
5008          * optimistically add the zone to the hashtable and associated
5009          * lists so a parallel zone_create() doesn't try to create the
5010          * same zone.
5011          */
5012         zonecount++;
5013         (void) mod_hash_insert(zonehashbyid,
5014             (mod_hash_key_t)(uintptr_t)zone->zone_id,
5015             (mod_hash_val_t)(uintptr_t)zone);
5016         str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
5017         (void) strcpy(str, zone->zone_name);
5018         (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
5019             (mod_hash_val_t)(uintptr_t)zone);
5020         if (insert_label_hash) {
5021                 (void) mod_hash_insert(zonehashbylabel,
5022                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5023                 zone->zone_flags |= ZF_HASHED_LABEL;
5024         }
5025 
5026         /*
5027          * Insert into active list.  At this point there are no 'hold's
5028          * on the zone, but everyone else knows not to use it, so we can
5029          * continue to use it.  zsched() will do a zone_hold() if the
5030          * newproc() is successful.
5031          */
5032         list_insert_tail(&zone_active, zone);
5033         mutex_exit(&zonehash_lock);
5034 
5035         zarg.zone = zone;
5036         zarg.nvlist = rctls;
5037         /*
5038          * The process, task, and project rctls are probably wrong;
5039          * we need an interface to get the default values of all rctls,
5040          * and initialize zsched appropriately. However, we allow zoneadmd
5041          * to pass down both zone and project rctls for the zone's init.
5042          */
5043         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5044         if (error != 0) {
5045                 /*
5046                  * We need to undo all globally visible state.
5047                  */
5048                 mutex_enter(&zonehash_lock);
5049                 list_remove(&zone_active, zone);
5050                 if (zone->zone_flags & ZF_HASHED_LABEL) {
5051                         ASSERT(zone->zone_slabel != NULL);
5052                         (void) mod_hash_destroy(zonehashbylabel,
5053                             (mod_hash_key_t)zone->zone_slabel);
5054                 }
5055                 (void) mod_hash_destroy(zonehashbyname,
5056                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
5057                 (void) mod_hash_destroy(zonehashbyid,
5058                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
5059                 ASSERT(zonecount > 1);
5060                 zonecount--;
5061                 goto errout;
5062         }
5063 
5064         /*
5065          * Zone creation can't fail from now on.
5066          */
5067 
5068         /*
5069          * Create zone kstats
5070          */
5071         zone_kstat_create(zone);
5072 
5073         /*
5074          * Let the other lwps continue.
5075          */
5076         mutex_enter(&pp->p_lock);
5077         if (curthread != pp->p_agenttp)
5078                 continuelwps(pp);
5079         mutex_exit(&pp->p_lock);
5080 
5081         /*
5082          * Wait for zsched to finish initializing the zone.
5083          */
5084         zone_status_wait(zone, ZONE_IS_READY);
5085         /*
5086          * The zone is fully visible, so we can let mounts progress.
5087          */
5088         resume_mounts(zone);
5089         nvlist_free(rctls);
5090 
5091         return (zoneid);
5092 
5093 errout:
5094         mutex_exit(&zonehash_lock);
5095         /*
5096          * Let the other lwps continue.
5097          */
5098         mutex_enter(&pp->p_lock);
5099         if (curthread != pp->p_agenttp)
5100                 continuelwps(pp);
5101         mutex_exit(&pp->p_lock);
5102 
5103         resume_mounts(zone);
5104         nvlist_free(rctls);
5105         /*
5106          * There is currently one reference to the zone, a cred_ref from
5107          * zone_kcred.  To free the zone, we call crfree, which will call
5108          * zone_cred_rele, which will call zone_free.
5109          */
5110         ASSERT(zone->zone_cred_ref == 1);
5111         ASSERT(zone->zone_kcred->cr_ref == 1);
5112         ASSERT(zone->zone_ref == 0);
5113         zkcr = zone->zone_kcred;
5114         zone->zone_kcred = NULL;
5115         crfree(zkcr);                           /* triggers call to zone_free */
5116         return (zone_create_error(error, error2, extended_error));
5117 }
5118 
5119 /*
5120  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
5121  * the heavy lifting.  The program launched at the "top" of the zone is
5122  * whatever zone_initname names; zone_create() presets it to the system
5123  * default stored at zone_default_initname.
5124  */
5125 static int
5126 zone_boot(zoneid_t zoneid)
5127 {
5128         int err;
5129         zone_t *zone;
5130 
5131         if (secpolicy_zone_config(CRED()) != 0)
5132                 return (set_errno(EPERM));
5133         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5134                 return (set_errno(EINVAL));
5135 
5136         mutex_enter(&zonehash_lock);
5137         /*
5138          * Look for zone under hash lock to prevent races with calls to
5139          * zone_shutdown, zone_destroy, etc.
5140          */
5141         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5142                 mutex_exit(&zonehash_lock);
5143                 return (set_errno(EINVAL));
5144         }
5145 
5146         mutex_enter(&zone_status_lock);
5147         if (zone_status_get(zone) != ZONE_IS_READY) {
5148                 mutex_exit(&zone_status_lock);
5149                 mutex_exit(&zonehash_lock);
5150                 return (set_errno(EINVAL));
5151         }
5152         zone_status_set(zone, ZONE_IS_BOOTING);
5153         mutex_exit(&zone_status_lock);
5154 
5155         zone_hold(zone);        /* so we can use the zone_t later */
5156         mutex_exit(&zonehash_lock);
5157 
5158         if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
5159                 zone_rele(zone);
5160                 return (set_errno(EINTR));
5161         }
5162 
5163         /*
5164          * Boot (starting init) might have failed, in which case the zone
5165          * will go to the SHUTTING_DOWN state; an appropriate errno will
5166          * be placed in zone->zone_boot_err, and so we return that.
5167          */
5168         err = zone->zone_boot_err;
5169         zone_rele(zone);
5170         return (err ? set_errno(err) : 0);
5171 }
5172 
5173 /*
5174  * Kills all user processes in the zone, waiting for them all to exit
5175  * before returning.
5176  */
5177 static int
5178 zone_empty(zone_t *zone)
5179 {
5180         int waitstatus;
5181 
5182         /*
5183          * We need to drop zonehash_lock before killing all
5184          * processes, otherwise we'll deadlock with zone_find_*
5185          * which can be called from the exit path.
5186          */
5187         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5188         while ((waitstatus = zone_status_timedwait_sig(zone,
5189             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5190                 killall(zone->zone_id);
5191         }
5192         /*
5193          * return EINTR if we were signaled
5194          */
5195         if (waitstatus == 0)
5196                 return (EINTR);
5197         return (0);
5198 }
5199 
5200 /*
5201  * This function implements the policy for zone visibility.
5202  *
5203  * In standard Solaris, a non-global zone can only see itself.
5204  *
5205  * In Trusted Extensions, a labeled zone can look up any zone whose label
5206  * it dominates. For this test, the label of the global zone is treated as
5207  * admin_high so it is special-cased instead of being checked for dominance.
5208  *
5209  * Returns true if zone attributes are viewable, false otherwise.
5210  */
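     /*
      * For example (hypothetical labels), a zone running at CONFIDENTIAL
      * can look up a zone running at PUBLIC when CONFIDENTIAL dominates
      * PUBLIC, while the PUBLIC zone cannot see the CONFIDENTIAL one.
      */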
5211 static boolean_t
5212 zone_list_access(zone_t *zone)
5213 {
5214 
5215         if (curproc->p_zone == global_zone ||
5216             curproc->p_zone == zone) {
5217                 return (B_TRUE);
5218         } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5219                 bslabel_t *curproc_label;
5220                 bslabel_t *zone_label;
5221 
5222                 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
5223                 zone_label = label2bslabel(zone->zone_slabel);
5224 
5225                 if (zone->zone_id != GLOBAL_ZONEID &&
5226                     bldominates(curproc_label, zone_label)) {
5227                         return (B_TRUE);
5228                 } else {
5229                         return (B_FALSE);
5230                 }
5231         } else {
5232                 return (B_FALSE);
5233         }
5234 }
5235 
5236 /*
5237  * Systemcall to start the zone's halt sequence.  By the time this
5238  * function successfully returns, all user processes and kernel threads
5239  * executing in it will have exited, ZSD shutdown callbacks executed,
5240  * and the zone status set to ZONE_IS_DOWN.
5241  *
5242  * It is possible that the call will interrupt itself if the caller is the
5243  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
5244  */
5245 static int
5246 zone_shutdown(zoneid_t zoneid)
5247 {
5248         int error;
5249         zone_t *zone;
5250         zone_status_t status;
5251 
5252         if (secpolicy_zone_config(CRED()) != 0)
5253                 return (set_errno(EPERM));
5254         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5255                 return (set_errno(EINVAL));
5256 
5257         mutex_enter(&zonehash_lock);
5258         /*
5259          * Look for zone under hash lock to prevent races with other
5260          * calls to zone_shutdown and zone_destroy.
5261          */
5262         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5263                 mutex_exit(&zonehash_lock);
5264                 return (set_errno(EINVAL));
5265         }
5266 
5267         /*
5268          * We have to drop zonehash_lock before calling block_mounts.
5269          * Hold the zone so we can continue to use the zone_t.
5270          */
5271         zone_hold(zone);
5272         mutex_exit(&zonehash_lock);
5273 
5274         /*
5275          * Block mounts so that VFS_MOUNT() can get an accurate view of
5276          * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
5277          *
5278          * e.g. NFS can fail the mount if it determines that the zone
5279          * has already begun the shutdown sequence.
5281          */
5282         if (block_mounts(zone) == 0) {
5283                 zone_rele(zone);
5284                 return (set_errno(EINTR));
5285         }
5286 
5287         mutex_enter(&zonehash_lock);
5288         mutex_enter(&zone_status_lock);
5289         status = zone_status_get(zone);
5290         /*
5291          * Fail if the zone isn't fully initialized yet.
5292          */
5293         if (status < ZONE_IS_READY) {
5294                 mutex_exit(&zone_status_lock);
5295                 mutex_exit(&zonehash_lock);
5296                 resume_mounts(zone);
5297                 zone_rele(zone);
5298                 return (set_errno(EINVAL));
5299         }
5300         /*
5301          * If conditions required for zone_shutdown() to return have been met,
5302          * return success.
5303          */
5304         if (status >= ZONE_IS_DOWN) {
5305                 mutex_exit(&zone_status_lock);
5306                 mutex_exit(&zonehash_lock);
5307                 resume_mounts(zone);
5308                 zone_rele(zone);
5309                 return (0);
5310         }
5311         /*
5312          * If zone_shutdown() hasn't been called before, go through the motions.
5313          * If it has, there's nothing to do but wait for the kernel threads to
5314          * drain.
5315          */
5316         if (status < ZONE_IS_EMPTY) {
5317                 uint_t ntasks;
5318 
5319                 mutex_enter(&zone->zone_lock);
5320                 if ((ntasks = zone->zone_ntasks) != 1) {
5321                         /*
5322                          * There's still stuff running.
5323                          */
5324                         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5325                 }
5326                 mutex_exit(&zone->zone_lock);
5327                 if (ntasks == 1) {
5328                         /*
5329                          * The only way to create another task is through
5330                          * zone_enter(), which will block until we drop
5331                          * zonehash_lock.  The zone is empty.
5332                          */
5333                         if (zone->zone_kthreads == NULL) {
5334                                 /*
5335                                  * Skip ahead to ZONE_IS_DOWN
5336                                  */
5337                                 zone_status_set(zone, ZONE_IS_DOWN);
5338                         } else {
5339                                 zone_status_set(zone, ZONE_IS_EMPTY);
5340                         }
5341                 }
5342         }
5343         mutex_exit(&zone_status_lock);
5344         mutex_exit(&zonehash_lock);
5345         resume_mounts(zone);
5346 
5347         if ((error = zone_empty(zone)) != 0) {
5348                 zone_rele(zone);
5349                 return (set_errno(error));
5350         }
5351         /*
5352          * After the zone status goes to ZONE_IS_DOWN this zone will no
5353          * longer be notified of changes to the pools configuration, so
5354          * in order to not end up with a stale pool pointer, we point
5355          * ourselves at the default pool and remove all resource
5356          * visibility.  This is especially important as the zone_t may
5357          * languish on the deathrow for a very long time waiting for
5358          * creds to drain out.
5359          *
5360          * This rebinding of the zone can happen multiple times
5361          * (presumably due to interrupted or parallel systemcalls)
5362          * without any adverse effects.
5363          */
5364         if (pool_lock_intr() != 0) {
5365                 zone_rele(zone);
5366                 return (set_errno(EINTR));
5367         }
5368         if (pool_state == POOL_ENABLED) {
5369                 mutex_enter(&cpu_lock);
5370                 zone_pool_set(zone, pool_default);
5371                 /*
5372                  * The zone no longer needs to be able to see any cpus.
5373                  */
5374                 zone_pset_set(zone, ZONE_PS_INVAL);
5375                 mutex_exit(&cpu_lock);
5376         }
5377         pool_unlock();
5378 
5379         /*
5380          * ZSD shutdown callbacks can be executed multiple times, hence
5381          * it is safe to not be holding any locks across this call.
5382          */
5383         zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5384 
5385         mutex_enter(&zone_status_lock);
5386         if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5387                 zone_status_set(zone, ZONE_IS_DOWN);
5388         mutex_exit(&zone_status_lock);
5389 
5390         /*
5391          * Wait for kernel threads to drain.
5392          */
5393         if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5394                 zone_rele(zone);
5395                 return (set_errno(EINTR));
5396         }
5397 
5398         /*
5399          * Zone can become down/destroyable even if the above wait
5400          * returns EINTR, so any code added here may never execute.
5401          * (i.e. don't add code here)
5402          */
5403 
5404         zone_rele(zone);
5405         return (0);
5406 }
5407 
5408 /*
5409  * Log the specified zone's reference counts.  The caller should not be
5410  * holding the zone's zone_lock.
5411  */
5412 static void
5413 zone_log_refcounts(zone_t *zone)
5414 {
5415         char *buffer;
5416         char *buffer_position;
5417         uint32_t buffer_size;
5418         uint32_t index;
5419         uint_t ref;
5420         uint_t cred_ref;
5421 
5422         /*
5423          * Construct a string representing the subsystem-specific reference
5424          * counts.  The counts are printed in ascending order by index into the
5425          * zone_t::zone_subsys_ref array.  The list will be surrounded by
5426          * square brackets [] and will only contain nonzero reference counts.
5427          *
5428          * The buffer will hold two square bracket characters plus ten digits,
5429          * one colon, one space, one comma, and some characters for a
5430          * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5431          * bit integers have at most ten decimal digits.)  The last
5432          * reference count's comma is replaced by the closing square
5433          * bracket and a NUL character to terminate the string.
5434          *
5435          * NOTE: We have to grab the zone's zone_lock to create a consistent
5436          * snapshot of the zone's reference counters.
5437          *
5438          * First, figure out how much space the string buffer will need.
5439          * The buffer's size is stored in buffer_size.
5440          */
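             /*
              * With hypothetical nonzero counts, the resulting message
              * suffix might look like "[NFS: 2,VFS: 1]".
              */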
5441         buffer_size = 2;                        /* for the square brackets */
5442         mutex_enter(&zone->zone_lock);
5443         zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5444         ref = zone->zone_ref;
5445         cred_ref = zone->zone_cred_ref;
5446         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5447                 if (zone->zone_subsys_ref[index] != 0)
5448                         buffer_size += strlen(zone_ref_subsys_names[index]) +
5449                             13;
5450         if (buffer_size == 2) {
5451                 /*
5452                  * No subsystems had nonzero reference counts.  Don't bother
5453                  * with allocating a buffer; just log the general-purpose and
5454                  * credential reference counts.
5455                  */
5456                 mutex_exit(&zone->zone_lock);
5457                 (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5458                     "Zone '%s' (ID: %d) is shutting down, but %u zone "
5459                     "references and %u credential references are still extant",
5460                     zone->zone_name, zone->zone_id, ref, cred_ref);
5461                 return;
5462         }
5463 
5464         /*
5465          * buffer_size contains the exact number of characters that the
5466          * buffer will need.  Allocate the buffer and fill it with nonzero
5467          * subsystem-specific reference counts.  Surround the results with
5468          * square brackets afterwards.
5469          */
5470         buffer = kmem_alloc(buffer_size, KM_SLEEP);
5471         buffer_position = &buffer[1];
5472         for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5473                 /*
5474                  * NOTE: The DDI's version of sprintf() returns a pointer to
5475                  * the modified buffer rather than the number of bytes written
5476                  * (as in snprintf(3C)).  This is unfortunate and annoying.
5477                  * Therefore, we'll use snprintf() with INT_MAX to get the
5478                  * number of bytes written.  Using INT_MAX is safe because
5479                  * the buffer is perfectly sized for the data: we'll never
5480                  * overrun the buffer.
5481                  */
5482                 if (zone->zone_subsys_ref[index] != 0)
5483                         buffer_position += snprintf(buffer_position, INT_MAX,
5484                             "%s: %u,", zone_ref_subsys_names[index],
5485                             zone->zone_subsys_ref[index]);
5486         }
5487         mutex_exit(&zone->zone_lock);
5488         buffer[0] = '[';
5489         ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5490         ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5491         buffer_position[-1] = ']';
5492 
5493         /*
5494          * Log the reference counts and free the message buffer.
5495          */
5496         (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5497             "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5498             "%u credential references are still extant %s", zone->zone_name,
5499             zone->zone_id, ref, cred_ref, buffer);
5500         kmem_free(buffer, buffer_size);
5501 }
5502 
5503 /*
5504  * Systemcall entry point to finalize the zone halt process.  The caller
5505  * must have already successfully called zone_shutdown().
5506  *
5507  * Upon successful completion, the zone will have been fully destroyed:
5508  * zsched will have exited, destructor callbacks executed, and the zone
5509  * removed from the list of active zones.
5510  */
5511 static int
5512 zone_destroy(zoneid_t zoneid)
5513 {
5514         uint64_t uniqid;
5515         zone_t *zone;
5516         zone_status_t status;
5517         clock_t wait_time;
5518         boolean_t log_refcounts;
5519 
5520         if (secpolicy_zone_config(CRED()) != 0)
5521                 return (set_errno(EPERM));
5522         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5523                 return (set_errno(EINVAL));
5524 
5525         mutex_enter(&zonehash_lock);
5526         /*
5527          * Look for zone under hash lock to prevent races with other
5528          * calls to zone_destroy.
5529          */
5530         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5531                 mutex_exit(&zonehash_lock);
5532                 return (set_errno(EINVAL));
5533         }
5534 
5535         if (zone_mount_count(zone->zone_rootpath) != 0) {
5536                 mutex_exit(&zonehash_lock);
5537                 return (set_errno(EBUSY));
5538         }
5539         mutex_enter(&zone_status_lock);
5540         status = zone_status_get(zone);
5541         if (status < ZONE_IS_DOWN) {
5542                 mutex_exit(&zone_status_lock);
5543                 mutex_exit(&zonehash_lock);
5544                 return (set_errno(EBUSY));
5545         } else if (status == ZONE_IS_DOWN) {
5546                 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5547         }
5548         mutex_exit(&zone_status_lock);
5549         zone_hold(zone);
5550         mutex_exit(&zonehash_lock);
5551 
5552         /*
5553          * wait for zsched to exit
5554          */
5555         zone_status_wait(zone, ZONE_IS_DEAD);
5556         zone_zsd_callbacks(zone, ZSD_DESTROY);
5557         zone->zone_netstack = NULL;
5558         uniqid = zone->zone_uniqid;
5559         zone_rele(zone);
5560         zone = NULL;    /* potentially free'd */
5561 
5562         log_refcounts = B_FALSE;
5563         wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5564         mutex_enter(&zonehash_lock);
5565         for (; /* ever */; ) {
5566                 boolean_t unref;
5567                 boolean_t refs_have_been_logged;
5568 
5569                 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5570                     zone->zone_uniqid != uniqid) {
5571                         /*
5572                          * The zone has gone away.  Necessary conditions
5573                          * are met, so we return success.
5574                          */
5575                         mutex_exit(&zonehash_lock);
5576                         return (0);
5577                 }
5578                 mutex_enter(&zone->zone_lock);
5579                 unref = ZONE_IS_UNREF(zone);
5580                 refs_have_been_logged = (zone->zone_flags &
5581                     ZF_REFCOUNTS_LOGGED);
5582                 mutex_exit(&zone->zone_lock);
5583                 if (unref) {
5584                         /*
5585                          * There is only one reference to the zone -- that
5586                          * added when the zone was added to the hashtables --
5587                          * and things will remain this way until we drop
5588                          * zonehash_lock... we can go ahead and cleanup the
5589                          * zone.
5590                          */
5591                         break;
5592                 }
5593 
5594                 /*
5595                  * Wait for zone_rele_common() or zone_cred_rele() to signal
5596                  * zone_destroy_cv.  zone_destroy_cv is signaled only when
5597                  * some zone's general-purpose reference count reaches one.
5598                  * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5599                  * on zone_destroy_cv, then log the zone's reference counts and
5600                  * continue to wait for zone_rele() and zone_cred_rele().
5601                  */
5602                 if (!refs_have_been_logged) {
5603                         if (!log_refcounts) {
5604                                 /*
5605                                  * This thread hasn't timed out waiting on
5606                                  * zone_destroy_cv yet.  Wait wait_time clock
5607                                  * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5608                                  * seconds) for the zone's references to clear.
5609                                  */
5610                                 ASSERT(wait_time > 0);
5611                                 wait_time = cv_reltimedwait_sig(
5612                                     &zone_destroy_cv, &zonehash_lock, wait_time,
5613                                     TR_SEC);
5614                                 if (wait_time > 0) {
5615                                         /*
5616                                          * A thread in zone_rele() or
5617                                          * zone_cred_rele() signaled
5618                                          * zone_destroy_cv before this thread's
5619                                          * wait timed out.  The zone might have
5620                                          * only one reference left; find out!
5621                                          */
5622                                         continue;
5623                                 } else if (wait_time == 0) {
5624                                         /* The thread's process was signaled. */
5625                                         mutex_exit(&zonehash_lock);
5626                                         return (set_errno(EINTR));
5627                                 }
5628 
5629                                 /*
5630                                  * The thread timed out while waiting on
5631                                  * zone_destroy_cv.  Even though the thread
5632                                  * timed out, it has to check whether another
5633                                  * thread woke up from zone_destroy_cv and
5634                                  * destroyed the zone.
5635                                  *
5636                                  * If the zone still exists and has more than
5637                                  * one unreleased general-purpose reference,
5638                                  * then log the zone's reference counts.
5639                                  */
5640                                 log_refcounts = B_TRUE;
5641                                 continue;
5642                         }
5643 
5644                         /*
5645                          * The thread already timed out on zone_destroy_cv while
5646                          * waiting for subsystems to release the zone's last
5647                          * general-purpose references.  Log the zone's reference
5648                          * counts and wait indefinitely on zone_destroy_cv.
5649                          */
5650                         zone_log_refcounts(zone);
5651                 }
5652                 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5653                         /* The thread's process was signaled. */
5654                         mutex_exit(&zonehash_lock);
5655                         return (set_errno(EINTR));
5656                 }
5657         }
5658 
5659         /*
5660          * Remove CPU cap for this zone now since we're not going to
5661          * fail below this point.
5662          */
5663         cpucaps_zone_remove(zone);
5664 
5665         /* Get rid of the zone's kstats */
5666         zone_kstat_delete(zone);
5667 
5668         /* remove the pfexecd doors */
5669         if (zone->zone_pfexecd != NULL) {
5670                 klpd_freelist(&zone->zone_pfexecd);
5671                 zone->zone_pfexecd = NULL;
5672         }
5673 
5674         /* free brand specific data */
5675         if (ZONE_IS_BRANDED(zone))
5676                 ZBROP(zone)->b_free_brand_data(zone);
5677 
5678         /* Say goodbye to brand framework. */
5679         brand_unregister_zone(zone->zone_brand);
5680 
5681         /*
5682          * It is now safe to let the zone be recreated; remove it from the
5683          * lists.  The memory will not be freed until the last cred
5684          * reference goes away.
5685          */
5686         ASSERT(zonecount > 1);       /* must be > 1; can't destroy global zone */
5687         zonecount--;
5688         /* remove from active list and hash tables */
5689         list_remove(&zone_active, zone);
5690         (void) mod_hash_destroy(zonehashbyname,
5691             (mod_hash_key_t)zone->zone_name);
5692         (void) mod_hash_destroy(zonehashbyid,
5693             (mod_hash_key_t)(uintptr_t)zone->zone_id);
5694         if (zone->zone_flags & ZF_HASHED_LABEL)
5695                 (void) mod_hash_destroy(zonehashbylabel,
5696                     (mod_hash_key_t)zone->zone_slabel);
5697         mutex_exit(&zonehash_lock);
5698 
5699         /*
5700          * Release the root vnode; we're not using it anymore, and no other
5701          * thread that might access it should still exist.
5702          */
5703         if (zone->zone_rootvp != NULL) {
5704                 VN_RELE(zone->zone_rootvp);
5705                 zone->zone_rootvp = NULL;
5706         }
5707 
5708         /* add to deathrow list */
5709         mutex_enter(&zone_deathrow_lock);
5710         list_insert_tail(&zone_deathrow, zone);
5711         mutex_exit(&zone_deathrow_lock);
5712 
5713         /*
5714          * Drop last reference (which was added by zsched()), this will
5715          * free the zone unless there are outstanding cred references.
5716          */
5717         zone_rele(zone);
5718         return (0);
5719 }
5720 
5721 /*
5722  * Systemcall entry point for zone_getattr(2).
5723  */
5724 static ssize_t
5725 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5726 {
5727         size_t size;
5728         int error = 0, err;
5729         zone_t *zone;
5730         char *zonepath;
5731         char *outstr;
5732         zone_status_t zone_status;
5733         pid_t initpid;
5734         boolean_t global = (curzone == global_zone);
5735         boolean_t inzone = (curzone->zone_id == zoneid);
5736         ushort_t flags;
5737         zone_net_data_t *zbuf;
5738 
5739         mutex_enter(&zonehash_lock);
5740         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5741                 mutex_exit(&zonehash_lock);
5742                 return (set_errno(EINVAL));
5743         }
5744         zone_status = zone_status_get(zone);
5745         if (zone_status < ZONE_IS_INITIALIZED) {
5746                 mutex_exit(&zonehash_lock);
5747                 return (set_errno(EINVAL));
5748         }
5749         zone_hold(zone);
5750         mutex_exit(&zonehash_lock);
5751 
5752         /*
5753          * If not in the global zone, don't show information about other zones,
5754          * unless the system is labeled and the local zone's label dominates
5755          * the other zone.
5756          */
5757         if (!zone_list_access(zone)) {
5758                 zone_rele(zone);
5759                 return (set_errno(EINVAL));
5760         }
5761 
5762         switch (attr) {
5763         case ZONE_ATTR_ROOT:
5764                 if (global) {
5765                         /*
5766                          * Copy the path to trim the trailing "/" (except for
5767                          * the global zone).
5768                          */
5769                         if (zone != global_zone)
5770                                 size = zone->zone_rootpathlen - 1;
5771                         else
5772                                 size = zone->zone_rootpathlen;
5773                         zonepath = kmem_alloc(size, KM_SLEEP);
5774                         bcopy(zone->zone_rootpath, zonepath, size);
5775                         zonepath[size - 1] = '\0';
5776                 } else {
5777                         if (inzone || !is_system_labeled()) {
5778                                 /*
5779                          * If the query is on the current zone
5780                          * or the system is not labeled, just
5781                          * return the faked-up path "/" for it.
5782                                  * just return faked-up path for current zone.
5783                                  */
5784                                 zonepath = "/";
5785                                 size = 2;
5786                         } else {
5787                                 /*
5788                                  * Return the relative path for the queried zone.
5789                                  */
5790                                 int prefix_len = strlen(zone_prefix);
5791                                 int zname_len = strlen(zone->zone_name);
5792 
5793                                 size = prefix_len + zname_len + 1;
5794                                 zonepath = kmem_alloc(size, KM_SLEEP);
5795                                 bcopy(zone_prefix, zonepath, prefix_len);
5796                                 bcopy(zone->zone_name, zonepath +
5797                                     prefix_len, zname_len);
5798                                 zonepath[size - 1] = '\0';
5799                         }
5800                 }
5801                 if (bufsize > size)
5802                         bufsize = size;
5803                 if (buf != NULL) {
5804                         err = copyoutstr(zonepath, buf, bufsize, NULL);
5805                         if (err != 0 && err != ENAMETOOLONG)
5806                                 error = EFAULT;
5807                 }
5808                 if (global || (is_system_labeled() && !inzone))
5809                         kmem_free(zonepath, size);
5810                 break;
5811 
5812         case ZONE_ATTR_NAME:
5813                 size = strlen(zone->zone_name) + 1;
5814                 if (bufsize > size)
5815                         bufsize = size;
5816                 if (buf != NULL) {
5817                         err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5818                         if (err != 0 && err != ENAMETOOLONG)
5819                                 error = EFAULT;
5820                 }
5821                 break;
5822 
5823         case ZONE_ATTR_STATUS:
5824                 /*
5825                  * Since we're not holding zonehash_lock, the zone status
5826                  * may be anything; leave it up to userland to sort it out.
5827                  */
5828                 size = sizeof (zone_status);
5829                 if (bufsize > size)
5830                         bufsize = size;
5831                 zone_status = zone_status_get(zone);
5832                 if (buf != NULL &&
5833                     copyout(&zone_status, buf, bufsize) != 0)
5834                         error = EFAULT;
5835                 break;
5836         case ZONE_ATTR_FLAGS:
5837                 size = sizeof (zone->zone_flags);
5838                 if (bufsize > size)
5839                         bufsize = size;
5840                 flags = zone->zone_flags;
5841                 if (buf != NULL &&
5842                     copyout(&flags, buf, bufsize) != 0)
5843                         error = EFAULT;
5844                 break;
5845         case ZONE_ATTR_PRIVSET:
5846                 size = sizeof (priv_set_t);
5847                 if (bufsize > size)
5848                         bufsize = size;
5849                 if (buf != NULL &&
5850                     copyout(zone->zone_privset, buf, bufsize) != 0)
5851                         error = EFAULT;
5852                 break;
5853         case ZONE_ATTR_UNIQID:
5854                 size = sizeof (zone->zone_uniqid);
5855                 if (bufsize > size)
5856                         bufsize = size;
5857                 if (buf != NULL &&
5858                     copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5859                         error = EFAULT;
5860                 break;
5861         case ZONE_ATTR_POOLID:
5862                 {
5863                         pool_t *pool;
5864                         poolid_t poolid;
5865 
5866                         if (pool_lock_intr() != 0) {
5867                                 error = EINTR;
5868                                 break;
5869                         }
5870                         pool = zone_pool_get(zone);
5871                         poolid = pool->pool_id;
5872                         pool_unlock();
5873                         size = sizeof (poolid);
5874                         if (bufsize > size)
5875                                 bufsize = size;
5876                         if (buf != NULL && copyout(&poolid, buf, bufsize) != 0)
5877                                 error = EFAULT;
5878                 }
5879                 break;
5880         case ZONE_ATTR_SLBL:
5881                 size = sizeof (bslabel_t);
5882                 if (bufsize > size)
5883                         bufsize = size;
5884                 if (zone->zone_slabel == NULL)
5885                         error = EINVAL;
5886                 else if (buf != NULL &&
5887                     copyout(label2bslabel(zone->zone_slabel), buf,
5888                     bufsize) != 0)
5889                         error = EFAULT;
5890                 break;
5891         case ZONE_ATTR_INITPID:
5892                 size = sizeof (initpid);
5893                 if (bufsize > size)
5894                         bufsize = size;
5895                 initpid = zone->zone_proc_initpid;
5896                 if (initpid == -1) {
5897                         error = ESRCH;
5898                         break;
5899                 }
5900                 if (buf != NULL &&
5901                     copyout(&initpid, buf, bufsize) != 0)
5902                         error = EFAULT;
5903                 break;
5904         case ZONE_ATTR_BRAND:
5905                 size = strlen(zone->zone_brand->b_name) + 1;
5906 
5907                 if (bufsize > size)
5908                         bufsize = size;
5909                 if (buf != NULL) {
5910                         err = copyoutstr(zone->zone_brand->b_name, buf,
5911                             bufsize, NULL);
5912                         if (err != 0 && err != ENAMETOOLONG)
5913                                 error = EFAULT;
5914                 }
5915                 break;
5916         case ZONE_ATTR_INITNAME:
5917                 size = strlen(zone->zone_initname) + 1;
5918                 if (bufsize > size)
5919                         bufsize = size;
5920                 if (buf != NULL) {
5921                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5922                             NULL);
5923                         if (err != 0 && err != ENAMETOOLONG)
5924                                 error = EFAULT;
5925                 }
5926                 break;
5927         case ZONE_ATTR_BOOTARGS:
5928                 if (zone->zone_bootargs == NULL)
5929                         outstr = "";
5930                 else
5931                         outstr = zone->zone_bootargs;
5932                 size = strlen(outstr) + 1;
5933                 if (bufsize > size)
5934                         bufsize = size;
5935                 if (buf != NULL) {
5936                         err = copyoutstr(outstr, buf, bufsize, NULL);
5937                         if (err != 0 && err != ENAMETOOLONG)
5938                                 error = EFAULT;
5939                 }
5940                 break;
5941         case ZONE_ATTR_SCHED_CLASS:
5942                 mutex_enter(&class_lock);
5943 
5944                 if (zone->zone_defaultcid >= loaded_classes)
5945                         outstr = "";
5946                 else
5947                         outstr = sclass[zone->zone_defaultcid].cl_name;
5948                 size = strlen(outstr) + 1;
5949                 if (bufsize > size)
5950                         bufsize = size;
5951                 if (buf != NULL) {
5952                         err = copyoutstr(outstr, buf, bufsize, NULL);
5953                         if (err != 0 && err != ENAMETOOLONG)
5954                                 error = EFAULT;
5955                 }
5956 
5957                 mutex_exit(&class_lock);
5958                 break;
5959         case ZONE_ATTR_HOSTID:
5960                 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5961                     bufsize == sizeof (zone->zone_hostid)) {
5962                         size = sizeof (zone->zone_hostid);
5963                         if (buf != NULL && copyout(&zone->zone_hostid, buf,
5964                             bufsize) != 0)
5965                                 error = EFAULT;
5966                 } else {
5967                         error = EINVAL;
5968                 }
5969                 break;
5970         case ZONE_ATTR_FS_ALLOWED:
5971                 if (zone->zone_fs_allowed == NULL)
5972                         outstr = "";
5973                 else
5974                         outstr = zone->zone_fs_allowed;
5975                 size = strlen(outstr) + 1;
5976                 if (bufsize > size)
5977                         bufsize = size;
5978                 if (buf != NULL) {
5979                         err = copyoutstr(outstr, buf, bufsize, NULL);
5980                         if (err != 0 && err != ENAMETOOLONG)
5981                                 error = EFAULT;
5982                 }
5983                 break;
5984         case ZONE_ATTR_NETWORK:
5985                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5986                 if (copyin(buf, zbuf, bufsize) != 0) {
5987                         error = EFAULT;
5988                 } else {
5989                         error = zone_get_network(zoneid, zbuf);
5990                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5991                                 error = EFAULT;
5992                 }
5993                 kmem_free(zbuf, bufsize);
5994                 break;
5995         case ZONE_ATTR_SCHED_FIXEDHI:
5996                 size = sizeof (boolean_t);
5997                 if (bufsize > size)
5998                         bufsize = size;
5999 
6000                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6001                     bufsize) != 0)
6002                         error = EFAULT;
6003                 break;
6004         default:
6005                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6006                         size = bufsize;
6007                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6008                 } else {
6009                         error = EINVAL;
6010                 }
6011         }
6012         zone_rele(zone);
6013 
6014         if (error)
6015                 return (set_errno(error));
6016         return ((ssize_t)size);
6017 }
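
     /*
      * Illustrative sketch (editor's addition, not from the original
      * source): because zone_getattr() returns the attribute's full size
      * even when the supplied buffer is smaller, userland can probe for
      * the size first.  This assumes the libc wrapper zone_getattr()
      * mirrors the syscall signature above; error handling is elided.
      *
      *	ssize_t sz = zone_getattr(zid, ZONE_ATTR_NAME, NULL, 0);
      *	char *name;
      *
      *	if (sz > 0 && (name = malloc(sz)) != NULL) {
      *		if (zone_getattr(zid, ZONE_ATTR_NAME, name, sz) == sz)
      *			(void) printf("zone %d is %s\n", (int)zid, name);
      *		free(name);
      *	}
      */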
6018 
6019 /*
6020  * System call entry point for zone_setattr(2).
6021  */
6022 /*ARGSUSED*/
6023 static int
6024 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6025 {
6026         zone_t *zone;
6027         zone_status_t zone_status;
6028         int err = -1;
6029         zone_net_data_t *zbuf;
6030 
6031         if (secpolicy_zone_config(CRED()) != 0)
6032                 return (set_errno(EPERM));
6033 
6034         /*
6035          * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
6036          * attributes can be set on the global zone.
6037          */
6038         if (zoneid == GLOBAL_ZONEID &&
6039             attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
6040                 return (set_errno(EINVAL));
6041         }
6042 
6043         mutex_enter(&zonehash_lock);
6044         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6045                 mutex_exit(&zonehash_lock);
6046                 return (set_errno(EINVAL));
6047         }
6048         zone_hold(zone);
6049         mutex_exit(&zonehash_lock);
6050 
6051         /*
6052          * At present most attributes can only be set on non-running,
6053          * non-global zones.
6054          */
6055         zone_status = zone_status_get(zone);
6056         if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
6057             attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
6058             zone_status > ZONE_IS_READY) {
6059                 err = EINVAL;
6060                 goto done;
6061         }
6062 
6063         switch (attr) {
6064         case ZONE_ATTR_INITNAME:
6065                 err = zone_set_initname(zone, (const char *)buf);
6066                 break;
6067         case ZONE_ATTR_INITNORESTART:
6068                 zone->zone_restart_init = B_FALSE;
6069                 err = 0;
6070                 break;
6071         case ZONE_ATTR_BOOTARGS:
6072                 err = zone_set_bootargs(zone, (const char *)buf);
6073                 break;
6074         case ZONE_ATTR_BRAND:
6075                 err = zone_set_brand(zone, (const char *)buf);
6076                 break;
6077         case ZONE_ATTR_FS_ALLOWED:
6078                 err = zone_set_fs_allowed(zone, (const char *)buf);
6079                 break;
6080         case ZONE_ATTR_PMCAP_NOVER:
6081                 err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
6082                 break;
6083         case ZONE_ATTR_PMCAP_PAGEOUT:
6084                 err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
6085                 break;
6086         case ZONE_ATTR_PG_FLT_DELAY:
6087                 err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
6088                 break;
6089         case ZONE_ATTR_RSS:
6090                 err = zone_set_rss(zone, (const uint64_t *)buf);
6091                 break;
6092         case ZONE_ATTR_SCHED_CLASS:
6093                 err = zone_set_sched_class(zone, (const char *)buf);
6094                 break;
6095         case ZONE_ATTR_HOSTID:
6096                 if (bufsize == sizeof (zone->zone_hostid)) {
6097                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
6098                                 err = 0;
6099                         else
6100                                 err = EFAULT;
6101                 } else {
6102                         err = EINVAL;
6103                 }
6104                 break;
6105         case ZONE_ATTR_NETWORK:
6106                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
6107                         err = EINVAL;
6108                         break;
6109                 }
6110                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6111                 if (copyin(buf, zbuf, bufsize) != 0) {
6112                         kmem_free(zbuf, bufsize);
6113                         err = EFAULT;
6114                         break;
6115                 }
6116                 err = zone_set_network(zoneid, zbuf);
6117                 kmem_free(zbuf, bufsize);
6118                 break;
6119         case ZONE_ATTR_APP_SVC_CT:
6120                 if (bufsize != sizeof (boolean_t)) {
6121                         err = EINVAL;
6122                 } else {
6123                         zone->zone_setup_app_contract = (boolean_t)buf;
6124                         err = 0;
6125                 }
6126                 break;
6127         case ZONE_ATTR_SCHED_FIXEDHI:
6128                 if (bufsize != sizeof (boolean_t)) {
6129                         err = EINVAL;
6130                 } else {
6131                         zone->zone_fixed_hipri = (boolean_t)buf;
6132                         err = 0;
6133                 }
6134                 break;
6135         default:
6136                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
6137                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
6138                 else
6139                         err = EINVAL;
6140         }
6141 
6142 done:
6143         zone_rele(zone);
6144         ASSERT(err != -1);
6145         return (err != 0 ? set_errno(err) : 0);
6146 }
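
     /*
      * Illustrative sketch (editor's addition): the ZONE_ATTR_HOSTID case
      * above requires bufsize to equal sizeof (zone->zone_hostid), a
      * uint32_t.  Assuming a libc wrapper zone_setattr() that mirrors the
      * syscall signature, a privileged global-zone process could set a
      * (hypothetical) zone's hostid before boot like so:
      *
      *	uint32_t hostid = 0x0083b51f;
      *
      *	if (zone_setattr(zid, ZONE_ATTR_HOSTID, &hostid,
      *	    sizeof (hostid)) != 0)
      *		perror("zone_setattr");
      */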
6147 
6148 /*
6149  * Return zero if the process has at least one vnode mapped into its
6150  * address space which shouldn't be allowed to change zones.
6151  *
6152  * Also return zero if the process has any shared mappings which reserve
6153  * swap.  This is because the counting for zone.max-swap does not allow swap
6154  * reservation to be shared between zones.  A zone's swap reservation is
6155  * counted in zone->zone_max_swap.
6156  */
6157 static int
6158 as_can_change_zones(void)
6159 {
6160         proc_t *pp = curproc;
6161         struct seg *seg;
6162         struct as *as = pp->p_as;
6163         vnode_t *vp;
6164         int allow = 1;
6165 
6166         ASSERT(pp->p_as != &kas);
6167         AS_LOCK_ENTER(as, RW_READER);
6168         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
6169 
6170                 /*
6171                  * Cannot enter zone with shared anon memory which
6172                  * reserves swap.  See comment above.
6173                  */
6174                 if (seg_can_change_zones(seg) == B_FALSE) {
6175                         allow = 0;
6176                         break;
6177                 }
6178                 /*
6179                  * If we can't get a backing vnode for this segment, then
6180                  * skip it.
6181                  */
6182                 vp = NULL;
6183                 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
6184                         continue;
6185                 if (!vn_can_change_zones(vp)) { /* bail on first match */
6186                         allow = 0;
6187                         break;
6188                 }
6189         }
6190         AS_LOCK_EXIT(as);
6191         return (allow);
6192 }
6193 
6194 /*
6195  * Count swap reserved by curproc's address space
6196  */
6197 static size_t
6198 as_swresv(void)
6199 {
6200         proc_t *pp = curproc;
6201         struct seg *seg;
6202         struct as *as = pp->p_as;
6203         size_t swap = 0;
6204 
6205         ASSERT(pp->p_as != &kas);
6206         ASSERT(AS_WRITE_HELD(as));
6207         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
6208                 swap += seg_swresv(seg);
6209 
6210         return (swap);
6211 }
6212 
6213 /*
6214  * System call entry point for zone_enter().
6215  *
6216  * The current process is injected into said zone.  In the process
6217  * it will change its project membership, privileges, rootdir/cwd,
6218  * zone-wide rctls, and pool association to match those of the zone.
6219  *
6220  * The first zone_enter() called while the zone is in the ZONE_IS_READY
6221  * state will transition it to ZONE_IS_RUNNING.  Processes may only
6222  * enter a zone that is "ready" or "running".
6223  */
6224 static int
6225 zone_enter(zoneid_t zoneid)
6226 {
6227         zone_t *zone;
6228         vnode_t *vp;
6229         proc_t *pp = curproc;
6230         contract_t *ct;
6231         cont_process_t *ctp;
6232         task_t *tk, *oldtk;
6233         kproject_t *zone_proj0;
6234         cred_t *cr, *newcr;
6235         pool_t *oldpool, *newpool;
6236         sess_t *sp;
6237         uid_t uid;
6238         zone_status_t status;
6239         int err = 0;
6240         rctl_entity_p_t e;
6241         size_t swap;
6242         kthread_id_t t;
6243 
6244         if (secpolicy_zone_config(CRED()) != 0)
6245                 return (set_errno(EPERM));
6246         if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
6247                 return (set_errno(EINVAL));
6248 
6249         /*
6250          * Stop all lwps so we don't need to hold a lock to look at
6251          * curproc->p_zone.  This needs to happen before we grab any
6252          * locks to avoid deadlock (another lwp in the process could
6253          * be waiting for the held lock).
6254          */
6255         if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
6256                 return (set_errno(EINTR));
6257 
6258         /*
6259          * Make sure we're not changing zones with files open or mapped into
6260          * our address space which shouldn't be changing zones.
6261          */
6262         if (!files_can_change_zones()) {
6263                 err = EBADF;
6264                 goto out;
6265         }
6266         if (!as_can_change_zones()) {
6267                 err = EFAULT;
6268                 goto out;
6269         }
6270 
6271         mutex_enter(&zonehash_lock);
6272         if (pp->p_zone != global_zone) {
6273                 mutex_exit(&zonehash_lock);
6274                 err = EINVAL;
6275                 goto out;
6276         }
6277 
6278         zone = zone_find_all_by_id(zoneid);
6279         if (zone == NULL) {
6280                 mutex_exit(&zonehash_lock);
6281                 err = EINVAL;
6282                 goto out;
6283         }
6284 
6285         /*
6286          * To prevent processes in a zone from holding contracts on
6287          * extrazonal resources, and to avoid process contract
6288          * memberships which span zones, contract holders and processes
6289          * which aren't the sole members of their encapsulating process
6290          * contracts are not allowed to zone_enter.
6291          */
6292         ctp = pp->p_ct_process;
6293         ct = &ctp->conp_contract;
6294         mutex_enter(&ct->ct_lock);
6295         mutex_enter(&pp->p_lock);
6296         if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
6297                 mutex_exit(&pp->p_lock);
6298                 mutex_exit(&ct->ct_lock);
6299                 mutex_exit(&zonehash_lock);
6300                 err = EINVAL;
6301                 goto out;
6302         }
6303 
6304         /*
6305          * Moreover, we don't allow processes whose encapsulating
6306          * process contracts have inherited extrazonal contracts.
6307          * While it would be easier to eliminate all process contracts
6308          * with inherited contracts, we need to be able to give a
6309          * restarted init (or other zone-penetrating process) its
6310          * predecessor's contracts.
6311          */
6312         if (ctp->conp_ninherited != 0) {
6313                 contract_t *next;
6314                 for (next = list_head(&ctp->conp_inherited); next;
6315                     next = list_next(&ctp->conp_inherited, next)) {
6316                         if (contract_getzuniqid(next) != zone->zone_uniqid) {
6317                                 mutex_exit(&pp->p_lock);
6318                                 mutex_exit(&ct->ct_lock);
6319                                 mutex_exit(&zonehash_lock);
6320                                 err = EINVAL;
6321                                 goto out;
6322                         }
6323                 }
6324         }
6325 
6326         mutex_exit(&pp->p_lock);
6327         mutex_exit(&ct->ct_lock);
6328 
6329         status = zone_status_get(zone);
6330         if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6331                 /*
6332                  * Can't join
6333                  */
6334                 mutex_exit(&zonehash_lock);
6335                 err = EINVAL;
6336                 goto out;
6337         }
6338 
6339         /*
6340          * Make sure the new privilege set is within the caller's permitted set
6341          */
6342         if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6343                 mutex_exit(&zonehash_lock);
6344                 err = EPERM;
6345                 goto out;
6346         }
6347         /*
6348          * We want to momentarily drop zonehash_lock while we optimistically
6349          * bind curproc to the pool it should be running in.  This is safe
6350          * since the zone can't disappear (we have a hold on it).
6351          */
6352         zone_hold(zone);
6353         mutex_exit(&zonehash_lock);
6354 
6355         /*
6356          * Grab pool_lock to keep the pools configuration from changing
6357          * and to stop ourselves from getting rebound to another pool
6358          * until we join the zone.
6359          */
6360         if (pool_lock_intr() != 0) {
6361                 zone_rele(zone);
6362                 err = EINTR;
6363                 goto out;
6364         }
6365         ASSERT(secpolicy_pool(CRED()) == 0);
6366         /*
6367          * Bind ourselves to the pool currently associated with the zone.
6368          */
6369         oldpool = curproc->p_pool;
6370         newpool = zone_pool_get(zone);
6371         if (pool_state == POOL_ENABLED && newpool != oldpool &&
6372             (err = pool_do_bind(newpool, P_PID, P_MYID,
6373             POOL_BIND_ALL)) != 0) {
6374                 pool_unlock();
6375                 zone_rele(zone);
6376                 goto out;
6377         }
6378 
6379         /*
6380          * Grab cpu_lock now; we'll need it later when we call
6381          * task_join().
6382          */
6383         mutex_enter(&cpu_lock);
6384         mutex_enter(&zonehash_lock);
6385         /*
6386          * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6387          */
6388         if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6389                 /*
6390                  * Can't join anymore.
6391                  */
6392                 mutex_exit(&zonehash_lock);
6393                 mutex_exit(&cpu_lock);
6394                 if (pool_state == POOL_ENABLED &&
6395                     newpool != oldpool)
6396                         (void) pool_do_bind(oldpool, P_PID, P_MYID,
6397                             POOL_BIND_ALL);
6398                 pool_unlock();
6399                 zone_rele(zone);
6400                 err = EINVAL;
6401                 goto out;
6402         }
6403 
6404         /*
6405          * a_lock must be held while transferring locked memory and swap
6406          * reservation from the global zone to the non-global zone because
6407          * asynchronous faults on the process's address space can lock
6408          * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6409          * segments respectively.
6410          */
6411         AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6412         swap = as_swresv();
6413         mutex_enter(&pp->p_lock);
6414         zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6415         /* verify that we do not exceed any task or lwp limits */
6416         mutex_enter(&zone->zone_nlwps_lock);
6417         /* add new lwps to zone and zone's proj0 */
6418         zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6419         zone->zone_nlwps += pp->p_lwpcnt;
6420         /* add 1 task to zone's proj0 */
6421         zone_proj0->kpj_ntasks += 1;
6422 
6423         zone_proj0->kpj_nprocs++;
6424         zone->zone_nprocs++;
6425         mutex_exit(&zone->zone_nlwps_lock);
6426 
6427         mutex_enter(&zone->zone_mem_lock);
6428         zone->zone_locked_mem += pp->p_locked_mem;
6429         zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6430         zone->zone_max_swap += swap;
6431         mutex_exit(&zone->zone_mem_lock);
6432 
6433         mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6434         zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6435         mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6436 
6437         /* remove lwps and process from proc's old zone and old project */
6438         mutex_enter(&pp->p_zone->zone_nlwps_lock);
6439         pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6440         pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6441         pp->p_task->tk_proj->kpj_nprocs--;
6442         pp->p_zone->zone_nprocs--;
6443         mutex_exit(&pp->p_zone->zone_nlwps_lock);
6444 
6445         mutex_enter(&pp->p_zone->zone_mem_lock);
6446         pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6447         pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6448         pp->p_zone->zone_max_swap -= swap;
6449         mutex_exit(&pp->p_zone->zone_mem_lock);
6450 
6451         mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6452         pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6453         mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6454 
6455         pp->p_flag |= SZONETOP;
6456         pp->p_zone = zone;
6457         mutex_exit(&pp->p_lock);
6458         AS_LOCK_EXIT(pp->p_as);
6459 
6460         /*
6461          * Joining the zone cannot fail from now on.
6462          *
6463          * This means that a lot of the following code can be commonized and
6464          * shared with zsched().
6465          */
6466 
6467         /*
6468          * If the process contract fmri was inherited, we need to
6469          * flag this so that any contract status will not leak
6470          * extra zone information (svc_fmri in this case).
6471          */
6472         if (ctp->conp_svc_ctid != ct->ct_id) {
6473                 mutex_enter(&ct->ct_lock);
6474                 ctp->conp_svc_zone_enter = ct->ct_id;
6475                 mutex_exit(&ct->ct_lock);
6476         }
6477 
6478         /*
6479          * Reset the encapsulating process contract's zone.
6480          */
6481         ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6482         contract_setzuniqid(ct, zone->zone_uniqid);
6483 
6484         /*
6485          * Create a new task and associate the process with the project keyed
6486          * by (projid,zoneid).
6487          *
6488          * We might as well be in project 0; the global zone's projid doesn't
6489          * make much sense in a zone anyhow.
6490          *
6491          * This also increments zone_ntasks, and returns with p_lock held.
6492          */
6493         tk = task_create(0, zone);
6494         oldtk = task_join(tk, 0);
6495         mutex_exit(&cpu_lock);
6496 
6497         /*
6498          * call RCTLOP_SET functions on this proc
6499          */
6500         e.rcep_p.zone = zone;
6501         e.rcep_t = RCENTITY_ZONE;
6502         (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6503             RCD_CALLBACK);
6504         mutex_exit(&pp->p_lock);
6505 
6506         /*
6507          * We don't need to hold any of zsched's locks here; not only do we know
6508          * the process and zone aren't going away, we know its session isn't
6509          * changing either.
6510          *
6511          * By joining zsched's session here, we mimic the behavior in the
6512          * global zone of init's sid being the pid of sched.  We extend this
6513          * to all zlogin-like zone_enter()'ing processes as well.
6514          */
6515         mutex_enter(&pidlock);
6516         sp = zone->zone_zsched->p_sessp;
6517         sess_hold(zone->zone_zsched);
6518         mutex_enter(&pp->p_lock);
6519         pgexit(pp);
6520         sess_rele(pp->p_sessp, B_TRUE);
6521         pp->p_sessp = sp;
6522         pgjoin(pp, zone->zone_zsched->p_pidp);
6523 
6524         /*
6525          * If any threads are scheduled to be placed on the zone's wait
6526          * queue, they should abandon the idea since the wait queue is
6527          * changing.  We need to be holding pidlock & p_lock to do this.
6528          */
6529         if ((t = pp->p_tlist) != NULL) {
6530                 do {
6531                         thread_lock(t);
6532                         /*
6533                          * Kick this thread so that it doesn't sit
6534                          * on the wrong wait queue.
6535                          */
6536                         if (ISWAITING(t))
6537                                 setrun_locked(t);
6538 
6539                         if (t->t_schedflag & TS_ANYWAITQ)
6540                         t->t_schedflag &= ~TS_ANYWAITQ;
6541 
6542                         thread_unlock(t);
6543                 } while ((t = t->t_forw) != pp->p_tlist);
6544         }
6545 
6546         /*
6547          * If there is a default scheduling class for the zone and it is not
6548          * the class we are currently in, change all of the threads in the
6549          * process to the new class.  We need to be holding pidlock & p_lock
6550          * when we call parmsset so this is a good place to do it.
6551          */
6552         if (zone->zone_defaultcid > 0 &&
6553             zone->zone_defaultcid != curthread->t_cid) {
6554                 pcparms_t pcparms;
6555 
6556                 pcparms.pc_cid = zone->zone_defaultcid;
6557                 pcparms.pc_clparms[0] = 0;
6558 
6559                 /*
6560                  * If setting the class fails, we still want to enter the zone.
6561                  */
6562                 if ((t = pp->p_tlist) != NULL) {
6563                         do {
6564                                 (void) parmsset(&pcparms, t);
6565                         } while ((t = t->t_forw) != pp->p_tlist);
6566                 }
6567         }
6568 
6569         mutex_exit(&pp->p_lock);
6570         mutex_exit(&pidlock);
6571 
6572         mutex_exit(&zonehash_lock);
6573         /*
6574          * We're firmly in the zone; let pools progress.
6575          */
6576         pool_unlock();
6577         task_rele(oldtk);
6578         /*
6579          * We don't need to retain a hold on the zone since we already
6580          * incremented zone_ntasks, so the zone isn't going anywhere.
6581          */
6582         zone_rele(zone);
6583 
6584         /*
6585          * Chroot
6586          */
6587         vp = zone->zone_rootvp;
6588         zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6589         zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6590 
6591         /*
6592          * Change process credentials
6593          */
6594         newcr = cralloc();
6595         mutex_enter(&pp->p_crlock);
6596         cr = pp->p_cred;
6597         crcopy_to(cr, newcr);
6598         crsetzone(newcr, zone);
6599         pp->p_cred = newcr;
6600 
6601         /*
6602          * Restrict all process privilege sets to zone limit
6603          */
6604         priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6605         priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6606         priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6607         priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6608         mutex_exit(&pp->p_crlock);
6609         crset(pp, newcr);
6610 
6611         /*
6612          * Adjust upcount to reflect zone entry.
6613          */
6614         uid = crgetruid(newcr);
6615         mutex_enter(&pidlock);
6616         upcount_dec(uid, GLOBAL_ZONEID);
6617         upcount_inc(uid, zoneid);
6618         mutex_exit(&pidlock);
6619 
6620         /*
6621          * Set up core file path and content.
6622          */
6623         set_core_defaults();
6624 
6625 out:
6626         /*
6627          * Let the other lwps continue.
6628          */
6629         mutex_enter(&pp->p_lock);
6630         if (curthread != pp->p_agenttp)
6631                 continuelwps(pp);
6632         mutex_exit(&pp->p_lock);
6633 
6634         return (err != 0 ? set_errno(err) : 0);
6635 }
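
     /*
      * Illustrative sketch (editor's addition): a zlogin-like consumer of
      * this entry point typically forks, has the child enter the zone, and
      * then execs a shell.  Per the contract checks above, the child must
      * be the sole member of its process contract (the contract(4) setup
      * is not shown), and the caller needs the privileges demanded by
      * secpolicy_zone_config().  The libc wrapper name zone_enter() is
      * assumed.
      *
      *	pid_t child = fork();
      *
      *	if (child == 0) {
      *		if (zone_enter(zid) != 0)
      *			_exit(127);
      *		(void) execl("/sbin/sh", "sh", (char *)NULL);
      *		_exit(127);
      *	}
      */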
6636 
6637 /*
6638  * System call entry point for zone_list(2).
6639  *
6640  * Processes running in a (non-global) zone only see themselves.
6641  * On labeled systems, they see all zones whose label they dominate.
6642  */
6643 static int
6644 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6645 {
6646         zoneid_t *zoneids;
6647         zone_t *zone, *myzone;
6648         uint_t user_nzones, real_nzones;
6649         uint_t domi_nzones;
6650         int error;
6651 
6652         if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6653                 return (set_errno(EFAULT));
6654 
6655         myzone = curproc->p_zone;
6656         if (myzone != global_zone) {
6657                 bslabel_t *mybslab;
6658 
6659                 if (!is_system_labeled()) {
6660                         /* just return current zone */
6661                         real_nzones = domi_nzones = 1;
6662                         zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6663                         zoneids[0] = myzone->zone_id;
6664                 } else {
6665                         /* return all zones that are dominated */
6666                         mutex_enter(&zonehash_lock);
6667                         real_nzones = zonecount;
6668                         domi_nzones = 0;
6669                         if (real_nzones > 0) {
6670                                 zoneids = kmem_alloc(real_nzones *
6671                                     sizeof (zoneid_t), KM_SLEEP);
6672                                 mybslab = label2bslabel(myzone->zone_slabel);
6673                                 for (zone = list_head(&zone_active);
6674                                     zone != NULL;
6675                                     zone = list_next(&zone_active, zone)) {
6676                                         if (zone->zone_id == GLOBAL_ZONEID)
6677                                                 continue;
6678                                         if (zone != myzone &&
6679                                             (zone->zone_flags & ZF_IS_SCRATCH))
6680                                                 continue;
6681                                         /*
6682                                          * Note that a label always dominates
6683                                          * itself, so myzone is always included
6684                                          * in the list.
6685                                          */
6686                                         if (bldominates(mybslab,
6687                                             label2bslabel(zone->zone_slabel))) {
6688                                                 zoneids[domi_nzones++] =
6689                                                     zone->zone_id;
6690                                         }
6691                                 }
6692                         }
6693                         mutex_exit(&zonehash_lock);
6694                 }
6695         } else {
6696                 mutex_enter(&zonehash_lock);
6697                 real_nzones = zonecount;
6698                 domi_nzones = 0;
6699                 if (real_nzones > 0) {
6700                         zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6701                             KM_SLEEP);
6702                         for (zone = list_head(&zone_active); zone != NULL;
6703                             zone = list_next(&zone_active, zone))
6704                                 zoneids[domi_nzones++] = zone->zone_id;
6705                         ASSERT(domi_nzones == real_nzones);
6706                 }
6707                 mutex_exit(&zonehash_lock);
6708         }
6709 
6710         /*
6711          * If the user has allocated space for fewer entries than we found,
6712          * then return only up to that limit.  Either way, tell the caller
6713          * exactly how many we found.
6714          */
6715         if (domi_nzones < user_nzones)
6716                 user_nzones = domi_nzones;
6717         error = 0;
6718         if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6719                 error = EFAULT;
6720         } else if (zoneidlist != NULL && user_nzones != 0) {
6721                 if (copyout(zoneids, zoneidlist,
6722                     user_nzones * sizeof (zoneid_t)) != 0)
6723                         error = EFAULT;
6724         }
6725 
6726         if (real_nzones > 0)
6727                 kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6728 
6729         if (error != 0)
6730                 return (set_errno(error));
6731         else
6732                 return (0);
6733 }
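
     /*
      * Illustrative sketch (editor's addition): since *numzones is always
      * rewritten with the number of visible zones, a caller can probe for
      * the count, allocate, and then fetch the IDs, clamping in case the
      * zone list grew between the two calls (libc wrapper zone_list()
      * assumed; error handling elided):
      *
      *	uint_t cap = 0, n;
      *	zoneid_t *ids;
      *	uint_t i;
      *
      *	if (zone_list(NULL, &cap) != 0 || cap == 0)
      *		return;
      *	if ((ids = calloc(cap, sizeof (zoneid_t))) == NULL)
      *		return;
      *	n = cap;
      *	if (zone_list(ids, &n) == 0) {
      *		for (i = 0; i < (n < cap ? n : cap); i++)
      *			(void) printf("%d\n", (int)ids[i]);
      *	}
      *	free(ids);
      */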
6734 
6735 /*
6736  * System call entry point for zone_lookup(2).
6737  *
6738  * Non-global zones are only able to see themselves and (on labeled systems)
6739  * the zones they dominate.
6740  */
6741 static zoneid_t
6742 zone_lookup(const char *zone_name)
6743 {
6744         char *kname;
6745         zone_t *zone;
6746         zoneid_t zoneid;
6747         int err;
6748 
6749         if (zone_name == NULL) {
6750                 /* return caller's zone id */
6751                 return (getzoneid());
6752         }
6753 
6754         kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6755         if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6756                 kmem_free(kname, ZONENAME_MAX);
6757                 return (set_errno(err));
6758         }
6759 
6760         mutex_enter(&zonehash_lock);
6761         zone = zone_find_all_by_name(kname);
6762         kmem_free(kname, ZONENAME_MAX);
6763         /*
6764          * In a non-global zone, we can only look up the global zone and our
6765          * own name.  In Trusted Extensions, zone label dominance rules apply.
6766          */
6767         if (zone == NULL ||
6768             zone_status_get(zone) < ZONE_IS_READY ||
6769             !zone_list_access(zone)) {
6770                 mutex_exit(&zonehash_lock);
6771                 return (set_errno(EINVAL));
6772         } else {
6773                 zoneid = zone->zone_id;
6774                 mutex_exit(&zonehash_lock);
6775                 return (zoneid);
6776         }
6777 }
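
     /*
      * Illustrative sketch (editor's addition): getzoneidbyname(3C) is the
      * usual consumer of this entry point; passing NULL returns the
      * caller's own zone ID, per the check above.  The zone name here is
      * hypothetical:
      *
      *	zoneid_t zid = getzoneidbyname("webzone");
      *
      *	if (zid == -1)
      *		perror("getzoneidbyname");
      */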
6778 
6779 static int
6780 zone_version(int *version_arg)
6781 {
6782         int version = ZONE_SYSCALL_API_VERSION;
6783 
6784         if (copyout(&version, version_arg, sizeof (int)) != 0)
6785                 return (set_errno(EFAULT));
6786         return (0);
6787 }
6788 
6789 /* ARGSUSED */
6790 long
6791 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6792 {
6793         zone_def zs;
6794         int err;
6795 
6796         switch (cmd) {
6797         case ZONE_CREATE:
6798                 if (get_udatamodel() == DATAMODEL_NATIVE) {
6799                         if (copyin(arg1, &zs, sizeof (zone_def))) {
6800                                 return (set_errno(EFAULT));
6801                         }
6802                 } else {
6803 #ifdef _SYSCALL32_IMPL
6804                         zone_def32 zs32;
6805 
6806                         if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6807                                 return (set_errno(EFAULT));
6808                         }
6809                         zs.zone_name =
6810                             (const char *)(unsigned long)zs32.zone_name;
6811                         zs.zone_root =
6812                             (const char *)(unsigned long)zs32.zone_root;
6813                         zs.zone_privs =
6814                             (const struct priv_set *)
6815                             (unsigned long)zs32.zone_privs;
6816                         zs.zone_privssz = zs32.zone_privssz;
6817                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6818                         zs.rctlbufsz = zs32.rctlbufsz;
6819                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6820                         zs.zfsbufsz = zs32.zfsbufsz;
6821                         zs.extended_error =
6822                             (int *)(unsigned long)zs32.extended_error;
6823                         zs.match = zs32.match;
6824                         zs.doi = zs32.doi;
6825                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6826                         zs.flags = zs32.flags;
6827 #else
6828                         panic("get_udatamodel() returned bogus result\n");
6829 #endif
6830                 }
6831 
6832                 return (zone_create(zs.zone_name, zs.zone_root,
6833                     zs.zone_privs, zs.zone_privssz,
6834                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6835                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6836                     zs.extended_error, zs.match, zs.doi,
6837                     zs.label, zs.flags));
6838         case ZONE_BOOT:
6839                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6840         case ZONE_DESTROY:
6841                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6842         case ZONE_GETATTR:
6843                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6844                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6845         case ZONE_SETATTR:
6846                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6847                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6848         case ZONE_ENTER:
6849                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6850         case ZONE_LIST:
6851                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6852         case ZONE_SHUTDOWN:
6853                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6854         case ZONE_LOOKUP:
6855                 return (zone_lookup((const char *)arg1));
6856         case ZONE_VERSION:
6857                 return (zone_version((int *)arg1));
6858         case ZONE_ADD_DATALINK:
6859                 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6860                     (datalink_id_t)(uintptr_t)arg2));
6861         case ZONE_DEL_DATALINK:
6862                 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6863                     (datalink_id_t)(uintptr_t)arg2));
6864         case ZONE_CHECK_DATALINK: {
6865                 zoneid_t        zoneid;
6866                 boolean_t       need_copyout;
6867 
6868                 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6869                         return (set_errno(EFAULT));
6870                 need_copyout = (zoneid == ALL_ZONES);
6871                 err = zone_check_datalink(&zoneid,
6872                     (datalink_id_t)(uintptr_t)arg2);
6873                 if (err == 0 && need_copyout) {
6874                         if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6875                                 err = EFAULT;
6876                 }
6877                 return (err == 0 ? 0 : set_errno(err));
6878         }
6879         case ZONE_LIST_DATALINK:
6880                 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6881                     (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6882         default:
6883                 return (set_errno(EINVAL));
6884         }
6885 }
6886 
6887 struct zarg {
6888         zone_t *zone;
6889         zone_cmd_arg_t arg;
6890 };
6891 
6892 static int
6893 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6894 {
6895         char *buf;
6896         size_t buflen;
6897         int error;
6898 
6899         buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6900         buf = kmem_alloc(buflen, KM_SLEEP);
6901         (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6902         error = door_ki_open(buf, doorp);
6903         kmem_free(buf, buflen);
6904         return (error);
6905 }
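
     /*
      * Editor's note (illustrative): ZONE_DOOR_PATH is a snprintf template
      * keyed by zone name; assuming it expands along the lines of
      * "/var/run/zoneadmd.%s_door", a zone named "webzone" would yield
      * "/var/run/zoneadmd.webzone_door" as the door to its zoneadmd.
      */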
6906 
6907 static void
6908 zone_release_door(door_handle_t *doorp)
6909 {
6910         door_ki_rele(*doorp);
6911         *doorp = NULL;
6912 }
6913 
6914 static void
6915 zone_ki_call_zoneadmd(struct zarg *zargp)
6916 {
6917         door_handle_t door = NULL;
6918         door_arg_t darg, save_arg;
6919         char *zone_name;
6920         size_t zone_namelen;
6921         zoneid_t zoneid;
6922         zone_t *zone;
6923         zone_cmd_arg_t arg;
6924         uint64_t uniqid;
6925         size_t size;
6926         int error;
6927         int retry;
6928 
6929         zone = zargp->zone;
6930         arg = zargp->arg;
6931         kmem_free(zargp, sizeof (*zargp));
6932 
6933         zone_namelen = strlen(zone->zone_name) + 1;
6934         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6935         bcopy(zone->zone_name, zone_name, zone_namelen);
6936         zoneid = zone->zone_id;
6937         uniqid = zone->zone_uniqid;
6938         arg.status = zone->zone_init_status;
6939         /*
6940          * zoneadmd may be down, but at least we can empty out the zone.
6941          * We can ignore the return value of zone_empty() since we're called
6942          * from a kernel thread and know we won't be delivered any signals.
6943          */
6944         ASSERT(curproc == &p0);
6945         (void) zone_empty(zone);
6946         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6947         zone_rele(zone);
6948 
6949         size = sizeof (arg);
6950         darg.rbuf = (char *)&arg;
6951         darg.data_ptr = (char *)&arg;
6952         darg.rsize = size;
6953         darg.data_size = size;
6954         darg.desc_ptr = NULL;
6955         darg.desc_num = 0;
6956 
6957         save_arg = darg;
6958         /*
6959          * Since we're not holding a reference to the zone, any number of
6960          * things can go wrong, including the zone disappearing before we get a
6961          * chance to talk to zoneadmd.
6962          */
6963         for (retry = 0; /* forever */; retry++) {
6964                 if (door == NULL &&
6965                     (error = zone_lookup_door(zone_name, &door)) != 0) {
6966                         goto next;
6967                 }
6968                 ASSERT(door != NULL);
6969 
6970                 if ((error = door_ki_upcall_limited(door, &darg, NULL,
6971                     SIZE_MAX, 0)) == 0) {
6972                         break;
6973                 }
6974                 switch (error) {
6975                 case EINTR:
6976                         /* FALLTHROUGH */
6977                 case EAGAIN:    /* process may be forking */
6978                         /*
6979                          * Back off for a bit
6980                          */
6981                         break;
6982                 case EBADF:
6983                         zone_release_door(&door);
6984                         if (zone_lookup_door(zone_name, &door) != 0) {
6985                                 /*
6986                                  * zoneadmd may be dead, but it may come back to
6987                                  * life later.
6988                                  */
6989                                 break;
6990                         }
6991                         break;
6992                 default:
6993                         cmn_err(CE_WARN,
6994                             "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6995                             error);
6996                         goto out;
6997                 }
6998 next:
6999                 /*
7000                  * If this isn't the same zone_t that we originally had in mind,
7001                  * then this is the same as if two kadmin requests come in at
7002                  * the same time: the first one wins.  This means we lose, so we
7003                  * bail.
7004                  */
7005                 if ((zone = zone_find_by_id(zoneid)) == NULL) {
7006                         /*
7007                          * Problem is solved.
7008                          */
7009                         break;
7010                 }
7011                 if (zone->zone_uniqid != uniqid) {
7012                         /*
7013                          * zoneid recycled
7014                          */
7015                         zone_rele(zone);
7016                         break;
7017                 }
7018                 /*
7019                  * We could zone_status_timedwait(), but there doesn't seem to
7020                  * be much point in doing that (plus, it would mean that
7021                  * zone_free() isn't called until this thread exits).
7022                  */
7023                 zone_rele(zone);
7024                 delay(hz);
7025                 darg = save_arg;
7026         }
7027 out:
7028         if (door != NULL) {
7029                 zone_release_door(&door);
7030         }
7031         kmem_free(zone_name, zone_namelen);
7032         thread_exit();
7033 }
7034 
7035 /*
7036  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
7037  * kadmin().  The caller is a process in the zone.
7038  *
7039  * In order to shut down the zone, we will hand off control to zoneadmd
7040  * (running in the global zone) via a door.  We do a half-hearted job of
7041  * killing all processes in the zone, create a kernel thread to contact
7042  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
7043  * a form of generation number used to let zoneadmd (as well as
7044  * zone_destroy()) know exactly which zone they're talking about.
7045  */
7046 int
7047 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
7048 {
7049         struct zarg *zargp;
7050         zone_cmd_t zcmd;
7051         zone_t *zone;
7052 
7053         zone = curproc->p_zone;
7054         ASSERT(getzoneid() != GLOBAL_ZONEID);
7055 
7056         switch (cmd) {
7057         case A_SHUTDOWN:
7058                 switch (fcn) {
7059                 case AD_HALT:
7060                 case AD_POWEROFF:
7061                         zcmd = Z_HALT;
7062                         break;
7063                 case AD_BOOT:
7064                         zcmd = Z_REBOOT;
7065                         break;
7066                 case AD_IBOOT:
7067                 case AD_SBOOT:
7068                 case AD_SIBOOT:
7069                 case AD_NOSYNC:
7070                         return (ENOTSUP);
7071                 default:
7072                         return (EINVAL);
7073                 }
7074                 break;
7075         case A_REBOOT:
7076                 zcmd = Z_REBOOT;
7077                 break;
7078         case A_FTRACE:
7079         case A_REMOUNT:
7080         case A_FREEZE:
7081         case A_DUMP:
7082         case A_CONFIG:
7083                 return (ENOTSUP);
7084         default:
7085                 ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
7086                 return (EINVAL);
7087         }
7088 
7089         if (secpolicy_zone_admin(credp, B_FALSE))
7090                 return (EPERM);
7091         mutex_enter(&zone_status_lock);
7092 
7093         /*
7094          * zone_status can't be ZONE_IS_EMPTY or higher since curproc
7095          * is in the zone.
7096          */
7097         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7098         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7099                 /*
7100                  * This zone is already on its way down.
7101                  */
7102                 mutex_exit(&zone_status_lock);
7103                 return (0);
7104         }
7105         /*
7106          * Prevent future zone_enter()s
7107          */
7108         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7109         mutex_exit(&zone_status_lock);
7110 
7111         /*
7112          * Kill everyone now and call zoneadmd later.
7113          * zone_ki_call_zoneadmd() will do a more thorough job of this
7114          * later.
7115          */
7116         killall(zone->zone_id);
7117         /*
7118          * Now, create the thread to contact zoneadmd and do the rest of the
7119          * work.  This thread can't be created in our zone; otherwise,
7120          * zone_destroy() would deadlock.
7121          */
7122         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7123         zargp->arg.cmd = zcmd;
7124         zargp->arg.uniqid = zone->zone_uniqid;
7125         zargp->zone = zone;
7126         (void) strcpy(zargp->arg.locale, "C");
7127         /* mdep was already copied in for us by uadmin */
7128         if (mdep != NULL)
7129                 (void) strlcpy(zargp->arg.bootbuf, mdep,
7130                     sizeof (zargp->arg.bootbuf));
7131         zone_hold(zone);
7132 
7133         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7134             TS_RUN, minclsyspri);
7135         exit(CLD_EXITED, 0);
7136 
7137         return (EINVAL);
7138 }
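
     /*
      * Illustrative sketch (editor's addition): a sufficiently privileged
      * process inside a zone reaches this function through uadmin(2);
      * A_SHUTDOWN/AD_BOOT maps to Z_REBOOT above, rebooting the zone
      * rather than the machine:
      *
      *	#include <sys/uadmin.h>
      *
      *	if (uadmin(A_SHUTDOWN, AD_BOOT, 0) == -1)
      *		perror("uadmin");
      */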
7139 
7140 /*
7141  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
7142  * status to ZONE_IS_SHUTTING_DOWN.
7143  *
7144  * This function also shuts down all running zones to ensure that they won't
7145  * fork new processes.
7146  */
void
zone_shutdown_global(void)
{
        zone_t *current_zonep;

        ASSERT(INGLOBALZONE(curproc));
        mutex_enter(&zonehash_lock);
        mutex_enter(&zone_status_lock);

        /* Modify the global zone's status first. */
        ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
        zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);

        /*
         * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
         * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
         * could cause assertions to fail (e.g., assertions about a zone's
         * state during initialization, readying, or booting) or produce races.
         * We'll let threads continue to initialize and ready new zones: they'll
         * fail to boot the new zones when they see that the global zone is
         * shutting down.
         */
        for (current_zonep = list_head(&zone_active); current_zonep != NULL;
            current_zonep = list_next(&zone_active, current_zonep)) {
                if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
                        zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
        }
        mutex_exit(&zone_status_lock);
        mutex_exit(&zonehash_lock);
}

/*
 * Returns true if the named dataset is visible in the specified zone.
 * If the dataset is visible and 'write' is non-NULL, then '*write' is
 * set to 1 if the dataset is also writable, or to 0 if it is read-only.
 */
int
zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
{
        static int zfstype = -1;
        zone_dataset_t *zd;
        size_t len;
        const char *name = NULL;
        vfs_t *vfsp = NULL;

        if (dataset[0] == '\0')
                return (0);

        /*
         * Walk the list once, looking for datasets which match exactly, or
         * specify a dataset underneath an exported dataset.  If found, return
         * true and note that it is writable.
         */
        for (zd = list_head(&zone->zone_datasets); zd != NULL;
            zd = list_next(&zone->zone_datasets, zd)) {

                len = strlen(zd->zd_dataset);
                if (strlen(dataset) >= len &&
                    bcmp(dataset, zd->zd_dataset, len) == 0 &&
                    (dataset[len] == '\0' || dataset[len] == '/' ||
                    dataset[len] == '@')) {
                        if (write)
                                *write = 1;
                        return (1);
                }
        }

        /*
         * Walk the list a second time, searching for datasets which are
         * parents of exported datasets.  These should be visible, but
         * read-only.
         *
         * Note that we also have to support forms such as 'pool/dataset/',
         * with a trailing slash.
         */
        for (zd = list_head(&zone->zone_datasets); zd != NULL;
            zd = list_next(&zone->zone_datasets, zd)) {

                len = strlen(dataset);
                if (dataset[len - 1] == '/')
                        len--;  /* Ignore trailing slash */
                if (len < strlen(zd->zd_dataset) &&
                    bcmp(dataset, zd->zd_dataset, len) == 0 &&
                    zd->zd_dataset[len] == '/') {
                        if (write)
                                *write = 0;
                        return (1);
                }
        }

        /*
         * We reach here if the given dataset is not found in the zone_dataset
         * list.  Check whether this dataset was added as a filesystem (i.e.,
         * "add fs") rather than delegated.  To do so, search for the dataset
         * in the zone_vfslist of this zone.  If found, return true and note
         * that it is not writable.
         */

        /*
         * Lazily initialize zfstype the first time it is needed.
         */
        if (zfstype == -1) {
                struct vfssw *vswp = vfs_getvfssw("zfs");
                zfstype = vswp - vfssw;
                vfs_unrefvfssw(vswp);
        }

        vfs_list_read_lock();
        vfsp = zone->zone_vfslist;
        do {
                if (vfsp == NULL)
                        break;
                if (vfsp->vfs_fstype == zfstype) {
                        name = refstr_value(vfsp->vfs_resource);

                        /*
                         * Check if we have an exact match.
                         */
                        if (strcmp(dataset, name) == 0) {
                                vfs_list_unlock();
                                if (write)
                                        *write = 0;
                                return (1);
                        }
                        /*
                         * We need to check if we are looking for parents of
                         * a dataset. These should be visible, but read-only.
                         */
                        len = strlen(dataset);
                        if (dataset[len - 1] == '/')
                                len--;

                        if (len < strlen(name) &&
                            bcmp(dataset, name, len) == 0 && name[len] == '/') {
                                vfs_list_unlock();
                                if (write)
                                        *write = 0;
                                return (1);
                        }
                }
                vfsp = vfsp->vfs_zone_next;
        } while (vfsp != zone->zone_vfslist);

        vfs_list_unlock();
        return (0);
}

/*
 * Returns true if the named dataset is visible in the current zone.
 * If the dataset is visible and 'write' is non-NULL, then '*write' is
 * set to 1 if the dataset is also writable, or to 0 if it is read-only.
 */
int
zone_dataset_visible(const char *dataset, int *write)
{
        zone_t *zone = curproc->p_zone;

        return (zone_dataset_visible_inzone(zone, dataset, write));
}
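
/*
 * Illustrative sketch (not part of this file's interfaces): a hypothetical
 * in-kernel consumer deciding whether to expose a dataset might use
 * zone_dataset_visible() roughly as follows.  The function name and error
 * choices below are assumptions for illustration only.
 */
#if 0
static int
example_dataset_access(const char *dsname)
{
        int writable = 0;

        if (!zone_dataset_visible(dsname, &writable))
                return (ENOENT);        /* not visible in this zone */
        if (!writable)
                return (EROFS);         /* e.g. parent of a delegated dataset */
        return (0);
}
#endif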

/*
 * zone_find_by_any_path() -
 *
 * kernel-private routine similar to zone_find_by_path(), but which
 * effectively compares against zone paths rather than zonerootpath
 * (i.e., the last component of zonerootpaths, which should be "root/",
 * is not compared).  This is done in order to accurately identify all
 * paths, whether zone-visible or not, including those which are parallel
 * to /root/, such as /dev/, /home/, etc.
 *
 * If the specified path does not fall under any zone path then the
 * global zone is returned.
 *
 * The treat_abs parameter indicates whether the path should be treated as
 * an absolute path even though it does not begin with "/".  (This supports
 * nfs mount syntax such as host:any/path.)
 *
 * The caller is responsible for zone_rele of the returned zone.
 */
zone_t *
zone_find_by_any_path(const char *path, boolean_t treat_abs)
{
        zone_t *zone;
        int path_offset = 0;

        if (path == NULL) {
                zone_hold(global_zone);
                return (global_zone);
        }

        if (*path != '/') {
                ASSERT(treat_abs);
                path_offset = 1;
        }

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                char    *c;
                size_t  pathlen;
                char *rootpath_start;

                if (zone == global_zone)        /* skip global zone */
                        continue;

                /* scan backwards to find start of last component */
                c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
                do {
                        c--;
                } while (*c != '/');

                pathlen = c - zone->zone_rootpath + 1 - path_offset;
                rootpath_start = (zone->zone_rootpath + path_offset);
                if (strncmp(path, rootpath_start, pathlen) == 0)
                        break;
        }
        if (zone == NULL)
                zone = global_zone;
        zone_hold(zone);
        mutex_exit(&zonehash_lock);
        return (zone);
}
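
/*
 * Illustrative sketch (hypothetical caller): the zone returned by
 * zone_find_by_any_path() is held, so the caller must drop the reference
 * with zone_rele() when finished.  The path below is a placeholder.
 */
#if 0
static void
example_lookup(void)
{
        zone_t *zp;

        zp = zone_find_by_any_path("/zones/foo/dev", B_FALSE);
        /* ... examine zp ... */
        zone_rele(zp);
}
#endif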

/*
 * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
 * zone_dl_t pointer if found, and NULL otherwise.
 */
static zone_dl_t *
zone_find_dl(zone_t *zone, datalink_id_t linkid)
{
        zone_dl_t *zdl;

        ASSERT(mutex_owned(&zone->zone_lock));
        for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
            zdl = list_next(&zone->zone_dl_list, zdl)) {
                if (zdl->zdl_id == linkid)
                        break;
        }
        return (zdl);
}

static boolean_t
zone_dl_exists(zone_t *zone, datalink_id_t linkid)
{
        boolean_t exists;

        mutex_enter(&zone->zone_lock);
        exists = (zone_find_dl(zone, linkid) != NULL);
        mutex_exit(&zone->zone_lock);
        return (exists);
}

/*
 * Add a datalink ID to the zone.
 */
static int
zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
{
        zone_dl_t *zdl;
        zone_t *zone;
        zone_t *thiszone;

        if ((thiszone = zone_find_by_id(zoneid)) == NULL)
                return (set_errno(ENXIO));

        /* Verify that the datalink ID doesn't already belong to a zone. */
        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                if (zone_dl_exists(zone, linkid)) {
                        mutex_exit(&zonehash_lock);
                        zone_rele(thiszone);
                        return (set_errno((zone == thiszone) ? EEXIST : EPERM));
                }
        }

        zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
        zdl->zdl_id = linkid;
        zdl->zdl_net = NULL;
        mutex_enter(&thiszone->zone_lock);
        list_insert_head(&thiszone->zone_dl_list, zdl);
        mutex_exit(&thiszone->zone_lock);
        mutex_exit(&zonehash_lock);
        zone_rele(thiszone);
        return (0);
}
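
/*
 * Illustrative sketch of zone_add_datalink()'s error semantics (pseudo-code;
 * in practice these static functions are reached through the zone syscall):
 * assigning a link twice to the same zone fails with EEXIST, while assigning
 * a link owned by another zone fails with EPERM.
 */
#if 0
        (void) zone_add_datalink(zid, linkid);          /* 0: assigned */
        (void) zone_add_datalink(zid, linkid);          /* EEXIST */
        (void) zone_add_datalink(other_zid, linkid);    /* EPERM */
#endif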

static int
zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
{
        zone_dl_t *zdl;
        zone_t *zone;
        int err = 0;

        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (set_errno(EINVAL));

        mutex_enter(&zone->zone_lock);
        if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
                err = ENXIO;
        } else {
                list_remove(&zone->zone_dl_list, zdl);
                nvlist_free(zdl->zdl_net);
                kmem_free(zdl, sizeof (zone_dl_t));
        }
        mutex_exit(&zone->zone_lock);
        zone_rele(zone);
        return (err == 0 ? 0 : set_errno(err));
}

/*
 * If *zoneidp is ALL_ZONES, look up which zone (if any) has been assigned
 * the linkid and return that zone's ID in *zoneidp.  Otherwise, simply check
 * whether the zone named by *zoneidp has been assigned the supplied linkid.
 */
int
zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
        zone_t *zone;
        int err = ENXIO;

        if (*zoneidp != ALL_ZONES) {
                if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
                        if (zone_dl_exists(zone, linkid))
                                err = 0;
                        zone_rele(zone);
                }
                return (err);
        }

        mutex_enter(&zonehash_lock);
        for (zone = list_head(&zone_active); zone != NULL;
            zone = list_next(&zone_active, zone)) {
                if (zone_dl_exists(zone, linkid)) {
                        *zoneidp = zone->zone_id;
                        err = 0;
                        break;
                }
        }
        mutex_exit(&zonehash_lock);
        return (err);
}
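
/*
 * Illustrative sketch: resolving which zone owns a datalink.  Passing
 * ALL_ZONES asks "which zone, if any, owns this link?"; on success the
 * owner's zone ID is written back through the pointer.  Names below are
 * hypothetical.
 */
#if 0
static zoneid_t
example_link_owner(datalink_id_t linkid)
{
        zoneid_t zid = ALL_ZONES;

        if (zone_check_datalink(&zid, linkid) == 0)
                return (zid);           /* linkid is assigned to zone zid */
        return (ALL_ZONES);             /* not assigned to any zone */
}
#endif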

/*
 * Get the list of datalink IDs assigned to a zone.
 *
 * On input, *nump is the number of datalink IDs that can fit in the supplied
 * idarray.  Upon return, *nump is either set to the number of datalink IDs
 * that were placed in the array if the array was large enough, or to the
 * number of datalink IDs that the function needs to place in the array if the
 * array is too small.
 */
static int
zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
{
        uint_t num, dlcount;
        zone_t *zone;
        zone_dl_t *zdl;
        datalink_id_t *idptr = idarray;

        if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
                return (set_errno(EFAULT));
        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (set_errno(ENXIO));

        num = 0;
        mutex_enter(&zone->zone_lock);
        for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
            zdl = list_next(&zone->zone_dl_list, zdl)) {
                /*
                 * If the list is bigger than what the caller supplied, just
                 * count, don't do copyout.
                 */
                if (++num > dlcount)
                        continue;
                if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
                        mutex_exit(&zone->zone_lock);
                        zone_rele(zone);
                        return (set_errno(EFAULT));
                }
                idptr++;
        }
        mutex_exit(&zone->zone_lock);
        zone_rele(zone);

        /* If the count differs from what the caller supplied, let it know. */
        if (num != dlcount) {
                if (copyout(&num, nump, sizeof (num)) != 0)
                        return (set_errno(EFAULT));
        }
        return (0);
}
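
/*
 * Illustrative sketch of the sizing protocol from a caller's point of view
 * (pseudo-code; the userland wrapper name is an assumption): probe with a
 * count of zero to learn how many IDs are needed, then allocate and fetch.
 */
#if 0
        int num = 0;
        datalink_id_t *ids;

        (void) zone_list_datalinks(zoneid, &num, NULL); /* learn the count */
        ids = malloc(num * sizeof (datalink_id_t));
        if (zone_list_datalinks(zoneid, &num, ids) != 0)
                return (-1);                            /* handle error */
#endif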

/*
 * Public interface for looking up a zone by zoneid.  It's a customized
 * version for netstack_zone_create(), and can only be called from the ZSD
 * create callbacks: since it doesn't take a reference on the zone, a caller
 * anywhere else could see the zone disappear once the zonehash_lock is
 * dropped.
 *
 * Furthermore it
 * 1. Doesn't check the status of the zone.
 * 2. May be called even before zone_init(); in that case the address of
 *    zone0 is returned directly, and netstack_zone_create() will only
 *    assign a value to zone0.zone_netstack, which won't break anything.
 * 3. Returns without the zone being held.
 */
zone_t *
zone_find_by_id_nolock(zoneid_t zoneid)
{
        zone_t *zone;

        mutex_enter(&zonehash_lock);
        if (zonehashbyid == NULL)
                zone = &zone0;
        else
                zone = zone_find_all_by_id(zoneid);
        mutex_exit(&zonehash_lock);
        return (zone);
}

/*
 * Walk the datalinks for a given zone.
 */
int
zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
    void *data)
{
        zone_t          *zone;
        zone_dl_t       *zdl;
        datalink_id_t   *idarray;
        uint_t          idcount = 0;
        int             i, ret = 0;

        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (ENOENT);

        /*
         * We first build an array of linkids so that we can walk these and
         * execute the callback with the zone_lock dropped.
         */
        mutex_enter(&zone->zone_lock);
        for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
            zdl = list_next(&zone->zone_dl_list, zdl)) {
                idcount++;
        }

        if (idcount == 0) {
                mutex_exit(&zone->zone_lock);
                zone_rele(zone);
                return (0);
        }

        idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
        if (idarray == NULL) {
                mutex_exit(&zone->zone_lock);
                zone_rele(zone);
                return (ENOMEM);
        }

        for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
            i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
                idarray[i] = zdl->zdl_id;
        }

        mutex_exit(&zone->zone_lock);

        for (i = 0; i < idcount && ret == 0; i++) {
                if ((ret = (*cb)(idarray[i], data)) != 0)
                        break;
        }

        zone_rele(zone);
        kmem_free(idarray, sizeof (datalink_id_t) * idcount);
        return (ret);
}
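
/*
 * Illustrative sketch: a minimal zone_datalink_walk() callback.  A non-zero
 * return from the callback terminates the walk and is propagated back to
 * the caller; the names below are hypothetical.
 */
#if 0
static int
example_count_cb(datalink_id_t linkid, void *arg)
{
        uint_t *countp = arg;

        (*countp)++;
        return (0);             /* keep walking */
}

        /* caller: */
        uint_t count = 0;
        (void) zone_datalink_walk(zoneid, example_count_cb, &count);
#endif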

static char *
zone_net_type2name(int type)
{
        switch (type) {
        case ZONE_NETWORK_ADDRESS:
                return (ZONE_NET_ADDRNAME);
        case ZONE_NETWORK_DEFROUTER:
                return (ZONE_NET_RTRNAME);
        default:
                return (NULL);
        }
}

static int
zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
{
        zone_t *zone;
        zone_dl_t *zdl;
        nvlist_t *nvl;
        int err = 0;
        uint8_t *new = NULL;
        char *nvname;
        int bufsize;
        datalink_id_t linkid = znbuf->zn_linkid;

        if (secpolicy_zone_config(CRED()) != 0)
                return (set_errno(EPERM));

        if (zoneid == GLOBAL_ZONEID)
                return (set_errno(EINVAL));

        nvname = zone_net_type2name(znbuf->zn_type);
        bufsize = znbuf->zn_len;
        new = znbuf->zn_val;
        if (nvname == NULL)
                return (set_errno(EINVAL));

        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (set_errno(EINVAL));

        mutex_enter(&zone->zone_lock);
        if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
                err = ENXIO;
                goto done;
        }
        if ((nvl = zdl->zdl_net) == NULL) {
                if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
                        err = ENOMEM;
                        goto done;
                } else {
                        zdl->zdl_net = nvl;
                }
        }
        if (nvlist_exists(nvl, nvname)) {
                err = EINVAL;
                goto done;
        }
        err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
        ASSERT(err == 0);
done:
        mutex_exit(&zone->zone_lock);
        zone_rele(zone);
        if (err != 0)
                return (set_errno(err));
        else
                return (0);
}
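
/*
 * Illustrative sketch: how a caller might fill in a zone_net_data_t before
 * handing it to zone_set_network().  This assumes zn_val is the trailing
 * value buffer of the structure, as the usage above suggests; the sizes and
 * address bytes are placeholders.
 */
#if 0
        zone_net_data_t *zn;

        zn = kmem_zalloc(sizeof (*zn) + addrlen, KM_SLEEP);
        zn->zn_linkid = linkid;
        zn->zn_type = ZONE_NETWORK_ADDRESS;
        zn->zn_len = addrlen;
        bcopy(addr, zn->zn_val, addrlen);
        err = zone_set_network(zoneid, zn);
#endif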

static int
zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
{
        zone_t *zone;
        zone_dl_t *zdl;
        nvlist_t *nvl;
        uint8_t *ptr;
        uint_t psize;
        int err = 0;
        char *nvname;
        int bufsize;
        void *buf;
        datalink_id_t linkid = znbuf->zn_linkid;

        if (zoneid == GLOBAL_ZONEID)
                return (set_errno(EINVAL));

        nvname = zone_net_type2name(znbuf->zn_type);
        bufsize = znbuf->zn_len;
        buf = znbuf->zn_val;

        if (nvname == NULL)
                return (set_errno(EINVAL));
        if ((zone = zone_find_by_id(zoneid)) == NULL)
                return (set_errno(EINVAL));

        mutex_enter(&zone->zone_lock);
        if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
                err = ENXIO;
                goto done;
        }
        if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
                err = ENOENT;
                goto done;
        }
        err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
        ASSERT(err == 0);

        if (psize > bufsize) {
                err = ENOBUFS;
                goto done;
        }
        znbuf->zn_len = psize;
        bcopy(ptr, buf, psize);
done:
        mutex_exit(&zone->zone_lock);
        zone_rele(zone);
        if (err != 0)
                return (set_errno(err));
        else
                return (0);
}
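
/*
 * Illustrative sketch: retrieving a previously saved value with
 * zone_get_network().  zn_len carries the buffer capacity on input and the
 * actual length copied out on success; ENOBUFS indicates the supplied buffer
 * was too small.  Names below are placeholders.
 */
#if 0
        zn->zn_linkid = linkid;
        zn->zn_type = ZONE_NETWORK_DEFROUTER;
        zn->zn_len = buflen;                    /* capacity on input */
        if (zone_get_network(zoneid, zn) == 0) {
                /* zn->zn_val holds zn->zn_len bytes of data */
        }
#endif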