illumos-gate Wdiff usr/src/uts/common/os/zone.c

Print this page

OS-3342+co

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2013, Joyent Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Zones
  29   29   *
  30   30   *   A zone is a named collection of processes, namespace constraints,
  31   31   *   and other system resources which comprise a secure and manageable
  32   32   *   application containment facility.
  33   33   *
  34   34   *   Zones (represented by the reference counted zone_t) are tracked in
  35   35   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36   36   *   (zoneid_t) are used to track zone association.  Zone IDs are
  37   37   *   dynamically generated when the zone is created; if a persistent
  38   38   *   identifier is needed (core files, accounting logs, audit trail,
  39   39   *   etc.), the zone name should be used.
  40   40   *
  41   41   *
  42   42   *   Global Zone:
  43   43   *
  44   44   *   The global zone (zoneid 0) is automatically associated with all
  45   45   *   system resources that have not been bound to a user-created zone.
  46   46   *   This means that even systems where zones are not in active use
  47   47   *   have a global zone, and all processes, mounts, etc. are
  48   48   *   associated with that zone.  The global zone is generally
  49   49   *   unconstrained in terms of privileges and access, though the usual
  50   50   *   credential and privilege based restrictions apply.
  51   51   *
  52   52   *
  53   53   *   Zone States:
  54   54   *
  55   55   *   The states a zone may be in, and the transitions between them, are
  56   56   *   as follows:
  57   57   *
  58   58   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  59   59   *   initialized zone is added to the list of active zones on the system but
  60   60   *   isn't accessible.
  61   61   *
  62   62   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  63   63   *   not yet completed. Not possible to enter the zone, but attributes can
  64   64   *   be retrieved.
  65   65   *
  66   66   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  67   67   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  68   68   *   executed.  A zone remains in this state until it transitions into
  69   69   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  70   70   *
  71   71   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  72   72   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  73   73   *   state.
  74   74   *
  75   75   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  76   76   *   successfully started init.   A zone remains in this state until
  77   77   *   zone_shutdown() is called.
  78   78   *
  79   79   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  80   80   *   killing all processes running in the zone. The zone remains
  81   81   *   in this state until there are no more user processes running in the zone.
  82   82   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  83   83   *   Since zone_shutdown() is restartable, it may be called successfully
  84   84   *   multiple times for the same zone_t.  Setting of the zone's state to
  85   85   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  86   86   *   the zone's status without worrying about it being a moving target.
  87   87   *
  88   88   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  89   89   *   are no more user processes in the zone.  The zone remains in this
  90   90   *   state until there are no more kernel threads associated with the
  91   91   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  92   92   *   fail.
  93   93   *
  94   94   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  95   95   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  96   96   *   join the zone or create kernel threads therein.
  97   97   *
  98   98   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
  99   99   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 100  100   *   return NULL from now on.
 101  101   *
 102  102   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 103  103   *   processes or threads doing work on behalf of the zone.  The zone is
 104  104   *   removed from the list of active zones.  zone_destroy() returns, and
 105  105   *   the zone can be recreated.
 106  106   *
 107  107   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 108  108   *   callbacks are executed, and all memory associated with the zone is
 109  109   *   freed.
 110  110   *
 111  111   *   Threads can wait for the zone to enter a requested state by using
 112  112   *   zone_status_wait() or zone_status_timedwait() with the desired
 113  113   *   state passed in as an argument.  Zone state transitions are
 114  114   *   uni-directional; it is not possible to move back to an earlier state.
 115  115   *
 116  116   *
 117  117   *   Zone-Specific Data:
 118  118   *
 119  119   *   Subsystems needing to maintain zone-specific data can store that
 120  120   *   data using the ZSD mechanism.  This provides a zone-specific data
 121  121   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 122  122   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 123  123   *   to register callbacks to be invoked when a zone is created, shut
 124  124   *   down, or destroyed.  This can be used to initialize zone-specific
 125  125   *   data for new zones and to clean up when zones go away.
 126  126   *
 127  127   *
 128  128   *   Data Structures:
 129  129   *
 130  130   *   The per-zone structure (zone_t) is reference counted, and freed
 131  131   *   when all references are released.  zone_hold and zone_rele can be
 132  132   *   used to adjust the reference count.  In addition, reference counts
 133  133   *   associated with the cred_t structure are tracked separately using
 134  134   *   zone_cred_hold and zone_cred_rele.
 135  135   *
 136  136   *   Pointers to active zone_t's are stored in two hash tables; one
 137  137   *   for searching by id, the other for searching by name.  Lookups
 138  138   *   can be performed on either basis, using zone_find_by_id and
 139  139   *   zone_find_by_name.  Both return zone_t pointers with the zone
 140  140   *   held, so zone_rele should be called when the pointer is no longer
 141  141   *   needed.  Zones can also be searched by path; zone_find_by_path
 142  142   *   returns the zone with which a path name is associated (global
 143  143   *   zone if the path is not within some other zone's file system
 144  144   *   hierarchy).  This currently requires iterating through each zone,
 145  145   *   so it is slower than an id or name search via a hash table.
 146  146   *
 147  147   *
 148  148   *   Locking:
 149  149   *
 150  150   *   zonehash_lock: This is a top-level global lock used to protect the
 151  151   *       zone hash tables and lists.  Zones cannot be created or destroyed
 152  152   *       while this lock is held.
 153  153   *   zone_status_lock: This is a global lock protecting zone state.
 154  154   *       Zones cannot change state while this lock is held.  It also
 155  155   *       protects the list of kernel threads associated with a zone.
 156  156   *   zone_lock: This is a per-zone lock used to protect several fields of
 157  157   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 158  158   *       this lock means that the zone cannot go away.
 159  159   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 160  160   *       related to the zone.max-lwps rctl.
 161  161   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 163  163   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 164  164   *       currently just max_lofi
 165  165   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 166  166   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 167  167   *       list (a list of zones in the ZONE_IS_DEAD state).
 168  168   *
 169  169   *   Ordering requirements:
 170  170   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 171  171   *              zone_lock --> zsd_key_lock --> pidlock --> p_lock
 172  172   *
 173  173   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 174  174   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 175  175   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 176  176   *
 177  177   *   Blocking memory allocations are permitted while holding any of the
 178  178   *   zone locks.
 179  179   *
 180  180   *
 181  181   *   System Call Interface:
 182  182   *
 183  183   *   The zone subsystem can be managed and queried from user level with
 184  184   *   the following system calls (all subcodes of the primary "zone"
 185  185   *   system call):
 186  186   *   - zone_create: creates a zone with selected attributes (name,
 187  187   *     root path, privileges, resource controls, ZFS datasets)
 188  188   *   - zone_enter: allows the current process to enter a zone
 189  189   *   - zone_getattr: reports attributes of a zone
 190  190   *   - zone_setattr: set attributes of a zone
 191  191   *   - zone_boot: set 'init' running for the zone
 192  192   *   - zone_list: lists all zones active in the system
 193  193   *   - zone_lookup: looks up zone id based on name
 194  194   *   - zone_shutdown: initiates shutdown process (see states above)
 195  195   *   - zone_destroy: completes shutdown process (see states above)
 196  196   *
 197  197   */
 198  198  
 199  199  #include <sys/priv_impl.h>
 200  200  #include <sys/cred.h>
 201  201  #include <c2/audit.h>
 202  202  #include <sys/debug.h>
 203  203  #include <sys/file.h>
 204  204  #include <sys/kmem.h>
 205  205  #include <sys/kstat.h>
 206  206  #include <sys/mutex.h>
 207  207  #include <sys/note.h>
 208  208  #include <sys/pathname.h>
 209  209  #include <sys/proc.h>
 210  210  #include <sys/project.h>
 211  211  #include <sys/sysevent.h>
 212  212  #include <sys/task.h>
 213  213  #include <sys/systm.h>
 214  214  #include <sys/types.h>
 215  215  #include <sys/utsname.h>
 216  216  #include <sys/vnode.h>
 217  217  #include <sys/vfs.h>
 218  218  #include <sys/systeminfo.h>
 219  219  #include <sys/policy.h>
 220  220  #include <sys/cred_impl.h>
 221  221  #include <sys/contract_impl.h>
 222  222  #include <sys/contract/process_impl.h>
 223  223  #include <sys/class.h>
 224  224  #include <sys/pool.h>
 225  225  #include <sys/pool_pset.h>
 226  226  #include <sys/pset.h>
 227  227  #include <sys/strlog.h>
 228  228  #include <sys/sysmacros.h>
 229  229  #include <sys/callb.h>
 230  230  #include <sys/vmparam.h>
 231  231  #include <sys/corectl.h>
 232  232  #include <sys/ipc_impl.h>
 233  233  #include <sys/klpd.h>
 234  234  
 235  235  #include <sys/door.h>
 236  236  #include <sys/cpuvar.h>
 237  237  #include <sys/sdt.h>
 238  238  
 239  239  #include <sys/uadmin.h>
 240  240  #include <sys/session.h>
 241  241  #include <sys/cmn_err.h>
 242  242  #include <sys/modhash.h>
 243  243  #include <sys/sunddi.h>
 244  244  #include <sys/nvpair.h>
 245  245  #include <sys/rctl.h>
 246  246  #include <sys/fss.h>
 247  247  #include <sys/brand.h>
 248  248  #include <sys/zone.h>
 249  249  #include <net/if.h>
 250  250  #include <sys/cpucaps.h>
 251  251  #include <vm/seg.h>
 252  252  #include <sys/mac.h>
 253  253  
 254  254  /*
 255  255   * This constant specifies the number of seconds that threads waiting for
 256  256   * subsystems to release a zone's general-purpose references will wait before
 257  257   * they log the zone's reference counts.  The constant's value shouldn't
 258  258   * be so small that reference counts are unnecessarily reported for zones
 259  259   * whose references are slowly released.  On the other hand, it shouldn't be so
 260  260   * large that users reboot their systems out of frustration over hung zones
 261  261   * before the system logs the zones' reference counts.
 262  262   */
 263  263  #define ZONE_DESTROY_TIMEOUT_SECS       60
 264  264  
 265  265  /* One element of the list of data link IDs which are accessible from the zone */
 266  266  typedef struct zone_dl {
 267  267          datalink_id_t   zdl_id;         /* ID of the data link */
 268  268          nvlist_t        *zdl_net;       /* NOTE(review): presumably per-link network data -- confirm */
 269  269          list_node_t     zdl_linkage;    /* list linkage */
 270  270  } zone_dl_t;
 271  271  
 272  272  /*
 273  273   * cv used to signal that all references to the zone have been released.  This
 274  274   * needs to be global since there may be multiple waiters, and the first to
 275  275   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 276  276   */
 277  277  static kcondvar_t zone_destroy_cv;
 278  278  /*
 279  279   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 280  280   * but then we'd need another lock for zone_destroy_cv, and why bother?
            *
            * Per the "Locking" notes at the top of this file, this global lock also
            * protects zone state (zones cannot change state while it is held) and the
            * list of kernel threads associated with a zone.
 281  281   */
 282  282  static kmutex_t zone_status_lock;
 283  283  
 284  284  /*
 285  285   * ZSD-related global variables.
 286  286   */
 287  287  static kmutex_t zsd_key_lock;   /* protects the following two */
 288  288  /*
 289  289   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
            * Keys therefore start at 1; zone_key_create() asserts that the counter has
            * not wrapped back around to 0.
 290  290   */
 291  291  static zone_key_t zsd_keyval = 0;
 292  292  /*
 293  293   * Global list of registered keys.  We use this when a new zone is created.
            * zone_key_create() appends one template zsd_entry here per key.
 294  294   */
 295  295  static list_t zsd_registered_keys;
 296  296  
 297  297  int zone_hash_size = 256;       /* NOTE(review): presumably sizes the zone hash tables below -- confirm */
           /*
            * Active zones are hashed by name and by id (see "Data Structures" at
            * the top of this file).  NOTE(review): use of the by-label table is
            * not visible in this chunk.
            */
 298  298  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 299  299  static kmutex_t zonehash_lock;  /* protects the zone hash tables and lists */
 300  300  static uint_t zonecount;        /* NOTE(review): presumably the number of active zones -- confirm */
 301  301  static id_space_t *zoneid_space;        /* NOTE(review): presumably where zone IDs are allocated from -- confirm */
 302  302  
 303  303  /*
 304  304   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 305  305   * kernel proper runs, and which manages all other zones.
 306  306   *
 307  307   * Although not declared as static, the variable "zone0" should not be used
 308  308   * except for by code that needs to reference the global zone early on in boot,
 309  309   * before it is fully initialized.  All other consumers should use
 310  310   * 'global_zone'.
 311  311   */
 312  312  zone_t zone0;
 313  313  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 314  314  
 315  315  /*
 316  316   * List of active zones, protected by zonehash_lock.
 317  317   */
 318  318  static list_t zone_active;
 319  319  
 320  320  /*
 321  321   * List of destroyed zones that still have outstanding cred references.
 322  322   * Used for debugging.  Uses a separate lock to avoid lock ordering
 323  323   * problems in zone_free.
 324  324   */
 325  325  static list_t zone_deathrow;
 326  326  static kmutex_t zone_deathrow_lock;
 327  327  
 328  328  /* The number of zones is limited by the virtual interface limit in IP */
 329  329  uint_t maxzones = 8192;
 330  330  
 331  331  /* Event channel used to send zone state change notifications */
 332  332  evchan_t *zone_event_chan;
 333  333  
 334  334  /*
 335  335   * This table holds the mapping from kernel zone states to
 336  336   * states visible in the state notification API.
 337  337   * The idea is that we only expose "obvious" states and
 338  338   * do not expose states which are just implementation details.
            *
            * Entries are listed in kernel-state order (see the per-entry comments).
            * NOTE(review): presumably indexed by zone_status_t -- confirm.  Note
            * that "booting" is reported as ZONE_EVENT_READY, that "empty", "down"
            * and "dying" are all reported as ZONE_EVENT_SHUTTING_DOWN, and that
            * "dead" is reported as ZONE_EVENT_UNINITIALIZED.
 339  339   */
 340  340  const char  *zone_status_table[] = {
 341  341          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 342  342          ZONE_EVENT_INITIALIZED,         /* initialized */
 343  343          ZONE_EVENT_READY,               /* ready */
 344  344          ZONE_EVENT_READY,               /* booting */
 345  345          ZONE_EVENT_RUNNING,             /* running */
 346  346          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 347  347          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 350  350          ZONE_EVENT_UNINITIALIZED,       /* dead */
 351  351  };
 352  352  
 353  353  /*
 354  354   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 355  355   * (see sys/zone.h).
            *
            * NOTE(review): presumably indexed by zone_ref_subsys_t value, in which case
            * the order here must match the enum (each entry is tagged with its
            * ZONE_REF_* constant) -- confirm against sys/zone.h.
 356  356   */
 357  357  static char *zone_ref_subsys_names[] = {
 358  358          "NFS",          /* ZONE_REF_NFS */
 359  359          "NFSv4",        /* ZONE_REF_NFSV4 */
 360  360          "SMBFS",        /* ZONE_REF_SMBFS */
 361  361          "MNTFS",        /* ZONE_REF_MNTFS */
 362  362          "LOFI",         /* ZONE_REF_LOFI */
 363  363          "VFS",          /* ZONE_REF_VFS */
 364  364          "IPC"           /* ZONE_REF_IPC */
 365  365  };
 366  366  
 367  367  /*
 368  368   * Handles (rctl_hndl_t) for the zone rctls.  These aren't static so lint
            * doesn't complain.
            *
            * NOTE(review): judging by the names, rc_zone_nlwps, rc_zone_locked_mem,
            * rc_zone_max_swap and rc_zone_max_lofi presumably correspond to the
            * zone.max-lwps, zone.max-locked-memory, zone.max-swap and max_lofi rctls
            * mentioned under "Locking" at the top of this file -- confirm where the
            * handles are registered.
 369  369   */
 370  370  rctl_hndl_t rc_zone_cpu_shares;
 371  371  rctl_hndl_t rc_zone_locked_mem;
 372  372  rctl_hndl_t rc_zone_max_swap;
 373  373  rctl_hndl_t rc_zone_max_lofi;
 374  374  rctl_hndl_t rc_zone_cpu_cap;
 375  375  rctl_hndl_t rc_zone_nlwps;
 376  376  rctl_hndl_t rc_zone_nprocs;
 377  377  rctl_hndl_t rc_zone_shmmax;
 378  378  rctl_hndl_t rc_zone_shmmni;
 379  379  rctl_hndl_t rc_zone_semmni;
 380  380  rctl_hndl_t rc_zone_msgmni;
 381  381  
 382  382  const char * const zone_default_initname = "/sbin/init";    /* default init program */
 383  383  static char * const zone_prefix = "/zone/";     /* NOTE(review): use not visible in this chunk */
           /* Forward declarations of static functions defined elsewhere in this file. */
 384  384  static int zone_shutdown(zoneid_t zoneid);
 385  385  static int zone_add_datalink(zoneid_t, datalink_id_t);
 386  386  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 387  387  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 388  388  static int zone_set_network(zoneid_t, zone_net_data_t *);
 389  389  static int zone_get_network(zoneid_t, zone_net_data_t *);
 390  390  
 391  391  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 392  392  
           /*
            * Forward declarations for the ZSD helpers.  zsd_applyfn_t (above) is
            * the signature shared by zsd_apply_create(), zsd_apply_shutdown() and
            * zsd_apply_destroy(); zsd_apply_all_zones() and zsd_apply_all_keys()
            * each take a pointer to such a function, e.g. zone_key_create() calls
            * zsd_apply_all_zones(zsd_apply_create, key).
            */
 393  393  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 394  394  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 395  395  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 396  396  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 397  397      zone_key_t);
 398  398  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 399  399  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 400  400      kmutex_t *);
 401  401  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  
 404  404  /*
 405  405   * Bump this number when you alter the zone syscall interfaces; this is
 406  406   * because we need to have support for previous API versions in libc
 407  407   * to support patching; libc calls into the kernel to determine this number.
 408  408   *
 409  409   * Version 1 of the API is the version originally shipped with Solaris 10
 410  410   * Version 2 alters the zone_create system call in order to support more
 411  411   *     arguments by moving the args into a structure; and to do better
 412  412   *     error reporting when zone_create() fails.
 413  413   * Version 3 alters the zone_create system call in order to support the
 414  414   *     import of ZFS datasets to zones.
 415  415   * Version 4 alters the zone_create system call in order to support
 416  416   *     Trusted Extensions.
 417  417   * Version 5 alters the zone_boot system call, and converts its old
 418  418   *     bootargs parameter to be set by the zone_setattr API instead.
 419  419   * Version 6 adds the flag argument to zone_create.
 420  420   */
 421  421  static const int ZONE_SYSCALL_API_VERSION = 6;
 422  422  
 423  423  /*
 424  424   * Certain filesystems (such as NFS and autofs) need to know which zone
 425  425   * the mount is being placed in.  Because of this, we need to be able to
 426  426   * ensure that a zone isn't in the process of being created/destroyed such
 427  427   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 428  428   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 429  429   * mount list. Since a zone can't reside on an NFS file system, we don't
 430  430   * have to worry about the zonepath itself.
 431  431   *
 432  432   * The following functions: block_mounts()/resume_mounts() and
 433  433   * mount_in_progress()/mount_completed() are used by zones and the VFS
 434  434   * layer (respectively) to synchronize zone state transitions and new
 435  435   * mounts within a zone. This synchronization is on a per-zone basis, so
 436  436   * activity for one zone will not interfere with activity for another zone.
 437  437   *
 438  438   * The semantics are like a reader-reader lock such that there may
 439  439   * either be multiple mounts (or zone state transitions, if that weren't
 440  440   * serialized by zonehash_lock) in progress at the same time, but not
 441  441   * both.
 442  442   *
 443  443   * We use cv's so the user can ctrl-C out of the operation if it's
 444  444   * taking too long.
 445  445   *
 446  446   * The semantics are such that there is unfair bias towards the
 447  447   * "current" operation.  This means that zone halt may starve if
 448  448   * there is a rapid succession of new mounts coming in to the zone.
 449  449   */
 450  450  /*
 451  451   * Prevent new mounts from progressing to the point of calling
 452  452   * VFS_MOUNT().  If there are already mounts in this "region", wait for
 453  453   * them to complete.
 454  454   */
 455  455  static int
 456  456  block_mounts(zone_t *zp)
 457  457  {
 458  458          int retval = 0;
 459  459  
 460  460          /*
 461  461           * Since it may block for a long time, block_mounts() shouldn't be
 462  462           * called with zonehash_lock held.
 463  463           */
 464  464          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 465  465          mutex_enter(&zp->zone_mount_lock);
 466  466          while (zp->zone_mounts_in_progress > 0) {
 467  467                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 468  468                          goto signaled;
 469  469          }
 470  470          /*
 471  471           * A negative value of mounts_in_progress indicates that mounts
 472  472           * have been blocked by (-mounts_in_progress) different callers
 473  473           * (remotely possible if two threads enter zone_shutdown at the same
 474  474           * time).
 475  475           */
 476  476          zp->zone_mounts_in_progress--;
 477  477          retval = 1;
 478  478  signaled:
 479  479          mutex_exit(&zp->zone_mount_lock);
 480  480          return (retval);
 481  481  }
 482  482  
 483  483  /*
 484  484   * The VFS layer may progress with new mounts as far as we're concerned.
 485  485   * Allow them to progress if we were the last obstacle.
 486  486   */
 487  487  static void
 488  488  resume_mounts(zone_t *zp)
 489  489  {
 490  490          mutex_enter(&zp->zone_mount_lock);
 491  491          if (++zp->zone_mounts_in_progress == 0)
 492  492                  cv_broadcast(&zp->zone_mount_cv);
 493  493          mutex_exit(&zp->zone_mount_lock);
 494  494  }
 495  495  
 496  496  /*
 497  497   * The VFS layer is busy with a mount; this zone should wait until all
 498  498   * of its mounts are completed to progress.
 499  499   */
 500  500  void
 501  501  mount_in_progress(zone_t *zp)
 502  502  {
 503  503          mutex_enter(&zp->zone_mount_lock);
 504  504          while (zp->zone_mounts_in_progress < 0)
 505  505                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 506  506          zp->zone_mounts_in_progress++;
 507  507          mutex_exit(&zp->zone_mount_lock);
 508  508  }
 509  509  
 510  510  /*
 511  511   * VFS is done with one mount; wake up any waiting block_mounts()
 512  512   * callers if this is the last mount.
 513  513   */
 514  514  void
 515  515  mount_completed(zone_t *zp)
 516  516  {
 517  517          mutex_enter(&zp->zone_mount_lock);
 518  518          if (--zp->zone_mounts_in_progress == 0)
 519  519                  cv_broadcast(&zp->zone_mount_cv);
 520  520          mutex_exit(&zp->zone_mount_lock);
 521  521  }
 522  522  
 523  523  /*
 524  524   * ZSD routines.
 525  525   *
 526  526   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 527  527   * defined by the pthread_key_create() and related interfaces.
 528  528   *
 529  529   * Kernel subsystems may register one or more data items and/or
 530  530   * callbacks to be executed when a zone is created, shutdown, or
 531  531   * destroyed.
 532  532   *
 533  533   * Unlike the thread counterpart, destructor callbacks will be executed
 534  534   * even if the data pointer is NULL and/or there are no constructor
 535  535   * callbacks, so it is the responsibility of such callbacks to check for
 536  536   * NULL data values if necessary.
 537  537   *
 538  538   * The locking strategy and overall picture is as follows:
 539  539   *
 540  540   * When someone calls zone_key_create(), a template ZSD entry is added to the
 541  541   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 542  542   * holding that lock all the existing zones are marked as
 543  543   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 544  544   * zone_zsd list (protected by zone_lock). The global list is updated first
 545  545   * (under zsd_key_lock) to make sure that newly created zones use the
 546  546   * most recent list of keys. Then under zonehash_lock we walk the zones
 547  547   * and mark them.  Similar locking is used in zone_key_delete().
 548  548   *
 549  549   * The actual create, shutdown, and destroy callbacks are done without
 550  550   * holding any lock. And zsd_flags are used to ensure that the operations
 551  551   * completed so that when zone_key_create (and zone_create) is done, as well as
 552  552   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 553  553   * are completed.
 554  554   *
 555  555   * When new zones are created constructor callbacks for all registered ZSD
 556  556   * entries will be called. That also uses the above two phases of marking
 557  557   * what needs to be done, and then running the callbacks without holding
 558  558   * any locks.
 559  559   *
 560  560   * The framework does not provide any locking around zone_getspecific() and
 561  561   * zone_setspecific() apart from that needed for internal consistency, so
 562  562   * callers interested in atomic "test-and-set" semantics will need to provide
 563  563   * their own locking.
 564  564   */
 565  565  
 566  566  /*
 567  567   * Helper function to find the zsd_entry associated with the key in the
 568  568   * given list.
 569  569   */
 570  570  static struct zsd_entry *
 571  571  zsd_find(list_t *l, zone_key_t key)
 572  572  {
 573  573          struct zsd_entry *zsd;
 574  574  
 575  575          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 576  576                  if (zsd->zsd_key == key) {
 577  577                          return (zsd);
 578  578                  }
 579  579          }
 580  580          return (NULL);
 581  581  }
 582  582  
 583  583  /*
 584  584   * Helper function to find the zsd_entry associated with the key in the
 585  585   * given list. Move it to the front of the list.
 586  586   */
 587  587  static struct zsd_entry *
 588  588  zsd_find_mru(list_t *l, zone_key_t key)
 589  589  {
 590  590          struct zsd_entry *zsd;
 591  591  
 592  592          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 593  593                  if (zsd->zsd_key == key) {
 594  594                          /*
 595  595                           * Move to head of list to keep list in MRU order.
 596  596                           */
 597  597                          if (zsd != list_head(l)) {
 598  598                                  list_remove(l, zsd);
 599  599                                  list_insert_head(l, zsd);
 600  600                          }
 601  601                          return (zsd);
 602  602                  }
 603  603          }
 604  604          return (NULL);
 605  605  }
 606  606  
 607  607  void
 608  608  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 609  609      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 610  610  {
 611  611          struct zsd_entry *zsdp;
 612  612          struct zsd_entry *t;
 613  613          struct zone *zone;
 614  614          zone_key_t  key;
 615  615  
 616  616          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 617  617          zsdp->zsd_data = NULL;
 618  618          zsdp->zsd_create = create;
 619  619          zsdp->zsd_shutdown = shutdown;
 620  620          zsdp->zsd_destroy = destroy;
 621  621  
 622  622          /*
 623  623           * Insert in global list of callbacks. Makes future zone creations
 624  624           * see it.
 625  625           */
 626  626          mutex_enter(&zsd_key_lock);
 627  627          key = zsdp->zsd_key = ++zsd_keyval;
 628  628          ASSERT(zsd_keyval != 0);
 629  629          list_insert_tail(&zsd_registered_keys, zsdp);
 630  630          mutex_exit(&zsd_key_lock);
 631  631  
 632  632          /*
 633  633           * Insert for all existing zones and mark them as needing
 634  634           * a create callback.
 635  635           */
 636  636          mutex_enter(&zonehash_lock);    /* stop the world */
 637  637          for (zone = list_head(&zone_active); zone != NULL;
 638  638              zone = list_next(&zone_active, zone)) {
 639  639                  zone_status_t status;
 640  640  
 641  641                  mutex_enter(&zone->zone_lock);
 642  642  
 643  643                  /* Skip zones that are on the way down or not yet up */
 644  644                  status = zone_status_get(zone);
 645  645                  if (status >= ZONE_IS_DOWN ||
 646  646                      status == ZONE_IS_UNINITIALIZED) {
 647  647                          mutex_exit(&zone->zone_lock);
 648  648                          continue;
 649  649                  }
 650  650  
 651  651                  t = zsd_find_mru(&zone->zone_zsd, key);
 652  652                  if (t != NULL) {
 653  653                          /*
 654  654                           * A zsd_configure already inserted it after
 655  655                           * we dropped zsd_key_lock above.
 656  656                           */
 657  657                          mutex_exit(&zone->zone_lock);
 658  658                          continue;
 659  659                  }
 660  660                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 661  661                  t->zsd_key = key;
 662  662                  t->zsd_create = create;
 663  663                  t->zsd_shutdown = shutdown;
 664  664                  t->zsd_destroy = destroy;
 665  665                  if (create != NULL) {
 666  666                          t->zsd_flags = ZSD_CREATE_NEEDED;
 667  667                          DTRACE_PROBE2(zsd__create__needed,
 668  668                              zone_t *, zone, zone_key_t, key);
 669  669                  }
 670  670                  list_insert_tail(&zone->zone_zsd, t);
 671  671                  mutex_exit(&zone->zone_lock);
 672  672          }
 673  673          mutex_exit(&zonehash_lock);
 674  674  
 675  675          if (create != NULL) {
 676  676                  /* Now call the create callback for this key */
 677  677                  zsd_apply_all_zones(zsd_apply_create, key);
 678  678          }
 679  679          /*
 680  680           * It is safe for consumers to use the key now, make it
 681  681           * globally visible. Specifically zone_getspecific() will
 682  682           * always successfully return the zone specific data associated
 683  683           * with the key.
 684  684           */
 685  685          *keyp = key;
 686  686  
 687  687  }
 688  688  
 689  689  /*
 690  690   * Function called when a module is being unloaded, or otherwise wishes
 691  691   * to unregister its ZSD key and callbacks.
 692  692   *
 693  693   * Remove from the global list and determine the functions that need to
 694  694   * be called under a global lock. Then call the functions without
 695  695   * holding any locks. Finally free up the zone_zsd entries. (The apply
 696  696   * functions need to access the zone_zsd entries to find zsd_data etc.)
 697  697   */
 698  698  int
 699  699  zone_key_delete(zone_key_t key)
 700  700  {
 701  701          struct zsd_entry *zsdp = NULL;
 702  702          zone_t *zone;
 703  703  
 704  704          mutex_enter(&zsd_key_lock);
 705  705          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 706  706          if (zsdp == NULL) {
 707  707                  mutex_exit(&zsd_key_lock);
 708  708                  return (-1);
 709  709          }
 710  710          list_remove(&zsd_registered_keys, zsdp);
 711  711          mutex_exit(&zsd_key_lock);
 712  712  
 713  713          mutex_enter(&zonehash_lock);
 714  714          for (zone = list_head(&zone_active); zone != NULL;
 715  715              zone = list_next(&zone_active, zone)) {
 716  716                  struct zsd_entry *del;
 717  717  
 718  718                  mutex_enter(&zone->zone_lock);
 719  719                  del = zsd_find_mru(&zone->zone_zsd, key);
 720  720                  if (del == NULL) {
 721  721                          /*
 722  722                           * Somebody else got here first e.g the zone going
 723  723                           * away.
 724  724                           */
 725  725                          mutex_exit(&zone->zone_lock);
 726  726                          continue;
 727  727                  }
 728  728                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 729  729                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 730  730                  if (del->zsd_shutdown != NULL &&
 731  731                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 732  732                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 733  733                          DTRACE_PROBE2(zsd__shutdown__needed,
 734  734                              zone_t *, zone, zone_key_t, key);
 735  735                  }
 736  736                  if (del->zsd_destroy != NULL &&
 737  737                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 738  738                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 739  739                          DTRACE_PROBE2(zsd__destroy__needed,
 740  740                              zone_t *, zone, zone_key_t, key);
 741  741                  }
 742  742                  mutex_exit(&zone->zone_lock);
 743  743          }
 744  744          mutex_exit(&zonehash_lock);
 745  745          kmem_free(zsdp, sizeof (*zsdp));
 746  746  
 747  747          /* Now call the shutdown and destroy callback for this key */
 748  748          zsd_apply_all_zones(zsd_apply_shutdown, key);
 749  749          zsd_apply_all_zones(zsd_apply_destroy, key);
 750  750  
 751  751          /* Now we can free up the zsdp structures in each zone */
 752  752          mutex_enter(&zonehash_lock);
 753  753          for (zone = list_head(&zone_active); zone != NULL;
 754  754              zone = list_next(&zone_active, zone)) {
 755  755                  struct zsd_entry *del;
 756  756  
 757  757                  mutex_enter(&zone->zone_lock);
 758  758                  del = zsd_find(&zone->zone_zsd, key);
 759  759                  if (del != NULL) {
 760  760                          list_remove(&zone->zone_zsd, del);
 761  761                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 762  762                          kmem_free(del, sizeof (*del));
 763  763                  }
 764  764                  mutex_exit(&zone->zone_lock);
 765  765          }
 766  766          mutex_exit(&zonehash_lock);
 767  767  
 768  768          return (0);
 769  769  }
 770  770  
 771  771  /*
 772  772   * ZSD counterpart of pthread_setspecific().
 773  773   *
 774  774   * Since all zsd callbacks, including those with no create function,
 775  775   * have an entry in zone_zsd, if the key is registered it is part of
 776  776   * the zone_zsd list.
 777  777   * Return an error if the key wasn't registerd.
 778  778   */
 779  779  int
 780  780  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 781  781  {
 782  782          struct zsd_entry *t;
 783  783  
 784  784          mutex_enter(&zone->zone_lock);
 785  785          t = zsd_find_mru(&zone->zone_zsd, key);
 786  786          if (t != NULL) {
 787  787                  /*
 788  788                   * Replace old value with new
 789  789                   */
 790  790                  t->zsd_data = (void *)data;
 791  791                  mutex_exit(&zone->zone_lock);
 792  792                  return (0);
 793  793          }
 794  794          mutex_exit(&zone->zone_lock);
 795  795          return (-1);
 796  796  }
 797  797  
 798  798  /*
 799  799   * ZSD counterpart of pthread_getspecific().
 800  800   */
 801  801  void *
 802  802  zone_getspecific(zone_key_t key, zone_t *zone)
 803  803  {
 804  804          struct zsd_entry *t;
 805  805          void *data;
 806  806  
 807  807          mutex_enter(&zone->zone_lock);
 808  808          t = zsd_find_mru(&zone->zone_zsd, key);
 809  809          data = (t == NULL ? NULL : t->zsd_data);
 810  810          mutex_exit(&zone->zone_lock);
 811  811          return (data);
 812  812  }
 813  813  
 814  814  /*
 815  815   * Function used to initialize a zone's list of ZSD callbacks and data
 816  816   * when the zone is being created.  The callbacks are initialized from
 817  817   * the template list (zsd_registered_keys). The constructor callback is
 818  818   * executed later (once the zone exists and with locks dropped).
 819  819   */
 820  820  static void
 821  821  zone_zsd_configure(zone_t *zone)
 822  822  {
 823  823          struct zsd_entry *zsdp;
 824  824          struct zsd_entry *t;
 825  825  
 826  826          ASSERT(MUTEX_HELD(&zonehash_lock));
 827  827          ASSERT(list_head(&zone->zone_zsd) == NULL);
 828  828          mutex_enter(&zone->zone_lock);
 829  829          mutex_enter(&zsd_key_lock);
 830  830          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 831  831              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 832  832                  /*
 833  833                   * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 834  834                   * should not have added anything to it.
 835  835                   */
 836  836                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 837  837  
 838  838                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 839  839                  t->zsd_key = zsdp->zsd_key;
 840  840                  t->zsd_create = zsdp->zsd_create;
 841  841                  t->zsd_shutdown = zsdp->zsd_shutdown;
 842  842                  t->zsd_destroy = zsdp->zsd_destroy;
 843  843                  if (zsdp->zsd_create != NULL) {
 844  844                          t->zsd_flags = ZSD_CREATE_NEEDED;
 845  845                          DTRACE_PROBE2(zsd__create__needed,
 846  846                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 847  847                  }
 848  848                  list_insert_tail(&zone->zone_zsd, t);
 849  849          }
 850  850          mutex_exit(&zsd_key_lock);
 851  851          mutex_exit(&zone->zone_lock);
 852  852  }
 853  853  
 854  854  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 855  855  
 856  856  /*
 857  857   * Helper function to execute shutdown or destructor callbacks.
 858  858   */
 859  859  static void
 860  860  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 861  861  {
 862  862          struct zsd_entry *t;
 863  863  
 864  864          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 865  865          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 866  866          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 867  867  
 868  868          /*
 869  869           * Run the callback solely based on what is registered for the zone
 870  870           * in zone_zsd. The global list can change independently of this
 871  871           * as keys are registered and unregistered and we don't register new
 872  872           * callbacks for a zone that is in the process of going away.
 873  873           */
 874  874          mutex_enter(&zone->zone_lock);
 875  875          for (t = list_head(&zone->zone_zsd); t != NULL;
 876  876              t = list_next(&zone->zone_zsd, t)) {
 877  877                  zone_key_t key = t->zsd_key;
 878  878  
 879  879                  /* Skip if no callbacks registered */
 880  880  
 881  881                  if (ct == ZSD_SHUTDOWN) {
 882  882                          if (t->zsd_shutdown != NULL &&
 883  883                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 884  884                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 885  885                                  DTRACE_PROBE2(zsd__shutdown__needed,
 886  886                                      zone_t *, zone, zone_key_t, key);
 887  887                          }
 888  888                  } else {
 889  889                          if (t->zsd_destroy != NULL &&
 890  890                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 891  891                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 892  892                                  DTRACE_PROBE2(zsd__destroy__needed,
 893  893                                      zone_t *, zone, zone_key_t, key);
 894  894                          }
 895  895                  }
 896  896          }
 897  897          mutex_exit(&zone->zone_lock);
 898  898  
 899  899          /* Now call the shutdown and destroy callback for this key */
 900  900          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 901  901          zsd_apply_all_keys(zsd_apply_destroy, zone);
 902  902  
 903  903  }
 904  904  
 905  905  /*
 906  906   * Called when the zone is going away; free ZSD-related memory, and
 907  907   * destroy the zone_zsd list.
 908  908   */
 909  909  static void
 910  910  zone_free_zsd(zone_t *zone)
 911  911  {
 912  912          struct zsd_entry *t, *next;
 913  913  
 914  914          /*
 915  915           * Free all the zsd_entry's we had on this zone.
 916  916           */
 917  917          mutex_enter(&zone->zone_lock);
 918  918          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 919  919                  next = list_next(&zone->zone_zsd, t);
 920  920                  list_remove(&zone->zone_zsd, t);
 921  921                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 922  922                  kmem_free(t, sizeof (*t));
 923  923          }
 924  924          list_destroy(&zone->zone_zsd);
 925  925          mutex_exit(&zone->zone_lock);
 926  926  
 927  927  }
 928  928  
 929  929  /*
 930  930   * Apply a function to all zones for particular key value.
 931  931   *
 932  932   * The applyfn has to drop zonehash_lock if it does some work, and
 933  933   * then reacquire it before it returns.
 934  934   * When the lock is dropped we don't follow list_next even
 935  935   * if it is possible to do so without any hazards. This is
 936  936   * because we want the design to allow for the list of zones
 937  937   * to change in any arbitrary way during the time the
 938  938   * lock was dropped.
 939  939   *
 940  940   * It is safe to restart the loop at list_head since the applyfn
 941  941   * changes the zsd_flags as it does work, so a subsequent
 942  942   * pass through will have no effect in applyfn, hence the loop will terminate
 943  943   * in at worst O(N^2).
 944  944   */
 945  945  static void
 946  946  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 947  947  {
 948  948          zone_t *zone;
 949  949  
 950  950          mutex_enter(&zonehash_lock);
 951  951          zone = list_head(&zone_active);
 952  952          while (zone != NULL) {
 953  953                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 954  954                          /* Lock dropped - restart at head */
 955  955                          zone = list_head(&zone_active);
 956  956                  } else {
 957  957                          zone = list_next(&zone_active, zone);
 958  958                  }
 959  959          }
 960  960          mutex_exit(&zonehash_lock);
 961  961  }
 962  962  
 963  963  /*
 964  964   * Apply a function to all keys for a particular zone.
 965  965   *
 966  966   * The applyfn has to drop zonehash_lock if it does some work, and
 967  967   * then reacquire it before it returns.
 968  968   * When the lock is dropped we don't follow list_next even
 969  969   * if it is possible to do so without any hazards. This is
 970  970   * because we want the design to allow for the list of zsd callbacks
 971  971   * to change in any arbitrary way during the time the
 972  972   * lock was dropped.
 973  973   *
 974  974   * It is safe to restart the loop at list_head since the applyfn
 975  975   * changes the zsd_flags as it does work, so a subsequent
 976  976   * pass through will have no effect in applyfn, hence the loop will terminate
 977  977   * in at worst O(N^2).
 978  978   */
 979  979  static void
 980  980  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 981  981  {
 982  982          struct zsd_entry *t;
 983  983  
 984  984          mutex_enter(&zone->zone_lock);
 985  985          t = list_head(&zone->zone_zsd);
 986  986          while (t != NULL) {
 987  987                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 988  988                          /* Lock dropped - restart at head */
 989  989                          t = list_head(&zone->zone_zsd);
 990  990                  } else {
 991  991                          t = list_next(&zone->zone_zsd, t);
 992  992                  }
 993  993          }
 994  994          mutex_exit(&zone->zone_lock);
 995  995  }
 996  996  
 997  997  /*
 998  998   * Call the create function for the zone and key if CREATE_NEEDED
 999  999   * is set.
1000 1000   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1001 1001   * we wait for that thread to complete so that we can ensure that
1002 1002   * all the callbacks are done when we've looped over all zones/keys.
1003 1003   *
1004 1004   * When we call the create function, we drop the global held by the
1005 1005   * caller, and return true to tell the caller it needs to re-evalute the
1006 1006   * state.
1007 1007   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1008 1008   * remains held on exit.
1009 1009   */
1010 1010  static boolean_t
1011 1011  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1012 1012      zone_t *zone, zone_key_t key)
1013 1013  {
1014 1014          void *result;
1015 1015          struct zsd_entry *t;
1016 1016          boolean_t dropped;
1017 1017  
1018 1018          if (lockp != NULL) {
1019 1019                  ASSERT(MUTEX_HELD(lockp));
1020 1020          }
1021 1021          if (zone_lock_held) {
1022 1022                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1023 1023          } else {
1024 1024                  mutex_enter(&zone->zone_lock);
1025 1025          }
1026 1026  
1027 1027          t = zsd_find(&zone->zone_zsd, key);
1028 1028          if (t == NULL) {
1029 1029                  /*
1030 1030                   * Somebody else got here first e.g the zone going
1031 1031                   * away.
1032 1032                   */
1033 1033                  if (!zone_lock_held)
1034 1034                          mutex_exit(&zone->zone_lock);
1035 1035                  return (B_FALSE);
1036 1036          }
1037 1037          dropped = B_FALSE;
1038 1038          if (zsd_wait_for_inprogress(zone, t, lockp))
1039 1039                  dropped = B_TRUE;
1040 1040  
1041 1041          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1042 1042                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1043 1043                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1044 1044                  DTRACE_PROBE2(zsd__create__inprogress,
1045 1045                      zone_t *, zone, zone_key_t, key);
1046 1046                  mutex_exit(&zone->zone_lock);
1047 1047                  if (lockp != NULL)
1048 1048                          mutex_exit(lockp);
1049 1049  
1050 1050                  dropped = B_TRUE;
1051 1051                  ASSERT(t->zsd_create != NULL);
1052 1052                  DTRACE_PROBE2(zsd__create__start,
1053 1053                      zone_t *, zone, zone_key_t, key);
1054 1054  
1055 1055                  result = (*t->zsd_create)(zone->zone_id);
1056 1056  
1057 1057                  DTRACE_PROBE2(zsd__create__end,
1058 1058                      zone_t *, zone, voidn *, result);
1059 1059  
1060 1060                  ASSERT(result != NULL);
1061 1061                  if (lockp != NULL)
1062 1062                          mutex_enter(lockp);
1063 1063                  mutex_enter(&zone->zone_lock);
1064 1064                  t->zsd_data = result;
1065 1065                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1066 1066                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
1067 1067                  cv_broadcast(&t->zsd_cv);
1068 1068                  DTRACE_PROBE2(zsd__create__completed,
1069 1069                      zone_t *, zone, zone_key_t, key);
1070 1070          }
1071 1071          if (!zone_lock_held)
1072 1072                  mutex_exit(&zone->zone_lock);
1073 1073          return (dropped);
1074 1074  }
1075 1075  
1076 1076  /*
1077 1077   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1078 1078   * is set.
1079 1079   * If some other thread gets here first and sets *_INPROGRESS, then
1080 1080   * we wait for that thread to complete so that we can ensure that
1081 1081   * all the callbacks are done when we've looped over all zones/keys.
1082 1082   *
1083 1083   * When we call the shutdown function, we drop the global held by the
1084 1084   * caller, and return true to tell the caller it needs to re-evalute the
1085 1085   * state.
1086 1086   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1087 1087   * remains held on exit.
1088 1088   */
1089 1089  static boolean_t
1090 1090  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1091 1091      zone_t *zone, zone_key_t key)
1092 1092  {
1093 1093          struct zsd_entry *t;
1094 1094          void *data;
1095 1095          boolean_t dropped;
1096 1096  
1097 1097          if (lockp != NULL) {
1098 1098                  ASSERT(MUTEX_HELD(lockp));
1099 1099          }
1100 1100          if (zone_lock_held) {
1101 1101                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1102 1102          } else {
1103 1103                  mutex_enter(&zone->zone_lock);
1104 1104          }
1105 1105  
1106 1106          t = zsd_find(&zone->zone_zsd, key);
1107 1107          if (t == NULL) {
1108 1108                  /*
1109 1109                   * Somebody else got here first e.g the zone going
1110 1110                   * away.
1111 1111                   */
1112 1112                  if (!zone_lock_held)
1113 1113                          mutex_exit(&zone->zone_lock);
1114 1114                  return (B_FALSE);
1115 1115          }
1116 1116          dropped = B_FALSE;
1117 1117          if (zsd_wait_for_creator(zone, t, lockp))
1118 1118                  dropped = B_TRUE;
1119 1119  
1120 1120          if (zsd_wait_for_inprogress(zone, t, lockp))
1121 1121                  dropped = B_TRUE;
1122 1122  
1123 1123          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1124 1124                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1125 1125                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1126 1126                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1127 1127                      zone_t *, zone, zone_key_t, key);
1128 1128                  mutex_exit(&zone->zone_lock);
1129 1129                  if (lockp != NULL)
1130 1130                          mutex_exit(lockp);
1131 1131                  dropped = B_TRUE;
1132 1132  
1133 1133                  ASSERT(t->zsd_shutdown != NULL);
1134 1134                  data = t->zsd_data;
1135 1135  
1136 1136                  DTRACE_PROBE2(zsd__shutdown__start,
1137 1137                      zone_t *, zone, zone_key_t, key);
1138 1138  
1139 1139                  (t->zsd_shutdown)(zone->zone_id, data);
1140 1140                  DTRACE_PROBE2(zsd__shutdown__end,
1141 1141                      zone_t *, zone, zone_key_t, key);
1142 1142  
1143 1143                  if (lockp != NULL)
1144 1144                          mutex_enter(lockp);
1145 1145                  mutex_enter(&zone->zone_lock);
1146 1146                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1147 1147                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1148 1148                  cv_broadcast(&t->zsd_cv);
1149 1149                  DTRACE_PROBE2(zsd__shutdown__completed,
1150 1150                      zone_t *, zone, zone_key_t, key);
1151 1151          }
1152 1152          if (!zone_lock_held)
1153 1153                  mutex_exit(&zone->zone_lock);
1154 1154          return (dropped);
1155 1155  }
1156 1156  
1157 1157  /*
1158 1158   * Call the destroy function for the zone and key if DESTROY_NEEDED
1159 1159   * is set.
1160 1160   * If some other thread gets here first and sets *_INPROGRESS, then
1161 1161   * we wait for that thread to complete so that we can ensure that
1162 1162   * all the callbacks are done when we've looped over all zones/keys.
1163 1163   *
1164 1164   * When we call the destroy function, we drop the global held by the
1165 1165   * caller, and return true to tell the caller it needs to re-evalute the
1166 1166   * state.
1167 1167   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1168 1168   * remains held on exit.
1169 1169   */
1170 1170  static boolean_t
1171 1171  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1172 1172      zone_t *zone, zone_key_t key)
1173 1173  {
1174 1174          struct zsd_entry *t;
1175 1175          void *data;
1176 1176          boolean_t dropped;
1177 1177  
1178 1178          if (lockp != NULL) {
1179 1179                  ASSERT(MUTEX_HELD(lockp));
1180 1180          }
1181 1181          if (zone_lock_held) {
1182 1182                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1183 1183          } else {
1184 1184                  mutex_enter(&zone->zone_lock);
1185 1185          }
1186 1186  
1187 1187          t = zsd_find(&zone->zone_zsd, key);
1188 1188          if (t == NULL) {
1189 1189                  /*
1190 1190                   * Somebody else got here first e.g the zone going
1191 1191                   * away.
1192 1192                   */
1193 1193                  if (!zone_lock_held)
1194 1194                          mutex_exit(&zone->zone_lock);
1195 1195                  return (B_FALSE);
1196 1196          }
1197 1197          dropped = B_FALSE;
1198 1198          if (zsd_wait_for_creator(zone, t, lockp))
1199 1199                  dropped = B_TRUE;
1200 1200  
1201 1201          if (zsd_wait_for_inprogress(zone, t, lockp))
1202 1202                  dropped = B_TRUE;
1203 1203  
1204 1204          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1205 1205                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1206 1206                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1207 1207                  DTRACE_PROBE2(zsd__destroy__inprogress,
1208 1208                      zone_t *, zone, zone_key_t, key);
1209 1209                  mutex_exit(&zone->zone_lock);
1210 1210                  if (lockp != NULL)
1211 1211                          mutex_exit(lockp);
1212 1212                  dropped = B_TRUE;
1213 1213  
1214 1214                  ASSERT(t->zsd_destroy != NULL);
1215 1215                  data = t->zsd_data;
1216 1216                  DTRACE_PROBE2(zsd__destroy__start,
1217 1217                      zone_t *, zone, zone_key_t, key);
1218 1218  
1219 1219                  (t->zsd_destroy)(zone->zone_id, data);
1220 1220                  DTRACE_PROBE2(zsd__destroy__end,
1221 1221                      zone_t *, zone, zone_key_t, key);
1222 1222  
1223 1223                  if (lockp != NULL)
1224 1224                          mutex_enter(lockp);
1225 1225                  mutex_enter(&zone->zone_lock);
1226 1226                  t->zsd_data = NULL;
1227 1227                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1228 1228                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1229 1229                  cv_broadcast(&t->zsd_cv);
1230 1230                  DTRACE_PROBE2(zsd__destroy__completed,
1231 1231                      zone_t *, zone, zone_key_t, key);
1232 1232          }
1233 1233          if (!zone_lock_held)
1234 1234                  mutex_exit(&zone->zone_lock);
1235 1235          return (dropped);
1236 1236  }
1237 1237  
1238 1238  /*
1239 1239   * Wait for any CREATE_NEEDED flag to be cleared.
1240 1240   * Returns true if lockp was temporarily dropped while waiting.
1241 1241   */
1242 1242  static boolean_t
1243 1243  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1244 1244  {
1245 1245          boolean_t dropped = B_FALSE;
1246 1246  
1247 1247          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1248 1248                  DTRACE_PROBE2(zsd__wait__for__creator,
1249 1249                      zone_t *, zone, struct zsd_entry *, t);
1250 1250                  if (lockp != NULL) {
1251 1251                          dropped = B_TRUE;
1252 1252                          mutex_exit(lockp);
1253 1253                  }
1254 1254                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1255 1255                  if (lockp != NULL) {
1256 1256                          /* First drop zone_lock to preserve order */
1257 1257                          mutex_exit(&zone->zone_lock);
1258 1258                          mutex_enter(lockp);
1259 1259                          mutex_enter(&zone->zone_lock);
1260 1260                  }
1261 1261          }
1262 1262          return (dropped);
1263 1263  }
1264 1264  
1265 1265  /*
1266 1266   * Wait for any INPROGRESS flag to be cleared.
1267 1267   * Returns true if lockp was temporarily dropped while waiting.
1268 1268   */
1269 1269  static boolean_t
1270 1270  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1271 1271  {
1272 1272          boolean_t dropped = B_FALSE;
1273 1273  
1274 1274          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1275 1275                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1276 1276                      zone_t *, zone, struct zsd_entry *, t);
1277 1277                  if (lockp != NULL) {
1278 1278                          dropped = B_TRUE;
1279 1279                          mutex_exit(lockp);
1280 1280                  }
1281 1281                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1282 1282                  if (lockp != NULL) {
1283 1283                          /* First drop zone_lock to preserve order */
1284 1284                          mutex_exit(&zone->zone_lock);
1285 1285                          mutex_enter(lockp);
1286 1286                          mutex_enter(&zone->zone_lock);
1287 1287                  }
1288 1288          }
1289 1289          return (dropped);
1290 1290  }
1291 1291  
1292 1292  /*
1293 1293   * Frees memory associated with the zone dataset list.
1294 1294   */
1295 1295  static void
1296 1296  zone_free_datasets(zone_t *zone)
1297 1297  {
1298 1298          zone_dataset_t *t, *next;
1299 1299  
1300 1300          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1301 1301                  next = list_next(&zone->zone_datasets, t);
1302 1302                  list_remove(&zone->zone_datasets, t);
1303 1303                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1304 1304                  kmem_free(t, sizeof (*t));
1305 1305          }
1306 1306          list_destroy(&zone->zone_datasets);
1307 1307  }
1308 1308  
1309 1309  /*
1310 1310   * zone.cpu-shares resource control support.
1311 1311   */
1312 1312  /*ARGSUSED*/
1313 1313  static rctl_qty_t
1314 1314  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1315 1315  {
1316 1316          ASSERT(MUTEX_HELD(&p->p_lock));
1317 1317          return (p->p_zone->zone_shares);
1318 1318  }
1319 1319  
1320 1320  /*ARGSUSED*/
1321 1321  static int
1322 1322  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1323 1323      rctl_qty_t nv)
1324 1324  {
1325 1325          ASSERT(MUTEX_HELD(&p->p_lock));
1326 1326          ASSERT(e->rcep_t == RCENTITY_ZONE);
1327 1327          if (e->rcep_p.zone == NULL)
1328 1328                  return (0);
1329 1329  
1330 1330          e->rcep_p.zone->zone_shares = nv;
1331 1331          return (0);
1332 1332  }
1333 1333  
1334 1334  static rctl_ops_t zone_cpu_shares_ops = {
1335 1335          rcop_no_action,
1336 1336          zone_cpu_shares_usage,
1337 1337          zone_cpu_shares_set,
1338 1338          rcop_no_test
1339 1339  };
1340 1340  
1341 1341  /*
1342 1342   * zone.cpu-cap resource control support.
1343 1343   */
1344 1344  /*ARGSUSED*/
1345 1345  static rctl_qty_t
1346 1346  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1347 1347  {
1348 1348          ASSERT(MUTEX_HELD(&p->p_lock));
1349 1349          return (cpucaps_zone_get(p->p_zone));
1350 1350  }
1351 1351  
1352 1352  /*ARGSUSED*/
1353 1353  static int
1354 1354  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1355 1355      rctl_qty_t nv)
1356 1356  {
1357 1357          zone_t *zone = e->rcep_p.zone;
1358 1358  
1359 1359          ASSERT(MUTEX_HELD(&p->p_lock));
1360 1360          ASSERT(e->rcep_t == RCENTITY_ZONE);
1361 1361  
1362 1362          if (zone == NULL)
1363 1363                  return (0);
1364 1364  
1365 1365          /*
1366 1366           * set cap to the new value.
1367 1367           */
1368 1368          return (cpucaps_zone_set(zone, nv));
1369 1369  }
1370 1370  
1371 1371  static rctl_ops_t zone_cpu_cap_ops = {
1372 1372          rcop_no_action,
1373 1373          zone_cpu_cap_get,
1374 1374          zone_cpu_cap_set,
1375 1375          rcop_no_test
1376 1376  };
1377 1377  
1378 1378  /*ARGSUSED*/
1379 1379  static rctl_qty_t
1380 1380  zone_lwps_usage(rctl_t *r, proc_t *p)
1381 1381  {
1382 1382          rctl_qty_t nlwps;
1383 1383          zone_t *zone = p->p_zone;
1384 1384  
1385 1385          ASSERT(MUTEX_HELD(&p->p_lock));
1386 1386  
1387 1387          mutex_enter(&zone->zone_nlwps_lock);
1388 1388          nlwps = zone->zone_nlwps;
1389 1389          mutex_exit(&zone->zone_nlwps_lock);
1390 1390  
1391 1391          return (nlwps);
1392 1392  }
1393 1393  
1394 1394  /*ARGSUSED*/
1395 1395  static int
1396 1396  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1397 1397      rctl_qty_t incr, uint_t flags)
1398 1398  {
1399 1399          rctl_qty_t nlwps;
1400 1400  
1401 1401          ASSERT(MUTEX_HELD(&p->p_lock));
1402 1402          ASSERT(e->rcep_t == RCENTITY_ZONE);
1403 1403          if (e->rcep_p.zone == NULL)
1404 1404                  return (0);
1405 1405          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1406 1406          nlwps = e->rcep_p.zone->zone_nlwps;
1407 1407  
1408 1408          if (nlwps + incr > rcntl->rcv_value)
1409 1409                  return (1);
1410 1410  
1411 1411          return (0);
1412 1412  }
1413 1413  
1414 1414  /*ARGSUSED*/
1415 1415  static int
1416 1416  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1417 1417  {
1418 1418          ASSERT(MUTEX_HELD(&p->p_lock));
1419 1419          ASSERT(e->rcep_t == RCENTITY_ZONE);
1420 1420          if (e->rcep_p.zone == NULL)
1421 1421                  return (0);
1422 1422          e->rcep_p.zone->zone_nlwps_ctl = nv;
1423 1423          return (0);
1424 1424  }
1425 1425  
1426 1426  static rctl_ops_t zone_lwps_ops = {
1427 1427          rcop_no_action,
1428 1428          zone_lwps_usage,
1429 1429          zone_lwps_set,
1430 1430          zone_lwps_test,
1431 1431  };
1432 1432  
1433 1433  /*ARGSUSED*/
1434 1434  static rctl_qty_t
1435 1435  zone_procs_usage(rctl_t *r, proc_t *p)
1436 1436  {
1437 1437          rctl_qty_t nprocs;
1438 1438          zone_t *zone = p->p_zone;
1439 1439  
1440 1440          ASSERT(MUTEX_HELD(&p->p_lock));
1441 1441  
1442 1442          mutex_enter(&zone->zone_nlwps_lock);
1443 1443          nprocs = zone->zone_nprocs;
1444 1444          mutex_exit(&zone->zone_nlwps_lock);
1445 1445  
1446 1446          return (nprocs);
1447 1447  }
1448 1448  
1449 1449  /*ARGSUSED*/
1450 1450  static int
1451 1451  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1452 1452      rctl_qty_t incr, uint_t flags)
1453 1453  {
1454 1454          rctl_qty_t nprocs;
1455 1455  
1456 1456          ASSERT(MUTEX_HELD(&p->p_lock));
1457 1457          ASSERT(e->rcep_t == RCENTITY_ZONE);
1458 1458          if (e->rcep_p.zone == NULL)
1459 1459                  return (0);
1460 1460          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1461 1461          nprocs = e->rcep_p.zone->zone_nprocs;
1462 1462  
1463 1463          if (nprocs + incr > rcntl->rcv_value)
1464 1464                  return (1);
1465 1465  
1466 1466          return (0);
1467 1467  }
1468 1468  
1469 1469  /*ARGSUSED*/
1470 1470  static int
1471 1471  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1472 1472  {
1473 1473          ASSERT(MUTEX_HELD(&p->p_lock));
1474 1474          ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 1475          if (e->rcep_p.zone == NULL)
1476 1476                  return (0);
1477 1477          e->rcep_p.zone->zone_nprocs_ctl = nv;
1478 1478          return (0);
1479 1479  }
1480 1480  
1481 1481  static rctl_ops_t zone_procs_ops = {
1482 1482          rcop_no_action,
1483 1483          zone_procs_usage,
1484 1484          zone_procs_set,
1485 1485          zone_procs_test,
1486 1486  };
1487 1487  
1488 1488  /*ARGSUSED*/
1489 1489  static int
1490 1490  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1491 1491      rctl_qty_t incr, uint_t flags)
1492 1492  {
1493 1493          rctl_qty_t v;
1494 1494          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1495          ASSERT(e->rcep_t == RCENTITY_ZONE);
1496 1496          v = e->rcep_p.zone->zone_shmmax + incr;
1497 1497          if (v > rval->rcv_value)
1498 1498                  return (1);
1499 1499          return (0);
1500 1500  }
1501 1501  
1502 1502  static rctl_ops_t zone_shmmax_ops = {
1503 1503          rcop_no_action,
1504 1504          rcop_no_usage,
1505 1505          rcop_no_set,
1506 1506          zone_shmmax_test
1507 1507  };
1508 1508  
1509 1509  /*ARGSUSED*/
1510 1510  static int
1511 1511  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1512 1512      rctl_qty_t incr, uint_t flags)
1513 1513  {
1514 1514          rctl_qty_t v;
1515 1515          ASSERT(MUTEX_HELD(&p->p_lock));
1516 1516          ASSERT(e->rcep_t == RCENTITY_ZONE);
1517 1517          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1518 1518          if (v > rval->rcv_value)
1519 1519                  return (1);
1520 1520          return (0);
1521 1521  }
1522 1522  
1523 1523  static rctl_ops_t zone_shmmni_ops = {
1524 1524          rcop_no_action,
1525 1525          rcop_no_usage,
1526 1526          rcop_no_set,
1527 1527          zone_shmmni_test
1528 1528  };
1529 1529  
1530 1530  /*ARGSUSED*/
1531 1531  static int
1532 1532  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1533 1533      rctl_qty_t incr, uint_t flags)
1534 1534  {
1535 1535          rctl_qty_t v;
1536 1536          ASSERT(MUTEX_HELD(&p->p_lock));
1537 1537          ASSERT(e->rcep_t == RCENTITY_ZONE);
1538 1538          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1539 1539          if (v > rval->rcv_value)
1540 1540                  return (1);
1541 1541          return (0);
1542 1542  }
1543 1543  
1544 1544  static rctl_ops_t zone_semmni_ops = {
1545 1545          rcop_no_action,
1546 1546          rcop_no_usage,
1547 1547          rcop_no_set,
1548 1548          zone_semmni_test
1549 1549  };
1550 1550  
1551 1551  /*ARGSUSED*/
1552 1552  static int
1553 1553  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1554 1554      rctl_qty_t incr, uint_t flags)
1555 1555  {
1556 1556          rctl_qty_t v;
1557 1557          ASSERT(MUTEX_HELD(&p->p_lock));
1558 1558          ASSERT(e->rcep_t == RCENTITY_ZONE);
1559 1559          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1560 1560          if (v > rval->rcv_value)
1561 1561                  return (1);
1562 1562          return (0);
1563 1563  }
1564 1564  
1565 1565  static rctl_ops_t zone_msgmni_ops = {
1566 1566          rcop_no_action,
1567 1567          rcop_no_usage,
1568 1568          rcop_no_set,
1569 1569          zone_msgmni_test
1570 1570  };
1571 1571  
1572 1572  /*ARGSUSED*/
1573 1573  static rctl_qty_t
1574 1574  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1575 1575  {
1576 1576          rctl_qty_t q;
1577 1577          ASSERT(MUTEX_HELD(&p->p_lock));
1578 1578          mutex_enter(&p->p_zone->zone_mem_lock);
1579 1579          q = p->p_zone->zone_locked_mem;
1580 1580          mutex_exit(&p->p_zone->zone_mem_lock);
1581 1581          return (q);
1582 1582  }
1583 1583  
1584 1584  /*ARGSUSED*/
1585 1585  static int
1586 1586  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1587 1587      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1588 1588  {
1589 1589          rctl_qty_t q;
1590 1590          zone_t *z;
1591 1591  
1592 1592          z = e->rcep_p.zone;
1593 1593          ASSERT(MUTEX_HELD(&p->p_lock));
1594 1594          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1595 1595          q = z->zone_locked_mem;
1596 1596          if (q + incr > rcntl->rcv_value)
1597 1597                  return (1);
1598 1598          return (0);
1599 1599  }
1600 1600  
1601 1601  /*ARGSUSED*/
1602 1602  static int
1603 1603  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1604 1604      rctl_qty_t nv)
1605 1605  {
1606 1606          ASSERT(MUTEX_HELD(&p->p_lock));
1607 1607          ASSERT(e->rcep_t == RCENTITY_ZONE);
1608 1608          if (e->rcep_p.zone == NULL)
1609 1609                  return (0);
1610 1610          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1611 1611          return (0);
1612 1612  }
1613 1613  
1614 1614  static rctl_ops_t zone_locked_mem_ops = {
1615 1615          rcop_no_action,
1616 1616          zone_locked_mem_usage,
1617 1617          zone_locked_mem_set,
1618 1618          zone_locked_mem_test
1619 1619  };
1620 1620  
1621 1621  /*ARGSUSED*/
1622 1622  static rctl_qty_t
1623 1623  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1624 1624  {
1625 1625          rctl_qty_t q;
1626 1626          zone_t *z = p->p_zone;
1627 1627  
1628 1628          ASSERT(MUTEX_HELD(&p->p_lock));
1629 1629          mutex_enter(&z->zone_mem_lock);
1630 1630          q = z->zone_max_swap;
1631 1631          mutex_exit(&z->zone_mem_lock);
1632 1632          return (q);
1633 1633  }
1634 1634  
1635 1635  /*ARGSUSED*/
1636 1636  static int
1637 1637  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1638 1638      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1639 1639  {
1640 1640          rctl_qty_t q;
1641 1641          zone_t *z;
1642 1642  
1643 1643          z = e->rcep_p.zone;
1644 1644          ASSERT(MUTEX_HELD(&p->p_lock));
1645 1645          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1646 1646          q = z->zone_max_swap;
1647 1647          if (q + incr > rcntl->rcv_value)
1648 1648                  return (1);
1649 1649          return (0);
1650 1650  }
1651 1651  
1652 1652  /*ARGSUSED*/
1653 1653  static int
1654 1654  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1655 1655      rctl_qty_t nv)
1656 1656  {
1657 1657          ASSERT(MUTEX_HELD(&p->p_lock));
1658 1658          ASSERT(e->rcep_t == RCENTITY_ZONE);
1659 1659          if (e->rcep_p.zone == NULL)
1660 1660                  return (0);
1661 1661          e->rcep_p.zone->zone_max_swap_ctl = nv;
1662 1662          return (0);
1663 1663  }
1664 1664  
1665 1665  static rctl_ops_t zone_max_swap_ops = {
1666 1666          rcop_no_action,
1667 1667          zone_max_swap_usage,
1668 1668          zone_max_swap_set,
1669 1669          zone_max_swap_test
1670 1670  };
1671 1671  
1672 1672  /*ARGSUSED*/
1673 1673  static rctl_qty_t
1674 1674  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1675 1675  {
1676 1676          rctl_qty_t q;
1677 1677          zone_t *z = p->p_zone;
1678 1678  
1679 1679          ASSERT(MUTEX_HELD(&p->p_lock));
1680 1680          mutex_enter(&z->zone_rctl_lock);
1681 1681          q = z->zone_max_lofi;
1682 1682          mutex_exit(&z->zone_rctl_lock);
1683 1683          return (q);
1684 1684  }
1685 1685  
1686 1686  /*ARGSUSED*/
1687 1687  static int
1688 1688  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1689 1689      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1690 1690  {
1691 1691          rctl_qty_t q;
1692 1692          zone_t *z;
1693 1693  
1694 1694          z = e->rcep_p.zone;
1695 1695          ASSERT(MUTEX_HELD(&p->p_lock));
1696 1696          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1697 1697          q = z->zone_max_lofi;
1698 1698          if (q + incr > rcntl->rcv_value)
1699 1699                  return (1);
1700 1700          return (0);
1701 1701  }
1702 1702  
1703 1703  /*ARGSUSED*/
1704 1704  static int
1705 1705  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1706 1706      rctl_qty_t nv)
1707 1707  {
1708 1708          ASSERT(MUTEX_HELD(&p->p_lock));
1709 1709          ASSERT(e->rcep_t == RCENTITY_ZONE);
1710 1710          if (e->rcep_p.zone == NULL)
1711 1711                  return (0);
1712 1712          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1713 1713          return (0);
1714 1714  }
1715 1715  
1716 1716  static rctl_ops_t zone_max_lofi_ops = {
1717 1717          rcop_no_action,
1718 1718          zone_max_lofi_usage,
1719 1719          zone_max_lofi_set,
1720 1720          zone_max_lofi_test
1721 1721  };
1722 1722  
1723 1723  /*
1724 1724   * Helper function to brand the zone with a unique ID.
1725 1725   */
1726 1726  static void
1727 1727  zone_uniqid(zone_t *zone)
1728 1728  {
1729 1729          static uint64_t uniqid = 0;
1730 1730  
1731 1731          ASSERT(MUTEX_HELD(&zonehash_lock));
1732 1732          zone->zone_uniqid = uniqid++;
1733 1733  }
1734 1734  
1735 1735  /*
1736 1736   * Returns a held pointer to the "kcred" for the specified zone.
1737 1737   */
1738 1738  struct cred *
1739 1739  zone_get_kcred(zoneid_t zoneid)
1740 1740  {
1741 1741          zone_t *zone;
1742 1742          cred_t *cr;
1743 1743  
1744 1744          if ((zone = zone_find_by_id(zoneid)) == NULL)
1745 1745                  return (NULL);
1746 1746          cr = zone->zone_kcred;
1747 1747          crhold(cr);
1748 1748          zone_rele(zone);
1749 1749          return (cr);
1750 1750  }
1751 1751  
1752 1752  static int
1753 1753  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1754 1754  {
1755 1755          zone_t *zone = ksp->ks_private;
1756 1756          zone_kstat_t *zk = ksp->ks_data;
1757 1757  
1758 1758          if (rw == KSTAT_WRITE)
1759 1759                  return (EACCES);
1760 1760  
1761 1761          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1762 1762          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1763 1763          return (0);
1764 1764  }
1765 1765  
1766 1766  static int
1767 1767  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1768 1768  {
1769 1769          zone_t *zone = ksp->ks_private;
1770 1770          zone_kstat_t *zk = ksp->ks_data;
1771 1771  
1772 1772          if (rw == KSTAT_WRITE)
1773 1773                  return (EACCES);
1774 1774  
1775 1775          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1776 1776          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1777 1777          return (0);
1778 1778  }
1779 1779  
1780 1780  static int
1781 1781  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1782 1782  {
1783 1783          zone_t *zone = ksp->ks_private;
1784 1784          zone_kstat_t *zk = ksp->ks_data;
1785 1785  
1786 1786          if (rw == KSTAT_WRITE)
1787 1787                  return (EACCES);
1788 1788  
1789 1789          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1790 1790          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1791 1791          return (0);
1792 1792  }
1793 1793  
1794 1794  static kstat_t *
1795 1795  zone_kstat_create_common(zone_t *zone, char *name,
1796 1796      int (*updatefunc) (kstat_t *, int))
1797 1797  {
1798 1798          kstat_t *ksp;
1799 1799          zone_kstat_t *zk;
1800 1800  
1801 1801          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1802 1802              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1803 1803              KSTAT_FLAG_VIRTUAL);
1804 1804  
1805 1805          if (ksp == NULL)
1806 1806                  return (NULL);
1807 1807  
1808 1808          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1809 1809          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1810 1810          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1811 1811          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1812 1812          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1813 1813          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1814 1814          ksp->ks_update = updatefunc;
1815 1815          ksp->ks_private = zone;
1816 1816          kstat_install(ksp);
1817 1817          return (ksp);
1818 1818  }
1819 1819  
1820 1820  static int
1821 1821  zone_misc_kstat_update(kstat_t *ksp, int rw)
1822 1822  {
1823 1823          zone_t *zone = ksp->ks_private;
1824 1824          zone_misc_kstat_t *zmp = ksp->ks_data;
1825 1825          hrtime_t tmp;
1826 1826  
1827 1827          if (rw == KSTAT_WRITE)
1828 1828                  return (EACCES);
1829 1829  
1830 1830          tmp = zone->zone_utime;
1831 1831          scalehrtime(&tmp);
1832 1832          zmp->zm_utime.value.ui64 = tmp;
1833 1833          tmp = zone->zone_stime;
1834 1834          scalehrtime(&tmp);
1835 1835          zmp->zm_stime.value.ui64 = tmp;
1836 1836          tmp = zone->zone_wtime;
1837 1837          scalehrtime(&tmp);
1838 1838          zmp->zm_wtime.value.ui64 = tmp;
1839 1839  
1840 1840          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1841 1841          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1842 1842          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1843 1843  
1844 1844          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1845 1845          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1846 1846          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1847 1847          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1848 1848  
1849 1849          return (0);
1850 1850  }
1851 1851  
1852 1852  static kstat_t *
1853 1853  zone_misc_kstat_create(zone_t *zone)
1854 1854  {
1855 1855          kstat_t *ksp;
1856 1856          zone_misc_kstat_t *zmp;
1857 1857  
1858 1858          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1859 1859              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1860 1860              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1861 1861              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1862 1862                  return (NULL);
1863 1863  
1864 1864          if (zone->zone_id != GLOBAL_ZONEID)
1865 1865                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1866 1866  
1867 1867          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1868 1868          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1869 1869          ksp->ks_lock = &zone->zone_misc_lock;
1870 1870          zone->zone_misc_stats = zmp;
1871 1871  
1872 1872          /* The kstat "name" field is not large enough for a full zonename */
1873 1873          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1874 1874          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1875 1875          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1876 1876          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1877 1877          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1878 1878          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1879 1879          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1880 1880          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1881 1881              KSTAT_DATA_UINT32);
1882 1882          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1883 1883          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1884 1884              KSTAT_DATA_UINT32);
1885 1885          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1886 1886          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1887 1887  
1888 1888  
1889 1889          ksp->ks_update = zone_misc_kstat_update;
1890 1890          ksp->ks_private = zone;
1891 1891  
1892 1892          kstat_install(ksp);
1893 1893          return (ksp);
1894 1894  }
1895 1895  
1896 1896  static void
1897 1897  zone_kstat_create(zone_t *zone)
1898 1898  {
1899 1899          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1900 1900              "lockedmem", zone_lockedmem_kstat_update);
1901 1901          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1902 1902              "swapresv", zone_swapresv_kstat_update);
1903 1903          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1904 1904              "nprocs", zone_nprocs_kstat_update);
1905 1905  
1906 1906          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
1907 1907                  zone->zone_misc_stats = kmem_zalloc(
1908 1908                      sizeof (zone_misc_kstat_t), KM_SLEEP);
1909 1909          }
1910 1910  }
1911 1911  
1912 1912  static void
1913 1913  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
1914 1914  {
1915 1915          void *data;
1916 1916  
1917 1917          if (*pkstat != NULL) {
1918 1918                  data = (*pkstat)->ks_data;
1919 1919                  kstat_delete(*pkstat);
1920 1920                  kmem_free(data, datasz);
1921 1921                  *pkstat = NULL;
1922 1922          }
1923 1923  }
1924 1924  
1925 1925  static void
1926 1926  zone_kstat_delete(zone_t *zone)
1927 1927  {
1928 1928          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
1929 1929              sizeof (zone_kstat_t));
1930 1930          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
1931 1931              sizeof (zone_kstat_t));
1932 1932          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
1933 1933              sizeof (zone_kstat_t));
1934 1934          zone_kstat_delete_common(&zone->zone_misc_ksp,
1935 1935              sizeof (zone_misc_kstat_t));
1936 1936  }
1937 1937  
1938 1938  /*
1939 1939   * Called very early on in boot to initialize the ZSD list so that
1940 1940   * zone_key_create() can be called before zone_init().  It also initializes
1941 1941   * portions of zone0 which may be used before zone_init() is called.  The
1942 1942   * variable "global_zone" will be set when zone0 is fully initialized by
1943 1943   * zone_init().
1944 1944   */
1945 1945  void
1946 1946  zone_zsd_init(void)
1947 1947  {
1948 1948          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1949 1949          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1950 1950          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1951 1951              offsetof(struct zsd_entry, zsd_linkage));
1952 1952          list_create(&zone_active, sizeof (zone_t),
1953 1953              offsetof(zone_t, zone_linkage));
1954 1954          list_create(&zone_deathrow, sizeof (zone_t),
1955 1955              offsetof(zone_t, zone_linkage));
1956 1956  
1957 1957          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1958 1958          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1959 1959          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1960 1960          zone0.zone_shares = 1;
1961 1961          zone0.zone_nlwps = 0;
1962 1962          zone0.zone_nlwps_ctl = INT_MAX;
1963 1963          zone0.zone_nprocs = 0;
1964 1964          zone0.zone_nprocs_ctl = INT_MAX;
1965 1965          zone0.zone_locked_mem = 0;
1966 1966          zone0.zone_locked_mem_ctl = UINT64_MAX;
1967 1967          ASSERT(zone0.zone_max_swap == 0);
1968 1968          zone0.zone_max_swap_ctl = UINT64_MAX;
1969 1969          zone0.zone_max_lofi = 0;
1970 1970          zone0.zone_max_lofi_ctl = UINT64_MAX;
1971 1971          zone0.zone_shmmax = 0;
1972 1972          zone0.zone_ipc.ipcq_shmmni = 0;
1973 1973          zone0.zone_ipc.ipcq_semmni = 0;
1974 1974          zone0.zone_ipc.ipcq_msgmni = 0;
1975 1975          zone0.zone_name = GLOBAL_ZONENAME;
1976 1976          zone0.zone_nodename = utsname.nodename;
1977 1977          zone0.zone_domain = srpc_domain;
1978 1978          zone0.zone_hostid = HW_INVALID_HOSTID;
1979 1979          zone0.zone_fs_allowed = NULL;
1980 1980          zone0.zone_ref = 1;
1981 1981          zone0.zone_id = GLOBAL_ZONEID;
1982 1982          zone0.zone_status = ZONE_IS_RUNNING;
1983 1983          zone0.zone_rootpath = "/";
1984 1984          zone0.zone_rootpathlen = 2;
1985 1985          zone0.zone_psetid = ZONE_PS_INVAL;
1986 1986          zone0.zone_ncpus = 0;
1987 1987          zone0.zone_ncpus_online = 0;
1988 1988          zone0.zone_proc_initpid = 1;
1989 1989          zone0.zone_initname = initname;
1990 1990          zone0.zone_lockedmem_kstat = NULL;
1991 1991          zone0.zone_swapresv_kstat = NULL;
1992 1992          zone0.zone_nprocs_kstat = NULL;
1993 1993  
1994 1994          zone0.zone_stime = 0;
1995 1995          zone0.zone_utime = 0;
1996 1996          zone0.zone_wtime = 0;
1997 1997  
1998 1998          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
1999 1999              offsetof(zone_ref_t, zref_linkage));
2000 2000          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2001 2001              offsetof(struct zsd_entry, zsd_linkage));
2002 2002          list_insert_head(&zone_active, &zone0);
2003 2003  
2004 2004          /*
2005 2005           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2006 2006           * to anything meaningful.  It is assigned to be 'rootdir' in
2007 2007           * vfs_mountroot().
2008 2008           */
2009 2009          zone0.zone_rootvp = NULL;
2010 2010          zone0.zone_vfslist = NULL;
2011 2011          zone0.zone_bootargs = initargs;
2012 2012          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2013 2013          /*
2014 2014           * The global zone has all privileges
2015 2015           */
2016 2016          priv_fillset(zone0.zone_privset);
2017 2017          /*
2018 2018           * Add p0 to the global zone
2019 2019           */
2020 2020          zone0.zone_zsched = &p0;
2021 2021          p0.p_zone = &zone0;
2022 2022  }
2023 2023  
2024 2024  /*
2025 2025   * Compute a hash value based on the contents of the label and the DOI.  The
2026 2026   * hash algorithm is somewhat arbitrary, but is based on the observation that
2027 2027   * humans will likely pick labels that differ by amounts that work out to be
2028 2028   * multiples of the number of hash chains, and thus stirring in some primes
2029 2029   * should help.
2030 2030   */
2031 2031  static uint_t
2032 2032  hash_bylabel(void *hdata, mod_hash_key_t key)
2033 2033  {
2034 2034          const ts_label_t *lab = (ts_label_t *)key;
2035 2035          const uint32_t *up, *ue;
2036 2036          uint_t hash;
2037 2037          int i;
2038 2038  
2039 2039          _NOTE(ARGUNUSED(hdata));
2040 2040  
2041 2041          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2042 2042          /* we depend on alignment of label, but not representation */
2043 2043          up = (const uint32_t *)&lab->tsl_label;
2044 2044          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2045 2045          i = 1;
2046 2046          while (up < ue) {
2047 2047                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2048 2048                  hash += *up + (*up << ((i % 16) + 1));
2049 2049                  up++;
2050 2050                  i++;
2051 2051          }
2052 2052          return (hash);
2053 2053  }
2054 2054  
2055 2055  /*
2056 2056   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2057 2057   * equal).  This may need to be changed if less than / greater than is ever
2058 2058   * needed.
2059 2059   */
2060 2060  static int
2061 2061  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2062 2062  {
2063 2063          ts_label_t *lab1 = (ts_label_t *)key1;
2064 2064          ts_label_t *lab2 = (ts_label_t *)key2;
2065 2065  
2066 2066          return (label_equal(lab1, lab2) ? 0 : 1);
2067 2067  }
2068 2068  
2069 2069  /*
2070 2070   * Called by main() to initialize the zones framework.
2071 2071   */
2072 2072  void
2073 2073  zone_init(void)
2074 2074  {
2075 2075          rctl_dict_entry_t *rde;
2076 2076          rctl_val_t *dval;
2077 2077          rctl_set_t *set;
2078 2078          rctl_alloc_gp_t *gp;
2079 2079          rctl_entity_p_t e;
2080 2080          int res;
2081 2081  
2082 2082          ASSERT(curproc == &p0);
2083 2083  
2084 2084          /*
2085 2085           * Create ID space for zone IDs.  ID 0 is reserved for the
2086 2086           * global zone.
2087 2087           */
2088 2088          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2089 2089  
2090 2090          /*
2091 2091           * Initialize generic zone resource controls, if any.
2092 2092           */
2093 2093          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2094 2094              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2095 2095              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2096 2096              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2097 2097  
2098 2098          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2099 2099              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2100 2100              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2101 2101              RCTL_GLOBAL_INFINITE,
2102 2102              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2103 2103  
2104 2104          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2105 2105              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2106 2106              INT_MAX, INT_MAX, &zone_lwps_ops);
2107 2107  
2108 2108          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2109 2109              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2110 2110              INT_MAX, INT_MAX, &zone_procs_ops);
2111 2111  
2112 2112          /*
2113 2113           * System V IPC resource controls
2114 2114           */
2115 2115          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2116 2116              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2117 2117              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2118 2118  
2119 2119          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2120 2120              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2121 2121              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2122 2122  
2123 2123          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2124 2124              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2125 2125              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2126 2126  
2127 2127          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2128 2128              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2129 2129              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2130 2130  
2131 2131          /*
2132 2132           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2133 2133           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2134 2134           */
2135 2135          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2136 2136          bzero(dval, sizeof (rctl_val_t));
2137 2137          dval->rcv_value = 1;
2138 2138          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2139 2139          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2140 2140          dval->rcv_action_recip_pid = -1;
2141 2141  
2142 2142          rde = rctl_dict_lookup("zone.cpu-shares");
2143 2143          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2144 2144  
2145 2145          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2146 2146              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2147 2147              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2148 2148              &zone_locked_mem_ops);
2149 2149  
2150 2150          rc_zone_max_swap = rctl_register("zone.max-swap",
2151 2151              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2152 2152              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2153 2153              &zone_max_swap_ops);
2154 2154  
2155 2155          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2156 2156              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2157 2157              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2158 2158              &zone_max_lofi_ops);
2159 2159  
2160 2160          /*
2161 2161           * Initialize the ``global zone''.
2162 2162           */
2163 2163          set = rctl_set_create();
2164 2164          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2165 2165          mutex_enter(&p0.p_lock);
2166 2166          e.rcep_p.zone = &zone0;
2167 2167          e.rcep_t = RCENTITY_ZONE;
2168 2168          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2169 2169              gp);
2170 2170  
2171 2171          zone0.zone_nlwps = p0.p_lwpcnt;
2172 2172          zone0.zone_nprocs = 1;
2173 2173          zone0.zone_ntasks = 1;
2174 2174          mutex_exit(&p0.p_lock);
2175 2175          zone0.zone_restart_init = B_TRUE;
2176 2176          zone0.zone_brand = &native_brand;
2177 2177          rctl_prealloc_destroy(gp);
2178 2178          /*
2179 2179           * pool_default hasn't been initialized yet, so we let pool_init()
2180 2180           * take care of making sure the global zone is in the default pool.
2181 2181           */
2182 2182  
2183 2183          /*
2184 2184           * Initialize global zone kstats
2185 2185           */
2186 2186          zone_kstat_create(&zone0);
2187 2187  
2188 2188          /*
2189 2189           * Initialize zone label.
2190 2190           * mlp are initialized when tnzonecfg is loaded.
2191 2191           */
2192 2192          zone0.zone_slabel = l_admin_low;
2193 2193          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2194 2194          label_hold(l_admin_low);
2195 2195  
2196 2196          /*
2197 2197           * Initialise the lock for the database structure used by mntfs.
2198 2198           */
2199 2199          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2200 2200  
2201 2201          mutex_enter(&zonehash_lock);
2202 2202          zone_uniqid(&zone0);
2203 2203          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2204 2204  
2205 2205          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2206 2206              mod_hash_null_valdtor);
2207 2207          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2208 2208              zone_hash_size, mod_hash_null_valdtor);
2209 2209          /*
2210 2210           * maintain zonehashbylabel only for labeled systems
2211 2211           */
2212 2212          if (is_system_labeled())
2213 2213                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2214 2214                      zone_hash_size, mod_hash_null_keydtor,
2215 2215                      mod_hash_null_valdtor, hash_bylabel, NULL,
2216 2216                      hash_labelkey_cmp, KM_SLEEP);
2217 2217          zonecount = 1;
2218 2218  
2219 2219          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2220 2220              (mod_hash_val_t)&zone0);
2221 2221          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2222 2222              (mod_hash_val_t)&zone0);
2223 2223          if (is_system_labeled()) {
2224 2224                  zone0.zone_flags |= ZF_HASHED_LABEL;
2225 2225                  (void) mod_hash_insert(zonehashbylabel,
2226 2226                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2227 2227          }
2228 2228          mutex_exit(&zonehash_lock);
2229 2229  
2230 2230          /*
2231 2231           * We avoid setting zone_kcred until now, since kcred is initialized
2232 2232           * sometime after zone_zsd_init() and before zone_init().
2233 2233           */
2234 2234          zone0.zone_kcred = kcred;
2235 2235          /*
2236 2236           * The global zone is fully initialized (except for zone_rootvp which
2237 2237           * will be set when the root filesystem is mounted).
2238 2238           */
2239 2239          global_zone = &zone0;
2240 2240  
2241 2241          /*
2242 2242           * Setup an event channel to send zone status change notifications on
2243 2243           */
2244 2244          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,

↓ open down ↓

2244 lines elided

↑ open up ↑

2245 2245              EVCH_CREAT);
2246 2246  
2247 2247          if (res)
2248 2248                  panic("Sysevent_evc_bind failed during zone setup.\n");
2249 2249  
2250 2250  }
2251 2251  
2252 2252  static void
2253 2253  zone_free(zone_t *zone)
2254 2254  {
     2255 +        zone_dl_t *zdl;
     2256 +
2255 2257          ASSERT(zone != global_zone);
2256 2258          ASSERT(zone->zone_ntasks == 0);
2257 2259          ASSERT(zone->zone_nlwps == 0);
2258 2260          ASSERT(zone->zone_nprocs == 0);
2259 2261          ASSERT(zone->zone_cred_ref == 0);
2260 2262          ASSERT(zone->zone_kcred == NULL);
2261 2263          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2262 2264              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2263 2265          ASSERT(list_is_empty(&zone->zone_ref_list));
2264 2266

2265 2267          /*
2266 2268           * Remove any zone caps.
2267 2269           */
2268 2270          cpucaps_zone_remove(zone);
2269 2271  
2270 2272          ASSERT(zone->zone_cpucap == NULL);
2271 2273  
2272 2274          /* remove from deathrow list */

↓ open down ↓

8 lines elided

↑ open up ↑

2273 2275          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2274 2276                  ASSERT(zone->zone_ref == 0);
2275 2277                  mutex_enter(&zone_deathrow_lock);
2276 2278                  list_remove(&zone_deathrow, zone);
2277 2279                  mutex_exit(&zone_deathrow_lock);
2278 2280          }
2279 2281  
2280 2282          list_destroy(&zone->zone_ref_list);
2281 2283          zone_free_zsd(zone);
2282 2284          zone_free_datasets(zone);
     2285 +
     2286 +        /*
     2287 +         * While dlmgmtd should have removed all of these, it could have left
     2288 +         * something behind or crashed. In which case it's not safe for us to
     2289 +         * assume that the list is empty which list_destroy() will ASSERT. We
     2290 +         * clean up for our userland comrades which may have crashed, or worse,
     2291 +         * been disabled by SMF.
     2292 +         */
     2293 +        while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
     2294 +                if (zdl->zdl_net != NULL)
     2295 +                        nvlist_free(zdl->zdl_net);
     2296 +                kmem_free(zdl, sizeof (zone_dl_t));
     2297 +        }
2283 2298          list_destroy(&zone->zone_dl_list);
2284 2299  
2285 2300          if (zone->zone_rootvp != NULL)
2286 2301                  VN_RELE(zone->zone_rootvp);
2287 2302          if (zone->zone_rootpath)
2288 2303                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2289 2304          if (zone->zone_name != NULL)
2290 2305                  kmem_free(zone->zone_name, ZONENAME_MAX);
2291 2306          if (zone->zone_slabel != NULL)
2292 2307                  label_rele(zone->zone_slabel);

2293 2308          if (zone->zone_nodename != NULL)
2294 2309                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2295 2310          if (zone->zone_domain != NULL)
2296 2311                  kmem_free(zone->zone_domain, _SYS_NMLN);
2297 2312          if (zone->zone_privset != NULL)
2298 2313                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2299 2314          if (zone->zone_rctls != NULL)
2300 2315                  rctl_set_free(zone->zone_rctls);
2301 2316          if (zone->zone_bootargs != NULL)
2302 2317                  strfree(zone->zone_bootargs);
2303 2318          if (zone->zone_initname != NULL)
2304 2319                  strfree(zone->zone_initname);
2305 2320          if (zone->zone_fs_allowed != NULL)
2306 2321                  strfree(zone->zone_fs_allowed);
2307 2322          if (zone->zone_pfexecd != NULL)
2308 2323                  klpd_freelist(&zone->zone_pfexecd);
2309 2324          id_free(zoneid_space, zone->zone_id);
2310 2325          mutex_destroy(&zone->zone_lock);
2311 2326          cv_destroy(&zone->zone_cv);
2312 2327          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2313 2328          rw_destroy(&zone->zone_mntfs_db_lock);
2314 2329          kmem_free(zone, sizeof (zone_t));
2315 2330  }
2316 2331  
2317 2332  /*
2318 2333   * See block comment at the top of this file for information about zone
2319 2334   * status values.
2320 2335   */
2321 2336  /*
2322 2337   * Convenience function for setting zone status.
2323 2338   */
2324 2339  static void
2325 2340  zone_status_set(zone_t *zone, zone_status_t status)
2326 2341  {
2327 2342  
2328 2343          nvlist_t *nvl = NULL;
2329 2344          ASSERT(MUTEX_HELD(&zone_status_lock));
2330 2345          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2331 2346              status >= zone_status_get(zone));
2332 2347  
2333 2348          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2334 2349              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2335 2350              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2336 2351              zone_status_table[status]) ||
2337 2352              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2338 2353              zone_status_table[zone->zone_status]) ||
2339 2354              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2340 2355              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2341 2356              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2342 2357              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2343 2358  #ifdef DEBUG
2344 2359                  (void) printf(
2345 2360                      "Failed to allocate and send zone state change event.\n");
2346 2361  #endif
2347 2362          }
2348 2363          nvlist_free(nvl);
2349 2364  
2350 2365          zone->zone_status = status;
2351 2366  
2352 2367          cv_broadcast(&zone->zone_cv);
2353 2368  }
2354 2369  
2355 2370  /*
2356 2371   * Public function to retrieve the zone status.  The zone status may
2357 2372   * change after it is retrieved.
2358 2373   */
2359 2374  zone_status_t
2360 2375  zone_status_get(zone_t *zone)
2361 2376  {
2362 2377          return (zone->zone_status);
2363 2378  }
2364 2379  
2365 2380  static int
2366 2381  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2367 2382  {
2368 2383          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2369 2384          int err = 0;
2370 2385  
2371 2386          ASSERT(zone != global_zone);
2372 2387          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2373 2388                  goto done;      /* EFAULT or ENAMETOOLONG */
2374 2389  
2375 2390          if (zone->zone_bootargs != NULL)
2376 2391                  strfree(zone->zone_bootargs);
2377 2392  
2378 2393          zone->zone_bootargs = strdup(buf);
2379 2394  
2380 2395  done:
2381 2396          kmem_free(buf, BOOTARGS_MAX);
2382 2397          return (err);
2383 2398  }
2384 2399  
2385 2400  static int
2386 2401  zone_set_brand(zone_t *zone, const char *brand)
2387 2402  {
2388 2403          struct brand_attr *attrp;
2389 2404          brand_t *bp;
2390 2405  
2391 2406          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2392 2407          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2393 2408                  kmem_free(attrp, sizeof (struct brand_attr));
2394 2409                  return (EFAULT);
2395 2410          }
2396 2411  
2397 2412          bp = brand_register_zone(attrp);
2398 2413          kmem_free(attrp, sizeof (struct brand_attr));
2399 2414          if (bp == NULL)
2400 2415                  return (EINVAL);
2401 2416  
2402 2417          /*
2403 2418           * This is the only place where a zone can change it's brand.
2404 2419           * We already need to hold zone_status_lock to check the zone
2405 2420           * status, so we'll just use that lock to serialize zone
2406 2421           * branding requests as well.
2407 2422           */
2408 2423          mutex_enter(&zone_status_lock);
2409 2424  
2410 2425          /* Re-Branding is not allowed and the zone can't be booted yet */
2411 2426          if ((ZONE_IS_BRANDED(zone)) ||
2412 2427              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2413 2428                  mutex_exit(&zone_status_lock);
2414 2429                  brand_unregister_zone(bp);
2415 2430                  return (EINVAL);
2416 2431          }
2417 2432  
2418 2433          /* set up the brand specific data */
2419 2434          zone->zone_brand = bp;
2420 2435          ZBROP(zone)->b_init_brand_data(zone);
2421 2436  
2422 2437          mutex_exit(&zone_status_lock);
2423 2438          return (0);
2424 2439  }
2425 2440  
2426 2441  static int
2427 2442  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2428 2443  {
2429 2444          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2430 2445          int err = 0;
2431 2446  
2432 2447          ASSERT(zone != global_zone);
2433 2448          if ((err = copyinstr(zone_fs_allowed, buf,
2434 2449              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2435 2450                  goto done;
2436 2451  
2437 2452          if (zone->zone_fs_allowed != NULL)
2438 2453                  strfree(zone->zone_fs_allowed);
2439 2454  
2440 2455          zone->zone_fs_allowed = strdup(buf);
2441 2456  
2442 2457  done:
2443 2458          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2444 2459          return (err);
2445 2460  }
2446 2461  
2447 2462  static int
2448 2463  zone_set_initname(zone_t *zone, const char *zone_initname)
2449 2464  {
2450 2465          char initname[INITNAME_SZ];
2451 2466          size_t len;
2452 2467          int err = 0;
2453 2468  
2454 2469          ASSERT(zone != global_zone);
2455 2470          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2456 2471                  return (err);   /* EFAULT or ENAMETOOLONG */
2457 2472  
2458 2473          if (zone->zone_initname != NULL)
2459 2474                  strfree(zone->zone_initname);
2460 2475  
2461 2476          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2462 2477          (void) strcpy(zone->zone_initname, initname);
2463 2478          return (0);
2464 2479  }
2465 2480  
2466 2481  static int
2467 2482  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2468 2483  {
2469 2484          uint64_t mcap;
2470 2485          int err = 0;
2471 2486  
2472 2487          if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2473 2488                  zone->zone_phys_mcap = mcap;
2474 2489  
2475 2490          return (err);
2476 2491  }
2477 2492  
2478 2493  static int
2479 2494  zone_set_sched_class(zone_t *zone, const char *new_class)
2480 2495  {
2481 2496          char sched_class[PC_CLNMSZ];
2482 2497          id_t classid;
2483 2498          int err;
2484 2499  
2485 2500          ASSERT(zone != global_zone);
2486 2501          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2487 2502                  return (err);   /* EFAULT or ENAMETOOLONG */
2488 2503  
2489 2504          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2490 2505                  return (set_errno(EINVAL));
2491 2506          zone->zone_defaultcid = classid;
2492 2507          ASSERT(zone->zone_defaultcid > 0 &&
2493 2508              zone->zone_defaultcid < loaded_classes);
2494 2509  
2495 2510          return (0);
2496 2511  }
2497 2512  
2498 2513  /*
2499 2514   * Block indefinitely waiting for (zone_status >= status)
2500 2515   */
2501 2516  void
2502 2517  zone_status_wait(zone_t *zone, zone_status_t status)
2503 2518  {
2504 2519          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2505 2520  
2506 2521          mutex_enter(&zone_status_lock);
2507 2522          while (zone->zone_status < status) {
2508 2523                  cv_wait(&zone->zone_cv, &zone_status_lock);
2509 2524          }
2510 2525          mutex_exit(&zone_status_lock);
2511 2526  }
2512 2527  
2513 2528  /*
2514 2529   * Private CPR-safe version of zone_status_wait().
2515 2530   */
2516 2531  static void
2517 2532  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2518 2533  {
2519 2534          callb_cpr_t cprinfo;
2520 2535  
2521 2536          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2522 2537  
2523 2538          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2524 2539              str);
2525 2540          mutex_enter(&zone_status_lock);
2526 2541          while (zone->zone_status < status) {
2527 2542                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2528 2543                  cv_wait(&zone->zone_cv, &zone_status_lock);
2529 2544                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2530 2545          }
2531 2546          /*
2532 2547           * zone_status_lock is implicitly released by the following.
2533 2548           */
2534 2549          CALLB_CPR_EXIT(&cprinfo);
2535 2550  }
2536 2551  
2537 2552  /*
2538 2553   * Block until zone enters requested state or signal is received.  Return (0)
2539 2554   * if signaled, non-zero otherwise.
2540 2555   */
2541 2556  int
2542 2557  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2543 2558  {
2544 2559          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2545 2560  
2546 2561          mutex_enter(&zone_status_lock);
2547 2562          while (zone->zone_status < status) {
2548 2563                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2549 2564                          mutex_exit(&zone_status_lock);
2550 2565                          return (0);
2551 2566                  }
2552 2567          }
2553 2568          mutex_exit(&zone_status_lock);
2554 2569          return (1);
2555 2570  }
2556 2571  
2557 2572  /*
2558 2573   * Block until the zone enters the requested state or the timeout expires,
2559 2574   * whichever happens first.  Return (-1) if operation timed out, time remaining
2560 2575   * otherwise.
2561 2576   */
2562 2577  clock_t
2563 2578  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2564 2579  {
2565 2580          clock_t timeleft = 0;
2566 2581  
2567 2582          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2568 2583  
2569 2584          mutex_enter(&zone_status_lock);
2570 2585          while (zone->zone_status < status && timeleft != -1) {
2571 2586                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2572 2587          }
2573 2588          mutex_exit(&zone_status_lock);
2574 2589          return (timeleft);
2575 2590  }
2576 2591  
2577 2592  /*
2578 2593   * Block until the zone enters the requested state, the current process is
2579 2594   * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2580 2595   * operation timed out, 0 if signaled, time remaining otherwise.
2581 2596   */
2582 2597  clock_t
2583 2598  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2584 2599  {
2585 2600          clock_t timeleft = tim - ddi_get_lbolt();
2586 2601  
2587 2602          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2588 2603  
2589 2604          mutex_enter(&zone_status_lock);
2590 2605          while (zone->zone_status < status) {
2591 2606                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2592 2607                      tim);
2593 2608                  if (timeleft <= 0)
2594 2609                          break;
2595 2610          }
2596 2611          mutex_exit(&zone_status_lock);
2597 2612          return (timeleft);
2598 2613  }
2599 2614  
2600 2615  /*
2601 2616   * Zones have two reference counts: one for references from credential
2602 2617   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2603 2618   * This is so we can allow a zone to be rebooted while there are still
2604 2619   * outstanding cred references, since certain drivers cache dblks (which
2605 2620   * implicitly results in cached creds).  We wait for zone_ref to drop to
2606 2621   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2607 2622   * later freed when the zone_cred_ref drops to 0, though nothing other
2608 2623   * than the zone id and privilege set should be accessed once the zone
2609 2624   * is "dead".
2610 2625   *
2611 2626   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2612 2627   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2613 2628   * to 0.  This can be useful to flush out other sources of cached creds
2614 2629   * that may be less innocuous than the driver case.
2615 2630   *
2616 2631   * Zones also provide a tracked reference counting mechanism in which zone
2617 2632   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2618 2633   * debuggers determine the sources of leaked zone references.  See
2619 2634   * zone_hold_ref() and zone_rele_ref() below for more information.
2620 2635   */
2621 2636  
2622 2637  int zone_wait_for_cred = 0;  /* debug flag: when non-zero, ZONE_IS_UNREF() also requires zone_cred_ref == 0 */
2623 2638  
2624 2639  static void
2625 2640  zone_hold_locked(zone_t *z)
2626 2641  {
2627 2642          ASSERT(MUTEX_HELD(&z->zone_lock));
2628 2643          z->zone_ref++;
2629 2644          ASSERT(z->zone_ref != 0);
2630 2645  }
2631 2646  
2632 2647  /*
2633 2648   * Increment the specified zone's reference count.  The zone's zone_t structure
2634 2649   * will not be freed as long as the zone's reference count is nonzero.
2635 2650   * Decrement the zone's reference count via zone_rele().
2636 2651   *
2637 2652   * NOTE: This function should only be used to hold zones for short periods of
2638 2653   * time.  Use zone_hold_ref() if the zone must be held for a long time.
2639 2654   */
2640 2655  void
2641 2656  zone_hold(zone_t *z)
2642 2657  {
2643 2658          mutex_enter(&z->zone_lock);
2644 2659          zone_hold_locked(z);
2645 2660          mutex_exit(&z->zone_lock);
2646 2661  }
2647 2662  
/*
 * A zone is ready to be destroyed once its non-cred reference count is down
 * to 1 and, when zone_wait_for_cred is set, its cred reference count has
 * reached 0 as well.
 */
#define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
            ((zone)->zone_cred_ref == 0 || !zone_wait_for_cred))
2655 2670  
2656 2671  /*
2657 2672   * Common zone reference release function invoked by zone_rele() and
2658 2673   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2659 2674   * zone's subsystem-specific reference counters are not affected by the
2660 2675   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2661 2676   * removed from the specified zone's reference list.  ref must be non-NULL iff
2662 2677   * subsys is not ZONE_REF_NUM_SUBSYS.
2663 2678   */
2664 2679  static void
2665 2680  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2666 2681  {
2667 2682          boolean_t wakeup;
2668 2683  
2669 2684          mutex_enter(&z->zone_lock);
2670 2685          ASSERT(z->zone_ref != 0);
2671 2686          z->zone_ref--;
2672 2687          if (subsys != ZONE_REF_NUM_SUBSYS) {
2673 2688                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2674 2689                  z->zone_subsys_ref[subsys]--;
2675 2690                  list_remove(&z->zone_ref_list, ref);
2676 2691          }
2677 2692          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2678 2693                  /* no more refs, free the structure */
2679 2694                  mutex_exit(&z->zone_lock);
2680 2695                  zone_free(z);
2681 2696                  return;
2682 2697          }
2683 2698          /* signal zone_destroy so the zone can finish halting */
2684 2699          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2685 2700          mutex_exit(&z->zone_lock);
2686 2701  
2687 2702          if (wakeup) {
2688 2703                  /*
2689 2704                   * Grabbing zonehash_lock here effectively synchronizes with
2690 2705                   * zone_destroy() to avoid missed signals.
2691 2706                   */
2692 2707                  mutex_enter(&zonehash_lock);
2693 2708                  cv_broadcast(&zone_destroy_cv);
2694 2709                  mutex_exit(&zonehash_lock);
2695 2710          }
2696 2711  }
2697 2712  
2698 2713  /*
2699 2714   * Decrement the specified zone's reference count.  The specified zone will
2700 2715   * cease to exist after this function returns if the reference count drops to
2701 2716   * zero.  This function should be paired with zone_hold().
2702 2717   */
2703 2718  void
2704 2719  zone_rele(zone_t *z)
2705 2720  {
2706 2721          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2707 2722  }
2708 2723  
2709 2724  /*
2710 2725   * Initialize a zone reference structure.  This function must be invoked for
2711 2726   * a reference structure before the structure is passed to zone_hold_ref().
2712 2727   */
2713 2728  void
2714 2729  zone_init_ref(zone_ref_t *ref)
2715 2730  {
2716 2731          ref->zref_zone = NULL;
2717 2732          list_link_init(&ref->zref_linkage);
2718 2733  }
2719 2734  
2720 2735  /*
2721 2736   * Acquire a reference to zone z.  The caller must specify the
2722 2737   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2723 2738   * zone_ref_t structure will represent a reference to the specified zone.  Use
2724 2739   * zone_rele_ref() to release the reference.
2725 2740   *
2726 2741   * The referenced zone_t structure will not be freed as long as the zone_t's
2727 2742   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2728 2743   * references.
2729 2744   *
2730 2745   * NOTE: The zone_ref_t structure must be initialized before it is used.
2731 2746   * See zone_init_ref() above.
2732 2747   */
2733 2748  void
2734 2749  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2735 2750  {
2736 2751          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2737 2752  
2738 2753          /*
2739 2754           * Prevent consumers from reusing a reference structure before
2740 2755           * releasing it.
2741 2756           */
2742 2757          VERIFY(ref->zref_zone == NULL);
2743 2758  
2744 2759          ref->zref_zone = z;
2745 2760          mutex_enter(&z->zone_lock);
2746 2761          zone_hold_locked(z);
2747 2762          z->zone_subsys_ref[subsys]++;
2748 2763          ASSERT(z->zone_subsys_ref[subsys] != 0);
2749 2764          list_insert_head(&z->zone_ref_list, ref);
2750 2765          mutex_exit(&z->zone_lock);
2751 2766  }
2752 2767  
2753 2768  /*
2754 2769   * Release the zone reference represented by the specified zone_ref_t.
2755 2770   * The reference is invalid after it's released; however, the zone_ref_t
2756 2771   * structure can be reused without having to invoke zone_init_ref().
2757 2772   * subsys should be the same value that was passed to zone_hold_ref()
2758 2773   * when the reference was acquired.
2759 2774   */
2760 2775  void
2761 2776  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2762 2777  {
2763 2778          zone_rele_common(ref->zref_zone, ref, subsys);
2764 2779  
2765 2780          /*
2766 2781           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2767 2782           * when consumers dereference the reference.  This helps us catch
2768 2783           * consumers who use released references.  Furthermore, this lets
2769 2784           * consumers reuse the zone_ref_t structure without having to
2770 2785           * invoke zone_init_ref().
2771 2786           */
2772 2787          ref->zref_zone = NULL;
2773 2788  }
2774 2789  
2775 2790  void
2776 2791  zone_cred_hold(zone_t *z)
2777 2792  {
2778 2793          mutex_enter(&z->zone_lock);
2779 2794          z->zone_cred_ref++;
2780 2795          ASSERT(z->zone_cred_ref != 0);
2781 2796          mutex_exit(&z->zone_lock);
2782 2797  }
2783 2798  
2784 2799  void
2785 2800  zone_cred_rele(zone_t *z)
2786 2801  {
2787 2802          boolean_t wakeup;
2788 2803  
2789 2804          mutex_enter(&z->zone_lock);
2790 2805          ASSERT(z->zone_cred_ref != 0);
2791 2806          z->zone_cred_ref--;
2792 2807          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2793 2808                  /* no more refs, free the structure */
2794 2809                  mutex_exit(&z->zone_lock);
2795 2810                  zone_free(z);
2796 2811                  return;
2797 2812          }
2798 2813          /*
2799 2814           * If zone_destroy is waiting for the cred references to drain
2800 2815           * out, and they have, signal it.
2801 2816           */
2802 2817          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2803 2818              zone_status_get(z) >= ZONE_IS_DEAD);
2804 2819          mutex_exit(&z->zone_lock);
2805 2820  
2806 2821          if (wakeup) {
2807 2822                  /*
2808 2823                   * Grabbing zonehash_lock here effectively synchronizes with
2809 2824                   * zone_destroy() to avoid missed signals.
2810 2825                   */
2811 2826                  mutex_enter(&zonehash_lock);
2812 2827                  cv_broadcast(&zone_destroy_cv);
2813 2828                  mutex_exit(&zonehash_lock);
2814 2829          }
2815 2830  }
2816 2831  
2817 2832  void
2818 2833  zone_task_hold(zone_t *z)
2819 2834  {
2820 2835          mutex_enter(&z->zone_lock);
2821 2836          z->zone_ntasks++;
2822 2837          ASSERT(z->zone_ntasks != 0);
2823 2838          mutex_exit(&z->zone_lock);
2824 2839  }
2825 2840  
2826 2841  void
2827 2842  zone_task_rele(zone_t *zone)
2828 2843  {
2829 2844          uint_t refcnt;
2830 2845  
2831 2846          mutex_enter(&zone->zone_lock);
2832 2847          ASSERT(zone->zone_ntasks != 0);
2833 2848          refcnt = --zone->zone_ntasks;
2834 2849          if (refcnt > 1) {       /* Common case */
2835 2850                  mutex_exit(&zone->zone_lock);
2836 2851                  return;
2837 2852          }
2838 2853          zone_hold_locked(zone); /* so we can use the zone_t later */
2839 2854          mutex_exit(&zone->zone_lock);
2840 2855          if (refcnt == 1) {
2841 2856                  /*
2842 2857                   * See if the zone is shutting down.
2843 2858                   */
2844 2859                  mutex_enter(&zone_status_lock);
2845 2860                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2846 2861                          goto out;
2847 2862                  }
2848 2863  
2849 2864                  /*
2850 2865                   * Make sure the ntasks didn't change since we
2851 2866                   * dropped zone_lock.
2852 2867                   */
2853 2868                  mutex_enter(&zone->zone_lock);
2854 2869                  if (refcnt != zone->zone_ntasks) {
2855 2870                          mutex_exit(&zone->zone_lock);
2856 2871                          goto out;
2857 2872                  }
2858 2873                  mutex_exit(&zone->zone_lock);
2859 2874  
2860 2875                  /*
2861 2876                   * No more user processes in the zone.  The zone is empty.
2862 2877                   */
2863 2878                  zone_status_set(zone, ZONE_IS_EMPTY);
2864 2879                  goto out;
2865 2880          }
2866 2881  
2867 2882          ASSERT(refcnt == 0);
2868 2883          /*
2869 2884           * zsched has exited; the zone is dead.
2870 2885           */
2871 2886          zone->zone_zsched = NULL;               /* paranoia */
2872 2887          mutex_enter(&zone_status_lock);
2873 2888          zone_status_set(zone, ZONE_IS_DEAD);
2874 2889  out:
2875 2890          mutex_exit(&zone_status_lock);
2876 2891          zone_rele(zone);
2877 2892  }
2878 2893  
2879 2894  zoneid_t
2880 2895  getzoneid(void)
2881 2896  {
2882 2897          return (curproc->p_zone->zone_id);
2883 2898  }
2884 2899  
2885 2900  /*
2886 2901   * Internal versions of zone_find_by_*().  These don't zone_hold() or
2887 2902   * check the validity of a zone's state.
2888 2903   */
2889 2904  static zone_t *
2890 2905  zone_find_all_by_id(zoneid_t zoneid)
2891 2906  {
2892 2907          mod_hash_val_t hv;
2893 2908          zone_t *zone = NULL;
2894 2909  
2895 2910          ASSERT(MUTEX_HELD(&zonehash_lock));
2896 2911  
2897 2912          if (mod_hash_find(zonehashbyid,
2898 2913              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2899 2914                  zone = (zone_t *)hv;
2900 2915          return (zone);
2901 2916  }
2902 2917  
2903 2918  static zone_t *
2904 2919  zone_find_all_by_label(const ts_label_t *label)
2905 2920  {
2906 2921          mod_hash_val_t hv;
2907 2922          zone_t *zone = NULL;
2908 2923  
2909 2924          ASSERT(MUTEX_HELD(&zonehash_lock));
2910 2925  
2911 2926          /*
2912 2927           * zonehashbylabel is not maintained for unlabeled systems
2913 2928           */
2914 2929          if (!is_system_labeled())
2915 2930                  return (NULL);
2916 2931          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2917 2932                  zone = (zone_t *)hv;
2918 2933          return (zone);
2919 2934  }
2920 2935  
2921 2936  static zone_t *
2922 2937  zone_find_all_by_name(char *name)
2923 2938  {
2924 2939          mod_hash_val_t hv;
2925 2940          zone_t *zone = NULL;
2926 2941  
2927 2942          ASSERT(MUTEX_HELD(&zonehash_lock));
2928 2943  
2929 2944          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2930 2945                  zone = (zone_t *)hv;
2931 2946          return (zone);
2932 2947  }
2933 2948  
2934 2949  /*
2935 2950   * Public interface for looking up a zone by zoneid.  Only returns the zone if
2936 2951   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2937 2952   * Caller must call zone_rele() once it is done with the zone.
2938 2953   *
2939 2954   * The zone may begin the zone_destroy() sequence immediately after this
2940 2955   * function returns, but may be safely used until zone_rele() is called.
2941 2956   */
2942 2957  zone_t *
2943 2958  zone_find_by_id(zoneid_t zoneid)
2944 2959  {
2945 2960          zone_t *zone;
2946 2961          zone_status_t status;
2947 2962  
2948 2963          mutex_enter(&zonehash_lock);
2949 2964          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2950 2965                  mutex_exit(&zonehash_lock);
2951 2966                  return (NULL);
2952 2967          }
2953 2968          status = zone_status_get(zone);
2954 2969          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2955 2970                  /*
2956 2971                   * For all practical purposes the zone doesn't exist.
2957 2972                   */
2958 2973                  mutex_exit(&zonehash_lock);
2959 2974                  return (NULL);
2960 2975          }
2961 2976          zone_hold(zone);
2962 2977          mutex_exit(&zonehash_lock);
2963 2978          return (zone);
2964 2979  }
2965 2980  
2966 2981  /*
2967 2982   * Similar to zone_find_by_id, but using zone label as the key.
2968 2983   */
2969 2984  zone_t *
2970 2985  zone_find_by_label(const ts_label_t *label)
2971 2986  {
2972 2987          zone_t *zone;
2973 2988          zone_status_t status;
2974 2989  
2975 2990          mutex_enter(&zonehash_lock);
2976 2991          if ((zone = zone_find_all_by_label(label)) == NULL) {
2977 2992                  mutex_exit(&zonehash_lock);
2978 2993                  return (NULL);
2979 2994          }
2980 2995  
2981 2996          status = zone_status_get(zone);
2982 2997          if (status > ZONE_IS_DOWN) {
2983 2998                  /*
2984 2999                   * For all practical purposes the zone doesn't exist.
2985 3000                   */
2986 3001                  mutex_exit(&zonehash_lock);
2987 3002                  return (NULL);
2988 3003          }
2989 3004          zone_hold(zone);
2990 3005          mutex_exit(&zonehash_lock);
2991 3006          return (zone);
2992 3007  }
2993 3008  
2994 3009  /*
2995 3010   * Similar to zone_find_by_id, but using zone name as the key.
2996 3011   */
2997 3012  zone_t *
2998 3013  zone_find_by_name(char *name)
2999 3014  {
3000 3015          zone_t *zone;
3001 3016          zone_status_t status;
3002 3017  
3003 3018          mutex_enter(&zonehash_lock);
3004 3019          if ((zone = zone_find_all_by_name(name)) == NULL) {
3005 3020                  mutex_exit(&zonehash_lock);
3006 3021                  return (NULL);
3007 3022          }
3008 3023          status = zone_status_get(zone);
3009 3024          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3010 3025                  /*
3011 3026                   * For all practical purposes the zone doesn't exist.
3012 3027                   */
3013 3028                  mutex_exit(&zonehash_lock);
3014 3029                  return (NULL);
3015 3030          }
3016 3031          zone_hold(zone);
3017 3032          mutex_exit(&zonehash_lock);
3018 3033          return (zone);
3019 3034  }
3020 3035  
3021 3036  /*
3022 3037   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3023 3038   * if there is a zone "foo" rooted at /foo/root, and the path argument
3024 3039   * is "/foo/root/proc", it will return the held zone_t corresponding to
3025 3040   * zone "foo".
3026 3041   *
3027 3042   * zone_find_by_path() always returns a non-NULL value, since at the
3028 3043   * very least every path will be contained in the global zone.
3029 3044   *
3030 3045   * As with the other zone_find_by_*() functions, the caller is
3031 3046   * responsible for zone_rele()ing the return value of this function.
3032 3047   */
3033 3048  zone_t *
3034 3049  zone_find_by_path(const char *path)
3035 3050  {
3036 3051          zone_t *zone;
3037 3052          zone_t *zret = NULL;
3038 3053          zone_status_t status;
3039 3054  
3040 3055          if (path == NULL) {
3041 3056                  /*
3042 3057                   * Call from rootconf().
3043 3058                   */
3044 3059                  zone_hold(global_zone);
3045 3060                  return (global_zone);
3046 3061          }
3047 3062          ASSERT(*path == '/');
3048 3063          mutex_enter(&zonehash_lock);
3049 3064          for (zone = list_head(&zone_active); zone != NULL;
3050 3065              zone = list_next(&zone_active, zone)) {
3051 3066                  if (ZONE_PATH_VISIBLE(path, zone))
3052 3067                          zret = zone;
3053 3068          }
3054 3069          ASSERT(zret != NULL);
3055 3070          status = zone_status_get(zret);
3056 3071          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3057 3072                  /*
3058 3073                   * Zone practically doesn't exist.
3059 3074                   */
3060 3075                  zret = global_zone;
3061 3076          }
3062 3077          zone_hold(zret);
3063 3078          mutex_exit(&zonehash_lock);
3064 3079          return (zret);
3065 3080  }
3066 3081  
3067 3082  /*
3068 3083   * Public interface for updating per-zone load averages.  Called once per
3069 3084   * second.
3070 3085   *
3071 3086   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3072 3087   */
3073 3088  void
3074 3089  zone_loadavg_update()
3075 3090  {
3076 3091          zone_t *zp;
3077 3092          zone_status_t status;
3078 3093          struct loadavg_s *lavg;
3079 3094          hrtime_t zone_total;
3080 3095          int i;
3081 3096          hrtime_t hr_avg;
3082 3097          int nrun;
3083 3098          static int64_t f[3] = { 135, 27, 9 };
3084 3099          int64_t q, r;
3085 3100  
3086 3101          mutex_enter(&zonehash_lock);
3087 3102          for (zp = list_head(&zone_active); zp != NULL;
3088 3103              zp = list_next(&zone_active, zp)) {
3089 3104                  mutex_enter(&zp->zone_lock);
3090 3105  
3091 3106                  /* Skip zones that are on the way down or not yet up */
3092 3107                  status = zone_status_get(zp);
3093 3108                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3094 3109                          /* For all practical purposes the zone doesn't exist. */
3095 3110                          mutex_exit(&zp->zone_lock);
3096 3111                          continue;
3097 3112                  }
3098 3113  
3099 3114                  /*
3100 3115                   * Update the 10 second moving average data in zone_loadavg.
3101 3116                   */
3102 3117                  lavg = &zp->zone_loadavg;
3103 3118  
3104 3119                  zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
3105 3120                  scalehrtime(&zone_total);
3106 3121  
3107 3122                  /* The zone_total should always be increasing. */
3108 3123                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3109 3124                      zone_total - lavg->lg_total : 0;
3110 3125                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3111 3126                  /* lg_total holds the prev. 1 sec. total */
3112 3127                  lavg->lg_total = zone_total;
3113 3128  
3114 3129                  /*
3115 3130                   * To simplify the calculation, we don't calculate the load avg.
3116 3131                   * until the zone has been up for at least 10 seconds and our
3117 3132                   * moving average is thus full.
3118 3133                   */
3119 3134                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3120 3135                          lavg->lg_len++;
3121 3136                          mutex_exit(&zp->zone_lock);
3122 3137                          continue;
3123 3138                  }
3124 3139  
3125 3140                  /* Now calculate the 1min, 5min, 15 min load avg. */
3126 3141                  hr_avg = 0;
3127 3142                  for (i = 0; i < S_LOADAVG_SZ; i++)
3128 3143                          hr_avg += lavg->lg_loads[i];
3129 3144                  hr_avg = hr_avg / S_LOADAVG_SZ;
3130 3145                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3131 3146  
3132 3147                  /* Compute load avg. See comment in calcloadavg() */
3133 3148                  for (i = 0; i < 3; i++) {
3134 3149                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3135 3150                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3136 3151                          zp->zone_hp_avenrun[i] +=
3137 3152                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3138 3153  
3139 3154                          /* avenrun[] can only hold 31 bits of load avg. */
3140 3155                          if (zp->zone_hp_avenrun[i] <
3141 3156                              ((uint64_t)1<<(31+16-FSHIFT)))
3142 3157                                  zp->zone_avenrun[i] = (int32_t)
3143 3158                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3144 3159                          else
3145 3160                                  zp->zone_avenrun[i] = 0x7fffffff;
3146 3161                  }
3147 3162  
3148 3163                  mutex_exit(&zp->zone_lock);
3149 3164          }
3150 3165          mutex_exit(&zonehash_lock);
3151 3166  }
3152 3167  
3153 3168  /*
3154 3169   * Get the number of cpus visible to this zone.  The system-wide global
3155 3170   * 'ncpus' is returned if pools are disabled, the caller is in the
3156 3171   * global zone, or a NULL zone argument is passed in.
3157 3172   */
3158 3173  int
3159 3174  zone_ncpus_get(zone_t *zone)
3160 3175  {
3161 3176          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3162 3177  
3163 3178          return (myncpus != 0 ? myncpus : ncpus);
3164 3179  }
3165 3180  
3166 3181  /*
3167 3182   * Get the number of online cpus visible to this zone.  The system-wide
3168 3183   * global 'ncpus_online' is returned if pools are disabled, the caller
3169 3184   * is in the global zone, or a NULL zone argument is passed in.
3170 3185   */
3171 3186  int
3172 3187  zone_ncpus_online_get(zone_t *zone)
3173 3188  {
3174 3189          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3175 3190  
3176 3191          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3177 3192  }
3178 3193  
3179 3194  /*
3180 3195   * Return the pool to which the zone is currently bound.
3181 3196   */
3182 3197  pool_t *
3183 3198  zone_pool_get(zone_t *zone)
3184 3199  {
3185 3200          ASSERT(pool_lock_held());
3186 3201  
3187 3202          return (zone->zone_pool);
3188 3203  }
3189 3204  
3190 3205  /*
3191 3206   * Set the zone's pool pointer and update the zone's visibility to match
3192 3207   * the resources in the new pool.
3193 3208   */
3194 3209  void
3195 3210  zone_pool_set(zone_t *zone, pool_t *pool)
3196 3211  {
3197 3212          ASSERT(pool_lock_held());
3198 3213          ASSERT(MUTEX_HELD(&cpu_lock));
3199 3214  
3200 3215          zone->zone_pool = pool;
3201 3216          zone_pset_set(zone, pool->pool_pset->pset_id);
3202 3217  }
3203 3218  
3204 3219  /*
3205 3220   * Return the cached value of the id of the processor set to which the
3206 3221   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3207 3222   * facility is disabled.
3208 3223   */
3209 3224  psetid_t
3210 3225  zone_pset_get(zone_t *zone)
3211 3226  {
3212 3227          ASSERT(MUTEX_HELD(&cpu_lock));
3213 3228  
3214 3229          return (zone->zone_psetid);
3215 3230  }
3216 3231  
3217 3232  /*
3218 3233   * Set the cached value of the id of the processor set to which the zone
3219 3234   * is currently bound.  Also update the zone's visibility to match the
3220 3235   * resources in the new processor set.
3221 3236   */
3222 3237  void
3223 3238  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3224 3239  {
3225 3240          psetid_t oldpsetid;
3226 3241  
3227 3242          ASSERT(MUTEX_HELD(&cpu_lock));
3228 3243          oldpsetid = zone_pset_get(zone);
3229 3244  
3230 3245          if (oldpsetid == newpsetid)
3231 3246                  return;
3232 3247          /*
3233 3248           * Global zone sees all.
3234 3249           */
3235 3250          if (zone != global_zone) {
3236 3251                  zone->zone_psetid = newpsetid;
3237 3252                  if (newpsetid != ZONE_PS_INVAL)
3238 3253                          pool_pset_visibility_add(newpsetid, zone);
3239 3254                  if (oldpsetid != ZONE_PS_INVAL)
3240 3255                          pool_pset_visibility_remove(oldpsetid, zone);
3241 3256          }
3242 3257          /*
3243 3258           * Disabling pools, so we should start using the global values
3244 3259           * for ncpus and ncpus_online.
3245 3260           */
3246 3261          if (newpsetid == ZONE_PS_INVAL) {
3247 3262                  zone->zone_ncpus = 0;
3248 3263                  zone->zone_ncpus_online = 0;
3249 3264          }
3250 3265  }
3251 3266  
3252 3267  /*
3253 3268   * Walk the list of active zones and issue the provided callback for
3254 3269   * each of them.
3255 3270   *
3256 3271   * Caller must not be holding any locks that may be acquired under
3257 3272   * zonehash_lock.  See comment at the beginning of the file for a list of
3258 3273   * common locks and their interactions with zones.
3259 3274   */
3260 3275  int
3261 3276  zone_walk(int (*cb)(zone_t *, void *), void *data)
3262 3277  {
3263 3278          zone_t *zone;
3264 3279          int ret = 0;
3265 3280          zone_status_t status;
3266 3281  
3267 3282          mutex_enter(&zonehash_lock);
3268 3283          for (zone = list_head(&zone_active); zone != NULL;
3269 3284              zone = list_next(&zone_active, zone)) {
3270 3285                  /*
3271 3286                   * Skip zones that shouldn't be externally visible.
3272 3287                   */
3273 3288                  status = zone_status_get(zone);
3274 3289                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3275 3290                          continue;
3276 3291                  /*
3277 3292                   * Bail immediately if any callback invocation returns a
3278 3293                   * non-zero value.
3279 3294                   */
3280 3295                  ret = (*cb)(zone, data);
3281 3296                  if (ret != 0)
3282 3297                          break;
3283 3298          }
3284 3299          mutex_exit(&zonehash_lock);
3285 3300          return (ret);
3286 3301  }
3287 3302  
3288 3303  static int
3289 3304  zone_set_root(zone_t *zone, const char *upath)
3290 3305  {
3291 3306          vnode_t *vp;
3292 3307          int trycount;
3293 3308          int error = 0;
3294 3309          char *path;
3295 3310          struct pathname upn, pn;
3296 3311          size_t pathlen;
3297 3312  
3298 3313          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3299 3314                  return (error);
3300 3315  
3301 3316          pn_alloc(&pn);
3302 3317  
3303 3318          /* prevent infinite loop */
3304 3319          trycount = 10;
3305 3320          for (;;) {
3306 3321                  if (--trycount <= 0) {
3307 3322                          error = ESTALE;
3308 3323                          goto out;
3309 3324                  }
3310 3325  
3311 3326                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3312 3327                          /*
3313 3328                           * VOP_ACCESS() may cover 'vp' with a new
3314 3329                           * filesystem, if 'vp' is an autoFS vnode.
3315 3330                           * Get the new 'vp' if so.
3316 3331                           */
3317 3332                          if ((error =
3318 3333                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3319 3334                              (!vn_ismntpt(vp) ||
3320 3335                              (error = traverse(&vp)) == 0)) {
3321 3336                                  pathlen = pn.pn_pathlen + 2;
3322 3337                                  path = kmem_alloc(pathlen, KM_SLEEP);
3323 3338                                  (void) strncpy(path, pn.pn_path,
3324 3339                                      pn.pn_pathlen + 1);
3325 3340                                  path[pathlen - 2] = '/';
3326 3341                                  path[pathlen - 1] = '\0';
3327 3342                                  pn_free(&pn);
3328 3343                                  pn_free(&upn);
3329 3344  
3330 3345                                  /* Success! */
3331 3346                                  break;
3332 3347                          }
3333 3348                          VN_RELE(vp);
3334 3349                  }
3335 3350                  if (error != ESTALE)
3336 3351                          goto out;
3337 3352          }
3338 3353  
3339 3354          ASSERT(error == 0);
3340 3355          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3341 3356          zone->zone_rootpath = path;
3342 3357          zone->zone_rootpathlen = pathlen;
3343 3358          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3344 3359                  zone->zone_flags |= ZF_IS_SCRATCH;
3345 3360          return (0);
3346 3361  
3347 3362  out:
3348 3363          pn_free(&pn);
3349 3364          pn_free(&upn);
3350 3365          return (error);
3351 3366  }
3352 3367  
/*
 * True when 'c' is an ASCII letter or decimal digit.  The argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define isalnum(c) \
        (((c) >= '0' && (c) <= '9') || \
        ((c) >= 'a' && (c) <= 'z') || \
        ((c) >= 'A' && (c) <= 'Z'))
3356 3371  
3357 3372  static int
3358 3373  zone_set_name(zone_t *zone, const char *uname)
3359 3374  {
3360 3375          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3361 3376          size_t len;
3362 3377          int i, err;
3363 3378  
3364 3379          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3365 3380                  kmem_free(kname, ZONENAME_MAX);
3366 3381                  return (err);   /* EFAULT or ENAMETOOLONG */
3367 3382          }
3368 3383  
3369 3384          /* must be less than ZONENAME_MAX */
3370 3385          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3371 3386                  kmem_free(kname, ZONENAME_MAX);
3372 3387                  return (EINVAL);
3373 3388          }
3374 3389  
3375 3390          /*
3376 3391           * Name must start with an alphanumeric and must contain only
3377 3392           * alphanumerics, '-', '_' and '.'.
3378 3393           */
3379 3394          if (!isalnum(kname[0])) {
3380 3395                  kmem_free(kname, ZONENAME_MAX);
3381 3396                  return (EINVAL);
3382 3397          }
3383 3398          for (i = 1; i < len - 1; i++) {
3384 3399                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3385 3400                      kname[i] != '.') {
3386 3401                          kmem_free(kname, ZONENAME_MAX);
3387 3402                          return (EINVAL);
3388 3403                  }
3389 3404          }
3390 3405  
3391 3406          zone->zone_name = kname;
3392 3407          return (0);
3393 3408  }
3394 3409  
3395 3410  /*
3396 3411   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3397 3412   * is NULL or it points to a zone with no hostid emulation, then the machine's
3398 3413   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3399 3414   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3400 3415   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3401 3416   * hostid and the machine's hostid is invalid.
3402 3417   */
3403 3418  uint32_t
3404 3419  zone_get_hostid(zone_t *zonep)
3405 3420  {
3406 3421          unsigned long machine_hostid;
3407 3422  
3408 3423          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3409 3424                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3410 3425                          return (HW_INVALID_HOSTID);
3411 3426                  return ((uint32_t)machine_hostid);
3412 3427          }
3413 3428          return (zonep->zone_hostid);
3414 3429  }
3415 3430  
3416 3431  /*
3417 3432   * Similar to thread_create(), but makes sure the thread is in the appropriate
3418 3433   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3419 3434   */
3420 3435  /*ARGSUSED*/
3421 3436  kthread_t *
3422 3437  zthread_create(
3423 3438      caddr_t stk,
3424 3439      size_t stksize,
3425 3440      void (*proc)(),
3426 3441      void *arg,
3427 3442      size_t len,
3428 3443      pri_t pri)
3429 3444  {
3430 3445          kthread_t *t;
3431 3446          zone_t *zone = curproc->p_zone;
3432 3447          proc_t *pp = zone->zone_zsched;
3433 3448  
3434 3449          zone_hold(zone);        /* Reference to be dropped when thread exits */
3435 3450  
3436 3451          /*
3437 3452           * No-one should be trying to create threads if the zone is shutting
3438 3453           * down and there aren't any kernel threads around.  See comment
3439 3454           * in zthread_exit().
3440 3455           */
3441 3456          ASSERT(!(zone->zone_kthreads == NULL &&
3442 3457              zone_status_get(zone) >= ZONE_IS_EMPTY));
3443 3458          /*
3444 3459           * Create a thread, but don't let it run until we've finished setting
3445 3460           * things up.
3446 3461           */
3447 3462          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3448 3463          ASSERT(t->t_forw == NULL);
3449 3464          mutex_enter(&zone_status_lock);
3450 3465          if (zone->zone_kthreads == NULL) {
3451 3466                  t->t_forw = t->t_back = t;
3452 3467          } else {
3453 3468                  kthread_t *tx = zone->zone_kthreads;
3454 3469  
3455 3470                  t->t_forw = tx;
3456 3471                  t->t_back = tx->t_back;
3457 3472                  tx->t_back->t_forw = t;
3458 3473                  tx->t_back = t;
3459 3474          }
3460 3475          zone->zone_kthreads = t;
3461 3476          mutex_exit(&zone_status_lock);
3462 3477  
3463 3478          mutex_enter(&pp->p_lock);
3464 3479          t->t_proc_flag |= TP_ZTHREAD;
3465 3480          project_rele(t->t_proj);
3466 3481          t->t_proj = project_hold(pp->p_task->tk_proj);
3467 3482  
3468 3483          /*
3469 3484           * Setup complete, let it run.
3470 3485           */
3471 3486          thread_lock(t);
3472 3487          t->t_schedflag |= TS_ALLSTART;
3473 3488          setrun_locked(t);
3474 3489          thread_unlock(t);
3475 3490  
3476 3491          mutex_exit(&pp->p_lock);
3477 3492  
3478 3493          return (t);
3479 3494  }
3480 3495  
3481 3496  /*
3482 3497   * Similar to thread_exit().  Must be called by threads created via
3483 3498   * zthread_exit().
3484 3499   */
3485 3500  void
3486 3501  zthread_exit(void)
3487 3502  {
3488 3503          kthread_t *t = curthread;
3489 3504          proc_t *pp = curproc;
3490 3505          zone_t *zone = pp->p_zone;
3491 3506  
3492 3507          mutex_enter(&zone_status_lock);
3493 3508  
3494 3509          /*
3495 3510           * Reparent to p0
3496 3511           */
3497 3512          kpreempt_disable();
3498 3513          mutex_enter(&pp->p_lock);
3499 3514          t->t_proc_flag &= ~TP_ZTHREAD;
3500 3515          t->t_procp = &p0;
3501 3516          hat_thread_exit(t);
3502 3517          mutex_exit(&pp->p_lock);
3503 3518          kpreempt_enable();
3504 3519  
3505 3520          if (t->t_back == t) {
3506 3521                  ASSERT(t->t_forw == t);
3507 3522                  /*
3508 3523                   * If the zone is empty, once the thread count
3509 3524                   * goes to zero no further kernel threads can be
3510 3525                   * created.  This is because if the creator is a process
3511 3526                   * in the zone, then it must have exited before the zone
3512 3527                   * state could be set to ZONE_IS_EMPTY.
3513 3528                   * Otherwise, if the creator is a kernel thread in the
3514 3529                   * zone, the thread count is non-zero.
3515 3530                   *
3516 3531                   * This really means that non-zone kernel threads should
3517 3532                   * not create zone kernel threads.
3518 3533                   */
3519 3534                  zone->zone_kthreads = NULL;
3520 3535                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3521 3536                          zone_status_set(zone, ZONE_IS_DOWN);
3522 3537                          /*
3523 3538                           * Remove any CPU caps on this zone.
3524 3539                           */
3525 3540                          cpucaps_zone_remove(zone);
3526 3541                  }
3527 3542          } else {
3528 3543                  t->t_forw->t_back = t->t_back;
3529 3544                  t->t_back->t_forw = t->t_forw;
3530 3545                  if (zone->zone_kthreads == t)
3531 3546                          zone->zone_kthreads = t->t_forw;
3532 3547          }
3533 3548          mutex_exit(&zone_status_lock);
3534 3549          zone_rele(zone);
3535 3550          thread_exit();
3536 3551          /* NOTREACHED */
3537 3552  }
3538 3553  
3539 3554  static void
3540 3555  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3541 3556  {
3542 3557          vnode_t *oldvp;
3543 3558  
3544 3559          /* we're going to hold a reference here to the directory */
3545 3560          VN_HOLD(vp);
3546 3561  
3547 3562          /* update abs cwd/root path see c2/audit.c */
3548 3563          if (AU_AUDITING())
3549 3564                  audit_chdirec(vp, vpp);
3550 3565  
3551 3566          mutex_enter(&pp->p_lock);
3552 3567          oldvp = *vpp;
3553 3568          *vpp = vp;
3554 3569          mutex_exit(&pp->p_lock);
3555 3570          if (oldvp != NULL)
3556 3571                  VN_RELE(oldvp);
3557 3572  }
3558 3573  
3559 3574  /*
3560 3575   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3561 3576   */
3562 3577  static int
3563 3578  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3564 3579  {
3565 3580          nvpair_t *nvp = NULL;
3566 3581          boolean_t priv_set = B_FALSE;
3567 3582          boolean_t limit_set = B_FALSE;
3568 3583          boolean_t action_set = B_FALSE;
3569 3584  
3570 3585          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3571 3586                  const char *name;
3572 3587                  uint64_t ui64;
3573 3588  
3574 3589                  name = nvpair_name(nvp);
3575 3590                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3576 3591                          return (EINVAL);
3577 3592                  (void) nvpair_value_uint64(nvp, &ui64);
3578 3593                  if (strcmp(name, "privilege") == 0) {
3579 3594                          /*
3580 3595                           * Currently only privileged values are allowed, but
3581 3596                           * this may change in the future.
3582 3597                           */
3583 3598                          if (ui64 != RCPRIV_PRIVILEGED)
3584 3599                                  return (EINVAL);
3585 3600                          rv->rcv_privilege = ui64;
3586 3601                          priv_set = B_TRUE;
3587 3602                  } else if (strcmp(name, "limit") == 0) {
3588 3603                          rv->rcv_value = ui64;
3589 3604                          limit_set = B_TRUE;
3590 3605                  } else if (strcmp(name, "action") == 0) {
3591 3606                          if (ui64 != RCTL_LOCAL_NOACTION &&
3592 3607                              ui64 != RCTL_LOCAL_DENY)
3593 3608                                  return (EINVAL);
3594 3609                          rv->rcv_flagaction = ui64;
3595 3610                          action_set = B_TRUE;
3596 3611                  } else {
3597 3612                          return (EINVAL);
3598 3613                  }
3599 3614          }
3600 3615  
3601 3616          if (!(priv_set && limit_set && action_set))
3602 3617                  return (EINVAL);
3603 3618          rv->rcv_action_signal = 0;
3604 3619          rv->rcv_action_recipient = NULL;
3605 3620          rv->rcv_action_recip_pid = -1;
3606 3621          rv->rcv_firing_time = 0;
3607 3622  
3608 3623          return (0);
3609 3624  }
3610 3625  
3611 3626  /*
3612 3627   * Non-global zone version of start_init.
3613 3628   */
3614 3629  void
3615 3630  zone_start_init(void)
3616 3631  {
3617 3632          proc_t *p = ttoproc(curthread);
3618 3633          zone_t *z = p->p_zone;
3619 3634  
3620 3635          ASSERT(!INGLOBALZONE(curproc));
3621 3636  
3622 3637          /*
3623 3638           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3624 3639           * storing just the pid of init is sufficient.
3625 3640           */
3626 3641          z->zone_proc_initpid = p->p_pid;
3627 3642  
3628 3643          /*
3629 3644           * We maintain zone_boot_err so that we can return the cause of the
3630 3645           * failure back to the caller of the zone_boot syscall.
3631 3646           */
3632 3647          p->p_zone->zone_boot_err = start_init_common();
3633 3648  
3634 3649          /*
3635 3650           * We will prevent booting zones from becoming running zones if the
3636 3651           * global zone is shutting down.
3637 3652           */
3638 3653          mutex_enter(&zone_status_lock);
3639 3654          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3640 3655              ZONE_IS_SHUTTING_DOWN) {
3641 3656                  /*
3642 3657                   * Make sure we are still in the booting state-- we could have
3643 3658                   * raced and already be shutting down, or even further along.
3644 3659                   */
3645 3660                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3646 3661                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3647 3662                  }
3648 3663                  mutex_exit(&zone_status_lock);
3649 3664                  /* It's gone bad, dispose of the process */
3650 3665                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3651 3666                          mutex_enter(&p->p_lock);
3652 3667                          ASSERT(p->p_flag & SEXITLWPS);
3653 3668                          lwp_exit();
3654 3669                  }
3655 3670          } else {
3656 3671                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3657 3672                          zone_status_set(z, ZONE_IS_RUNNING);
3658 3673                  mutex_exit(&zone_status_lock);
3659 3674                  /* cause the process to return to userland. */
3660 3675                  lwp_rtt();
3661 3676          }
3662 3677  }
3663 3678  
/*
 * Argument bundle passed to zsched() through its single void * parameter.
 */
struct zsched_arg {
        zone_t *zone;           /* zone that the zsched process will serve */
        nvlist_t *nvlist;       /* rctl settings that zsched() walks and applies */
};
3668 3683  
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' points to a struct zsched_arg naming the zone to serve and the
 * nvlist of rctl values to apply to it.  In order, the function:
 *
 *   - renames the current process to "zsched", closes its files and
 *     attaches it to the zone;
 *   - reparents it to proc_init and moves its accounting (upcount, lwp,
 *     process and locked-memory counts, task/project) from the global
 *     zone to the new zone;
 *   - switches to the zone's kcred and changes cwd/root to the zone root;
 *   - creates the zone's rctl set and applies the rctls from the nvlist;
 *   - walks the zone through ZONE_IS_INITIALIZED and ZONE_IS_READY;
 *   - waits for ZONE_IS_BOOTING and launches init via newproc();
 *   - waits for ZONE_IS_DYING, abandons the process contract (if any),
 *     drops the zone_kcred reference and calls exit().
 */
static void
zsched(void *arg)
{
        struct zsched_arg *za = arg;
        proc_t *pp = curproc;
        proc_t *initp = proc_init;
        zone_t *zone = za->zone;
        cred_t *cr, *oldcred;
        rctl_set_t *set;
        rctl_alloc_gp_t *gp;
        contract_t *ct = NULL;
        task_t *tk, *oldtk;
        rctl_entity_p_t e;
        kproject_t *pj;

        nvlist_t *nvl = za->nvlist;
        nvpair_t *nvp = NULL;

        /*
         * Give the process its "zsched" identity; sizeof ("zsched") includes
         * the terminating NUL, so both names end up properly terminated.
         */
        bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
        bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
        PTOU(pp)->u_argc = 0;
        PTOU(pp)->u_argv = NULL;
        PTOU(pp)->u_envp = NULL;
        closeall(P_FINFO(pp));

        /*
         * We are this zone's "zsched" process.  As the zone isn't generally
         * visible yet we don't need to grab any locks before initializing its
         * zone_proc pointer.
         */
        zone_hold(zone);  /* this hold is released by zone_destroy() */
        zone->zone_zsched = pp;
        mutex_enter(&pp->p_lock);
        pp->p_zone = zone;
        mutex_exit(&pp->p_lock);

        /*
         * Disassociate process from its 'parent'; parent ourselves to init
         * (pid 1) and change other values as needed.
         */
        sess_create();

        mutex_enter(&pidlock);
        proc_detach(pp);
        pp->p_ppid = 1;
        pp->p_flag |= SZONETOP;
        pp->p_ancpid = 1;
        pp->p_parent = initp;
        /* Link ourselves in at the head of init's list of children. */
        pp->p_psibling = NULL;
        if (initp->p_child)
                initp->p_child->p_psibling = pp;
        pp->p_sibling = initp->p_child;
        initp->p_child = pp;

        /* Decrement what newproc() incremented. */
        upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
        /*
         * Our credentials are about to become kcred-like, so we don't care
         * about the caller's ruid.
         */
        upcount_inc(crgetruid(kcred), zone->zone_id);
        mutex_exit(&pidlock);

        /*
         * getting out of global zone, so decrement lwp and process counts
         */
        pj = pp->p_task->tk_proj;
        mutex_enter(&global_zone->zone_nlwps_lock);
        pj->kpj_nlwps -= pp->p_lwpcnt;
        global_zone->zone_nlwps -= pp->p_lwpcnt;
        pj->kpj_nprocs--;
        global_zone->zone_nprocs--;
        mutex_exit(&global_zone->zone_nlwps_lock);

        /*
         * Decrement locked memory counts on old zone and project.
         */
        mutex_enter(&global_zone->zone_mem_lock);
        global_zone->zone_locked_mem -= pp->p_locked_mem;
        pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
        mutex_exit(&global_zone->zone_mem_lock);

        /*
         * Create and join a new task in project '0' of this zone.
         *
         * We don't need to call holdlwps() since we know we're the only lwp in
         * this process.
         *
         * task_join() returns with p_lock held.
         */
        tk = task_create(0, zone);
        mutex_enter(&cpu_lock);
        oldtk = task_join(tk, 0);

        /* Re-read the project: p_task now refers to the task just joined. */
        pj = pp->p_task->tk_proj;

        mutex_enter(&zone->zone_mem_lock);
        zone->zone_locked_mem += pp->p_locked_mem;
        pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
        mutex_exit(&zone->zone_mem_lock);

        /*
         * add lwp and process counts to zsched's zone, and increment
         * project's task and process count due to the task created in
         * the above task_create.
         */
        mutex_enter(&zone->zone_nlwps_lock);
        pj->kpj_nlwps += pp->p_lwpcnt;
        pj->kpj_ntasks += 1;
        zone->zone_nlwps += pp->p_lwpcnt;
        pj->kpj_nprocs++;
        zone->zone_nprocs++;
        mutex_exit(&zone->zone_nlwps_lock);

        /* Drop the p_lock that task_join() returned with, then cpu_lock. */
        mutex_exit(&curproc->p_lock);
        mutex_exit(&cpu_lock);
        task_rele(oldtk);

        /*
         * The process was created by a process in the global zone, hence the
         * credentials are wrong.  We might as well have kcred-ish credentials.
         */
        cr = zone->zone_kcred;
        crhold(cr);
        mutex_enter(&pp->p_crlock);
        oldcred = pp->p_cred;
        pp->p_cred = cr;
        mutex_exit(&pp->p_crlock);
        crfree(oldcred);

        /*
         * Hold credentials again (for thread)
         */
        crhold(cr);

        /*
         * p_lwpcnt can't change since this is a kernel process.
         */
        crset(pp, cr);

        /*
         * Chroot
         */
        zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
        zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

        /*
         * Initialize zone's rctl set.
         */
        set = rctl_set_create();
        gp = rctl_set_init_prealloc(RCENTITY_ZONE);
        mutex_enter(&pp->p_lock);
        e.rcep_p.zone = zone;
        e.rcep_t = RCENTITY_ZONE;
        zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
        mutex_exit(&pp->p_lock);
        rctl_prealloc_destroy(gp);

        /*
         * Apply the rctls passed in to zone_create().  This is basically a list
         * assignment: all of the old values are removed and the new ones
         * inserted.  That is, if an empty list is passed in, all values are
         * removed.
         */
        while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
                rctl_dict_entry_t *rde;
                rctl_hndl_t hndl;
                char *name;
                nvlist_t **nvlarray;
                uint_t i, nelem;
                int error;      /* For ASSERT()s */

                name = nvpair_name(nvp);
                hndl = rctl_hndl_lookup(name);
                ASSERT(hndl != -1);
                rde = rctl_dict_lookup_hndl(hndl);
                ASSERT(rde != NULL);

                /*
                 * Repeatedly fetch the first value of this rctl and delete
                 * it, stopping once the first value is the system one.
                 */
                for (; /* ever */; ) {
                        rctl_val_t oval;

                        mutex_enter(&pp->p_lock);
                        error = rctl_local_get(hndl, NULL, &oval, pp);
                        mutex_exit(&pp->p_lock);
                        ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
                        ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
                        if (oval.rcv_privilege == RCPRIV_SYSTEM)
                                break;
                        mutex_enter(&pp->p_lock);
                        error = rctl_local_delete(hndl, &oval, pp);
                        mutex_exit(&pp->p_lock);
                        ASSERT(error == 0);
                }
                error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
                ASSERT(error == 0);
                /* Convert and insert each new value for this rctl. */
                for (i = 0; i < nelem; i++) {
                        rctl_val_t *nvalp;

                        nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
                        error = nvlist2rctlval(nvlarray[i], nvalp);
                        ASSERT(error == 0);
                        /*
                         * rctl_local_insert can fail if the value being
                         * inserted is a duplicate; this is OK.
                         */
                        mutex_enter(&pp->p_lock);
                        if (rctl_local_insert(hndl, nvalp, pp) != 0)
                                kmem_cache_free(rctl_val_cache, nvalp);
                        mutex_exit(&pp->p_lock);
                }
        }
        /*
         * Tell the world that we're done setting up.
         *
         * At this point we want to set the zone status to ZONE_IS_INITIALIZED
         * and atomically set the zone's processor set visibility.  Once
         * we drop pool_lock() this zone will automatically get updated
         * to reflect any future changes to the pools configuration.
         *
         * Note that after we drop the locks below (zonehash_lock in
         * particular) other operations such as a zone_getattr call can
         * now proceed and observe the zone. That is the reason for doing a
         * state transition to the INITIALIZED state.
         */
        pool_lock();
        mutex_enter(&cpu_lock);
        mutex_enter(&zonehash_lock);
        zone_uniqid(zone);
        zone_zsd_configure(zone);
        if (pool_state == POOL_ENABLED)
                zone_pset_set(zone, pool_default->pool_pset->pset_id);
        mutex_enter(&zone_status_lock);
        ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
        zone_status_set(zone, ZONE_IS_INITIALIZED);
        mutex_exit(&zone_status_lock);
        mutex_exit(&zonehash_lock);
        mutex_exit(&cpu_lock);
        pool_unlock();

        /* Now call the create callback for this key */
        zsd_apply_all_keys(zsd_apply_create, zone);

        /* The callbacks are complete. Mark ZONE_IS_READY */
        mutex_enter(&zone_status_lock);
        ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
        zone_status_set(zone, ZONE_IS_READY);
        mutex_exit(&zone_status_lock);

        /*
         * Once we see the zone transition to the ZONE_IS_BOOTING state,
         * we launch init, and set the state to running.
         */
        zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

        /*
         * The status is re-checked after the wait; init is only launched
         * when the zone is exactly in the BOOTING state.
         */
        if (zone_status_get(zone) == ZONE_IS_BOOTING) {
                id_t cid;

                /*
                 * Ok, this is a little complicated.  We need to grab the
                 * zone's pool's scheduling class ID; note that by now, we
                 * are already bound to a pool if we need to be (zoneadmd
                 * will have done that to us while we're in the READY
                 * state).  *But* the scheduling class for the zone's 'init'
                 * must be explicitly passed to newproc, which doesn't
                 * respect pool bindings.
                 *
                 * We hold the pool_lock across the call to newproc() to
                 * close the obvious race: the pool's scheduling class
                 * could change before we manage to create the LWP with
                 * classid 'cid'.
                 */
                pool_lock();
                /*
                 * Preference order: the zone's own default class, then the
                 * class of the zone's pool, then the system-wide defaultcid.
                 */
                if (zone->zone_defaultcid > 0)
                        cid = zone->zone_defaultcid;
                else
                        cid = pool_get_class(zone->zone_pool);
                if (cid == -1)
                        cid = defaultcid;

                /*
                 * If this fails, zone_boot will ultimately fail.  The
                 * state of the zone will be set to SHUTTING_DOWN-- userland
                 * will have to tear down the zone, and fail, or try again.
                 */
                if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
                    minclsyspri - 1, &ct, 0)) != 0) {
                        mutex_enter(&zone_status_lock);
                        zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
                        mutex_exit(&zone_status_lock);
                } else {
                        zone->zone_boot_time = gethrestime_sec();
                }

                pool_unlock();
        }

        /*
         * Wait for zone_destroy() to be called.  This is what we spend
         * most of our life doing.
         */
        zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

        /*
         * ct is still NULL unless the newproc() call above was reached and
         * stored a contract through &ct.  Note that the unbraced 'if' below
         * governs the VERIFY() statement that follows the comment.
         */
        if (ct)
                /*
                 * At this point the process contract should be empty.
                 * (Though if it isn't, it's not the end of the world.)
                 */
                VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

        /*
         * Allow kcred to be freed when all referring processes
         * (including this one) go away.  We can't just do this in
         * zone_free because we need to wait for the zone_cred_ref to
         * drop to 0 before calling zone_free, and the existence of
         * zone_kcred will prevent that.  Thus, we call crfree here to
         * balance the crdup in zone_create.  The crhold calls earlier
         * in zsched will be dropped when the thread and process exit.
         */
        crfree(zone->zone_kcred);
        zone->zone_kcred = NULL;

        exit(CLD_EXITED, 0);
}
4000 4015  
4001 4016  /*
4002 4017   * Helper function to determine if there are any submounts of the
4003 4018   * provided path.  Used to make sure the zone doesn't "inherit" any
4004 4019   * mounts from before it is created.
4005 4020   */
4006 4021  static uint_t
4007 4022  zone_mount_count(const char *rootpath)
4008 4023  {
4009 4024          vfs_t *vfsp;
4010 4025          uint_t count = 0;
4011 4026          size_t rootpathlen = strlen(rootpath);
4012 4027  
4013 4028          /*
4014 4029           * Holding zonehash_lock prevents race conditions with
4015 4030           * vfs_list_add()/vfs_list_remove() since we serialize with
4016 4031           * zone_find_by_path().
4017 4032           */
4018 4033          ASSERT(MUTEX_HELD(&zonehash_lock));
4019 4034          /*
4020 4035           * The rootpath must end with a '/'
4021 4036           */
4022 4037          ASSERT(rootpath[rootpathlen - 1] == '/');
4023 4038  
4024 4039          /*
4025 4040           * This intentionally does not count the rootpath itself if that
4026 4041           * happens to be a mount point.
4027 4042           */
4028 4043          vfs_list_read_lock();
4029 4044          vfsp = rootvfs;
4030 4045          do {
4031 4046                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4032 4047                      rootpathlen) == 0)
4033 4048                          count++;
4034 4049                  vfsp = vfsp->vfs_next;
4035 4050          } while (vfsp != rootvfs);
4036 4051          vfs_list_unlock();
4037 4052          return (count);
4038 4053  }
4039 4054  
4040 4055  /*
4041 4056   * Helper function to make sure that a zone created on 'rootpath'
4042 4057   * wouldn't end up containing other zones' rootpaths.
4043 4058   */
4044 4059  static boolean_t
4045 4060  zone_is_nested(const char *rootpath)
4046 4061  {
4047 4062          zone_t *zone;
4048 4063          size_t rootpathlen = strlen(rootpath);
4049 4064          size_t len;
4050 4065  
4051 4066          ASSERT(MUTEX_HELD(&zonehash_lock));
4052 4067  
4053 4068          /*
4054 4069           * zone_set_root() appended '/' and '\0' at the end of rootpath
4055 4070           */
4056 4071          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4057 4072              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4058 4073                  return (B_TRUE);
4059 4074  
4060 4075          for (zone = list_head(&zone_active); zone != NULL;
4061 4076              zone = list_next(&zone_active, zone)) {
4062 4077                  if (zone == global_zone)
4063 4078                          continue;
4064 4079                  len = strlen(zone->zone_rootpath);
4065 4080                  if (strncmp(rootpath, zone->zone_rootpath,
4066 4081                      MIN(rootpathlen, len)) == 0)
4067 4082                          return (B_TRUE);
4068 4083          }
4069 4084          return (B_FALSE);
4070 4085  }
4071 4086  
4072 4087  static int
4073 4088  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4074 4089      size_t zone_privssz)
4075 4090  {
4076 4091          priv_set_t *privs;
4077 4092  
4078 4093          if (zone_privssz < sizeof (priv_set_t))
4079 4094                  return (ENOMEM);
4080 4095  
4081 4096          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4082 4097  
4083 4098          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4084 4099                  kmem_free(privs, sizeof (priv_set_t));
4085 4100                  return (EFAULT);
4086 4101          }
4087 4102  
4088 4103          zone->zone_privset = privs;
4089 4104          return (0);
4090 4105  }
4091 4106  
4092 4107  /*
4093 4108   * We make creative use of nvlists to pass in rctls from userland.  The list is
4094 4109   * a list of the following structures:
4095 4110   *
4096 4111   * (name = rctl_name, value = nvpair_list_array)
4097 4112   *
4098 4113   * Where each element of the nvpair_list_array is of the form:
4099 4114   *
4100 4115   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4101 4116   *      (name = "limit", value = uint64_t),
4102 4117   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4103 4118   */
4104 4119  static int
4105 4120  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4106 4121  {
4107 4122          nvpair_t *nvp = NULL;
4108 4123          nvlist_t *nvl = NULL;
4109 4124          char *kbuf;
4110 4125          int error;
4111 4126          rctl_val_t rv;
4112 4127  
4113 4128          *nvlp = NULL;
4114 4129  
4115 4130          if (buflen == 0)
4116 4131                  return (0);
4117 4132  
4118 4133          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4119 4134                  return (ENOMEM);
4120 4135          if (copyin(ubuf, kbuf, buflen)) {
4121 4136                  error = EFAULT;
4122 4137                  goto out;
4123 4138          }
4124 4139          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4125 4140                  /*
4126 4141                   * nvl may have been allocated/free'd, but the value set to
4127 4142                   * non-NULL, so we reset it here.
4128 4143                   */
4129 4144                  nvl = NULL;
4130 4145                  error = EINVAL;
4131 4146                  goto out;
4132 4147          }
4133 4148          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4134 4149                  rctl_dict_entry_t *rde;
4135 4150                  rctl_hndl_t hndl;
4136 4151                  nvlist_t **nvlarray;
4137 4152                  uint_t i, nelem;
4138 4153                  char *name;
4139 4154  
4140 4155                  error = EINVAL;
4141 4156                  name = nvpair_name(nvp);
4142 4157                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4143 4158                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4144 4159                          goto out;
4145 4160                  }
4146 4161                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4147 4162                          goto out;
4148 4163                  }
4149 4164                  rde = rctl_dict_lookup_hndl(hndl);
4150 4165                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4151 4166                  ASSERT(error == 0);
4152 4167                  for (i = 0; i < nelem; i++) {
4153 4168                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4154 4169                                  goto out;
4155 4170                  }
4156 4171                  if (rctl_invalid_value(rde, &rv)) {
4157 4172                          error = EINVAL;
4158 4173                          goto out;
4159 4174                  }
4160 4175          }
4161 4176          error = 0;
4162 4177          *nvlp = nvl;
4163 4178  out:
4164 4179          kmem_free(kbuf, buflen);
4165 4180          if (error && nvl != NULL)
4166 4181                  nvlist_free(nvl);
4167 4182          return (error);
4168 4183  }
4169 4184  
4170 4185  int
4171 4186  zone_create_error(int er_error, int er_ext, int *er_out) {
4172 4187          if (er_out != NULL) {
4173 4188                  if (copyout(&er_ext, er_out, sizeof (int))) {
4174 4189                          return (set_errno(EFAULT));
4175 4190                  }
4176 4191          }
4177 4192          return (set_errno(er_error));
4178 4193  }
4179 4194  
4180 4195  static int
4181 4196  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4182 4197  {
4183 4198          ts_label_t *tsl;
4184 4199          bslabel_t blab;
4185 4200  
4186 4201          /* Get label from user */
4187 4202          if (copyin(lab, &blab, sizeof (blab)) != 0)
4188 4203                  return (EFAULT);
4189 4204          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4190 4205          if (tsl == NULL)
4191 4206                  return (ENOMEM);
4192 4207  
4193 4208          zone->zone_slabel = tsl;
4194 4209          return (0);
4195 4210  }
4196 4211  
4197 4212  /*
4198 4213   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4199 4214   */
4200 4215  static int
4201 4216  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4202 4217  {
4203 4218          char *kbuf;
4204 4219          char *dataset, *next;
4205 4220          zone_dataset_t *zd;
4206 4221          size_t len;
4207 4222  
4208 4223          if (ubuf == NULL || buflen == 0)
4209 4224                  return (0);
4210 4225  
4211 4226          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4212 4227                  return (ENOMEM);
4213 4228  
4214 4229          if (copyin(ubuf, kbuf, buflen) != 0) {
4215 4230                  kmem_free(kbuf, buflen);
4216 4231                  return (EFAULT);
4217 4232          }
4218 4233  
4219 4234          dataset = next = kbuf;
4220 4235          for (;;) {
4221 4236                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4222 4237  
4223 4238                  next = strchr(dataset, ',');
4224 4239  
4225 4240                  if (next == NULL)
4226 4241                          len = strlen(dataset);
4227 4242                  else
4228 4243                          len = next - dataset;
4229 4244  
4230 4245                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4231 4246                  bcopy(dataset, zd->zd_dataset, len);
4232 4247                  zd->zd_dataset[len] = '\0';
4233 4248  
4234 4249                  list_insert_head(&zone->zone_datasets, zd);
4235 4250  
4236 4251                  if (next == NULL)
4237 4252                          break;
4238 4253  
4239 4254                  dataset = next + 1;
4240 4255          }
4241 4256  
4242 4257          kmem_free(kbuf, buflen);
4243 4258          return (0);
4244 4259  }
4245 4260  
4246 4261  /*
4247 4262   * System call to create/initialize a new zone named 'zone_name', rooted
4248 4263   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4249 4264   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4250 4265   * with labeling set by 'match', 'doi', and 'label'.
4251 4266   *
4252 4267   * If extended error is non-null, we may use it to return more detailed
4253 4268   * error information.
4254 4269   */
4255 4270  static zoneid_t
4256 4271  zone_create(const char *zone_name, const char *zone_root,
4257 4272      const priv_set_t *zone_privs, size_t zone_privssz,
4258 4273      caddr_t rctlbuf, size_t rctlbufsz,
4259 4274      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4260 4275      int match, uint32_t doi, const bslabel_t *label,
4261 4276      int flags)
4262 4277  {
4263 4278          struct zsched_arg zarg;
4264 4279          nvlist_t *rctls = NULL;
4265 4280          proc_t *pp = curproc;
4266 4281          zone_t *zone, *ztmp;
4267 4282          zoneid_t zoneid;
4268 4283          int error;
4269 4284          int error2 = 0;
4270 4285          char *str;
4271 4286          cred_t *zkcr;
4272 4287          boolean_t insert_label_hash;
4273 4288  
4274 4289          if (secpolicy_zone_config(CRED()) != 0)
4275 4290                  return (set_errno(EPERM));
4276 4291  
4277 4292          /* can't boot zone from within chroot environment */
4278 4293          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4279 4294                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4280 4295                      extended_error));
4281 4296  
4282 4297          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4283 4298          zoneid = zone->zone_id = id_alloc(zoneid_space);
4284 4299          zone->zone_status = ZONE_IS_UNINITIALIZED;
4285 4300          zone->zone_pool = pool_default;
4286 4301          zone->zone_pool_mod = gethrtime();
4287 4302          zone->zone_psetid = ZONE_PS_INVAL;
4288 4303          zone->zone_ncpus = 0;
4289 4304          zone->zone_ncpus_online = 0;
4290 4305          zone->zone_restart_init = B_TRUE;
4291 4306          zone->zone_brand = &native_brand;
4292 4307          zone->zone_initname = NULL;
4293 4308          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4294 4309          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4295 4310          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4296 4311          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4297 4312          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4298 4313              offsetof(zone_ref_t, zref_linkage));
4299 4314          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4300 4315              offsetof(struct zsd_entry, zsd_linkage));
4301 4316          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4302 4317              offsetof(zone_dataset_t, zd_linkage));
4303 4318          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4304 4319              offsetof(zone_dl_t, zdl_linkage));
4305 4320          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4306 4321          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4307 4322  
4308 4323          if (flags & ZCF_NET_EXCL) {
4309 4324                  zone->zone_flags |= ZF_NET_EXCL;
4310 4325          }
4311 4326  
4312 4327          if ((error = zone_set_name(zone, zone_name)) != 0) {
4313 4328                  zone_free(zone);
4314 4329                  return (zone_create_error(error, 0, extended_error));
4315 4330          }
4316 4331  
4317 4332          if ((error = zone_set_root(zone, zone_root)) != 0) {
4318 4333                  zone_free(zone);
4319 4334                  return (zone_create_error(error, 0, extended_error));
4320 4335          }
4321 4336          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4322 4337                  zone_free(zone);
4323 4338                  return (zone_create_error(error, 0, extended_error));
4324 4339          }
4325 4340  
4326 4341          /* initialize node name to be the same as zone name */
4327 4342          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4328 4343          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4329 4344          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4330 4345  
4331 4346          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4332 4347          zone->zone_domain[0] = '\0';
4333 4348          zone->zone_hostid = HW_INVALID_HOSTID;
4334 4349          zone->zone_shares = 1;
4335 4350          zone->zone_shmmax = 0;
4336 4351          zone->zone_ipc.ipcq_shmmni = 0;
4337 4352          zone->zone_ipc.ipcq_semmni = 0;
4338 4353          zone->zone_ipc.ipcq_msgmni = 0;
4339 4354          zone->zone_bootargs = NULL;
4340 4355          zone->zone_fs_allowed = NULL;
4341 4356          zone->zone_initname =
4342 4357              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4343 4358          (void) strcpy(zone->zone_initname, zone_default_initname);
4344 4359          zone->zone_nlwps = 0;
4345 4360          zone->zone_nlwps_ctl = INT_MAX;
4346 4361          zone->zone_nprocs = 0;
4347 4362          zone->zone_nprocs_ctl = INT_MAX;
4348 4363          zone->zone_locked_mem = 0;
4349 4364          zone->zone_locked_mem_ctl = UINT64_MAX;
4350 4365          zone->zone_max_swap = 0;
4351 4366          zone->zone_max_swap_ctl = UINT64_MAX;
4352 4367          zone->zone_max_lofi = 0;
4353 4368          zone->zone_max_lofi_ctl = UINT64_MAX;
4354 4369          zone0.zone_lockedmem_kstat = NULL;
4355 4370          zone0.zone_swapresv_kstat = NULL;
4356 4371  
4357 4372          /*
4358 4373           * Zsched initializes the rctls.
4359 4374           */
4360 4375          zone->zone_rctls = NULL;
4361 4376  
4362 4377          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4363 4378                  zone_free(zone);
4364 4379                  return (zone_create_error(error, 0, extended_error));
4365 4380          }
4366 4381  
4367 4382          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4368 4383                  zone_free(zone);
4369 4384                  return (set_errno(error));
4370 4385          }
4371 4386  
4372 4387          /*
4373 4388           * Read in the trusted system parameters:
4374 4389           * match flag and sensitivity label.
4375 4390           */
4376 4391          zone->zone_match = match;
4377 4392          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4378 4393                  /* Fail if requested to set doi to anything but system's doi */
4379 4394                  if (doi != 0 && doi != default_doi) {
4380 4395                          zone_free(zone);
4381 4396                          return (set_errno(EINVAL));
4382 4397                  }
4383 4398                  /* Always apply system's doi to the zone */
4384 4399                  error = zone_set_label(zone, label, default_doi);
4385 4400                  if (error != 0) {
4386 4401                          zone_free(zone);
4387 4402                          return (set_errno(error));
4388 4403                  }
4389 4404                  insert_label_hash = B_TRUE;
4390 4405          } else {
4391 4406                  /* all zones get an admin_low label if system is not labeled */
4392 4407                  zone->zone_slabel = l_admin_low;
4393 4408                  label_hold(l_admin_low);
4394 4409                  insert_label_hash = B_FALSE;
4395 4410          }
4396 4411  
4397 4412          /*
4398 4413           * Stop all lwps since that's what normally happens as part of fork().
4399 4414           * This needs to happen before we grab any locks to avoid deadlock
4400 4415           * (another lwp in the process could be waiting for the held lock).
4401 4416           */
4402 4417          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4403 4418                  zone_free(zone);
4404 4419                  if (rctls)
4405 4420                          nvlist_free(rctls);
4406 4421                  return (zone_create_error(error, 0, extended_error));
4407 4422          }
4408 4423  
4409 4424          if (block_mounts(zone) == 0) {
4410 4425                  mutex_enter(&pp->p_lock);
4411 4426                  if (curthread != pp->p_agenttp)
4412 4427                          continuelwps(pp);
4413 4428                  mutex_exit(&pp->p_lock);
4414 4429                  zone_free(zone);
4415 4430                  if (rctls)
4416 4431                          nvlist_free(rctls);
4417 4432                  return (zone_create_error(error, 0, extended_error));
4418 4433          }
4419 4434  
4420 4435          /*
4421 4436           * Set up credential for kernel access.  After this, any errors
4422 4437           * should go through the dance in errout rather than calling
4423 4438           * zone_free directly.
4424 4439           */
4425 4440          zone->zone_kcred = crdup(kcred);
4426 4441          crsetzone(zone->zone_kcred, zone);
4427 4442          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4428 4443          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4429 4444          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4430 4445          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4431 4446  
4432 4447          mutex_enter(&zonehash_lock);
4433 4448          /*
4434 4449           * Make sure zone doesn't already exist.
4435 4450           *
4436 4451           * If the system and zone are labeled,
4437 4452           * make sure no other zone exists that has the same label.
4438 4453           */
4439 4454          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4440 4455              (insert_label_hash &&
4441 4456              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4442 4457                  zone_status_t status;
4443 4458  
4444 4459                  status = zone_status_get(ztmp);
4445 4460                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4446 4461                          error = EEXIST;
4447 4462                  else
4448 4463                          error = EBUSY;
4449 4464  
4450 4465                  if (insert_label_hash)
4451 4466                          error2 = ZE_LABELINUSE;
4452 4467  
4453 4468                  goto errout;
4454 4469          }
4455 4470  
4456 4471          /*
4457 4472           * Don't allow zone creations which would cause one zone's rootpath to
4458 4473           * be accessible from that of another (non-global) zone.
4459 4474           */
4460 4475          if (zone_is_nested(zone->zone_rootpath)) {
4461 4476                  error = EBUSY;
4462 4477                  goto errout;
4463 4478          }
4464 4479  
4465 4480          ASSERT(zonecount != 0);         /* check for leaks */
4466 4481          if (zonecount + 1 > maxzones) {
4467 4482                  error = ENOMEM;
4468 4483                  goto errout;
4469 4484          }
4470 4485  
4471 4486          if (zone_mount_count(zone->zone_rootpath) != 0) {
4472 4487                  error = EBUSY;
4473 4488                  error2 = ZE_AREMOUNTS;
4474 4489                  goto errout;
4475 4490          }
4476 4491  
4477 4492          /*
4478 4493           * Zone is still incomplete, but we need to drop all locks while
4479 4494           * zsched() initializes this zone's kernel process.  We
4480 4495           * optimistically add the zone to the hashtable and associated
4481 4496           * lists so a parallel zone_create() doesn't try to create the
4482 4497           * same zone.
4483 4498           */
4484 4499          zonecount++;
4485 4500          (void) mod_hash_insert(zonehashbyid,
4486 4501              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4487 4502              (mod_hash_val_t)(uintptr_t)zone);
4488 4503          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4489 4504          (void) strcpy(str, zone->zone_name);
4490 4505          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4491 4506              (mod_hash_val_t)(uintptr_t)zone);
4492 4507          if (insert_label_hash) {
4493 4508                  (void) mod_hash_insert(zonehashbylabel,
4494 4509                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4495 4510                  zone->zone_flags |= ZF_HASHED_LABEL;
4496 4511          }
4497 4512  
4498 4513          /*
4499 4514           * Insert into active list.  At this point there are no 'hold's
4500 4515           * on the zone, but everyone else knows not to use it, so we can
4501 4516           * continue to use it.  zsched() will do a zone_hold() if the
4502 4517           * newproc() is successful.
4503 4518           */
4504 4519          list_insert_tail(&zone_active, zone);
4505 4520          mutex_exit(&zonehash_lock);
4506 4521  
4507 4522          zarg.zone = zone;
4508 4523          zarg.nvlist = rctls;
4509 4524          /*
4510 4525           * The process, task, and project rctls are probably wrong;
4511 4526           * we need an interface to get the default values of all rctls,
4512 4527           * and initialize zsched appropriately.  I'm not sure that that
4513 4528           * makes much of a difference, though.
4514 4529           */
4515 4530          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4516 4531          if (error != 0) {
4517 4532                  /*
4518 4533                   * We need to undo all globally visible state.
4519 4534                   */
4520 4535                  mutex_enter(&zonehash_lock);
4521 4536                  list_remove(&zone_active, zone);
4522 4537                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4523 4538                          ASSERT(zone->zone_slabel != NULL);
4524 4539                          (void) mod_hash_destroy(zonehashbylabel,
4525 4540                              (mod_hash_key_t)zone->zone_slabel);
4526 4541                  }
4527 4542                  (void) mod_hash_destroy(zonehashbyname,
4528 4543                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4529 4544                  (void) mod_hash_destroy(zonehashbyid,
4530 4545                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4531 4546                  ASSERT(zonecount > 1);
4532 4547                  zonecount--;
4533 4548                  goto errout;
4534 4549          }
4535 4550  
4536 4551          /*
4537 4552           * Zone creation can't fail from now on.
4538 4553           */
4539 4554  
4540 4555          /*
4541 4556           * Create zone kstats
4542 4557           */
4543 4558          zone_kstat_create(zone);
4544 4559  
4545 4560          /*
4546 4561           * Let the other lwps continue.
4547 4562           */
4548 4563          mutex_enter(&pp->p_lock);
4549 4564          if (curthread != pp->p_agenttp)
4550 4565                  continuelwps(pp);
4551 4566          mutex_exit(&pp->p_lock);
4552 4567  
4553 4568          /*
4554 4569           * Wait for zsched to finish initializing the zone.
4555 4570           */
4556 4571          zone_status_wait(zone, ZONE_IS_READY);
4557 4572          /*
4558 4573           * The zone is fully visible, so we can let mounts progress.
4559 4574           */
4560 4575          resume_mounts(zone);
4561 4576          if (rctls)
4562 4577                  nvlist_free(rctls);
4563 4578  
4564 4579          return (zoneid);
4565 4580  
4566 4581  errout:
4567 4582          mutex_exit(&zonehash_lock);
4568 4583          /*
4569 4584           * Let the other lwps continue.
4570 4585           */
4571 4586          mutex_enter(&pp->p_lock);
4572 4587          if (curthread != pp->p_agenttp)
4573 4588                  continuelwps(pp);
4574 4589          mutex_exit(&pp->p_lock);
4575 4590  
4576 4591          resume_mounts(zone);
4577 4592          if (rctls)
4578 4593                  nvlist_free(rctls);
4579 4594          /*
4580 4595           * There is currently one reference to the zone, a cred_ref from
4581 4596           * zone_kcred.  To free the zone, we call crfree, which will call
4582 4597           * zone_cred_rele, which will call zone_free.
4583 4598           */
4584 4599          ASSERT(zone->zone_cred_ref == 1);
4585 4600          ASSERT(zone->zone_kcred->cr_ref == 1);
4586 4601          ASSERT(zone->zone_ref == 0);
4587 4602          zkcr = zone->zone_kcred;
4588 4603          zone->zone_kcred = NULL;
4589 4604          crfree(zkcr);                           /* triggers call to zone_free */
4590 4605          return (zone_create_error(error, error2, extended_error));
4591 4606  }
4592 4607  
4593 4608  /*
4594 4609   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4595 4610   * the heavy lifting.  No initname is passed in here: the only argument is
4596 4611   * the ID of the zone to boot.  zone_create() sets zone->zone_initname to a
4597 4612   * copy of the system default, which is stored at zone_default_initname.
            *
            * The zone must be in the ZONE_IS_READY state.  It is moved to
            * ZONE_IS_BOOTING and we then wait for it to reach ZONE_IS_RUNNING.
            *
            * Returns 0 on success.  Otherwise fails through set_errno() with:
            *   EPERM   secpolicy_zone_config() rejected the caller's credentials
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], no zone
            *           with that ID was found, or the zone is not ZONE_IS_READY
            *   EINTR   zone_status_wait_sig() returned 0
            *   other   the nonzero value found in zone->zone_boot_err
4598 4613   */
4599 4614  static int
4600 4615  zone_boot(zoneid_t zoneid)
4601 4616  {
4602 4617          int err;
4603 4618          zone_t *zone;
4604 4619  
4605 4620          if (secpolicy_zone_config(CRED()) != 0)
4606 4621                  return (set_errno(EPERM));
4607 4622          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4608 4623                  return (set_errno(EINVAL));
4609 4624  
4610 4625          mutex_enter(&zonehash_lock);
4611 4626          /*
4612 4627           * Look for zone under hash lock to prevent races with calls to
4613 4628           * zone_shutdown, zone_destroy, etc.
4614 4629           */
4615 4630          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4616 4631                  mutex_exit(&zonehash_lock);
4617 4632                  return (set_errno(EINVAL));
4618 4633          }
4619 4634  
                   /*
                    * Only a zone that is exactly ZONE_IS_READY may be booted.  The
                    * check and the transition to ZONE_IS_BOOTING are both made while
                    * holding zone_status_lock (and zonehash_lock).
                    */
4620 4635          mutex_enter(&zone_status_lock);
4621 4636          if (zone_status_get(zone) != ZONE_IS_READY) {
4622 4637                  mutex_exit(&zone_status_lock);
4623 4638                  mutex_exit(&zonehash_lock);
4624 4639                  return (set_errno(EINVAL));
4625 4640          }
4626 4641          zone_status_set(zone, ZONE_IS_BOOTING);
4627 4642          mutex_exit(&zone_status_lock);
4628 4643  
4629 4644          zone_hold(zone);        /* so we can use the zone_t later */
4630 4645          mutex_exit(&zonehash_lock);
4631 4646  
                   /* A zero return from the wait is reported as EINTR. */
4632 4647          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4633 4648                  zone_rele(zone);
4634 4649                  return (set_errno(EINTR));
4635 4650          }
4636 4651  
4637 4652          /*
4638 4653           * Boot (starting init) might have failed, in which case the zone
4639 4654           * will go to the SHUTTING_DOWN state; an appropriate errno will
4640 4655           * be placed in zone->zone_boot_err, and so we return that.
                    * The value is read before the hold taken above is released.
4641 4656           */
4642 4657          err = zone->zone_boot_err;
4643 4658          zone_rele(zone);
4644 4659          return (err ? set_errno(err) : 0);
4645 4660  }
4646 4661  
4647 4662  /*
4648 4663   * Kills all user processes in the zone, waiting for them all to exit
4649 4664   * before returning.
            *
            * The caller must not hold zonehash_lock (this is ASSERTed).
            *
            * Returns 0 once the wait for ZONE_IS_EMPTY completes, or EINTR if the
            * wait returned 0.  The errno value is returned directly; it is not
            * passed through set_errno() here.
4650 4665   */
4651 4666  static int
4652 4667  zone_empty(zone_t *zone)
4653 4668  {
4654 4669          int waitstatus;
4655 4670  
4656 4671          /*
4657 4672           * We need to drop zonehash_lock before killing all
4658 4673           * processes, otherwise we'll deadlock with zone_find_*
4659 4674           * which can be called from the exit path.
4660 4675           */
4661 4676          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
                   /*
                    * Each pass waits for ZONE_IS_EMPTY with a deadline of
                    * ddi_get_lbolt() + hz.  Whenever the wait returns -1 (presumably
                    * a timeout -- confirm against zone_status_timedwait_sig()), we
                    * call killall() for the zone and wait again.
                    */
4662 4677          while ((waitstatus = zone_status_timedwait_sig(zone,
4663 4678              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4664 4679                  killall(zone->zone_id);
4665 4680          }
4666 4681          /*
4667 4682           * return EINTR if we were signaled
4668 4683           */
4669 4684          if (waitstatus == 0)
4670 4685                  return (EINTR);
4671 4686          return (0);
4672 4687  }
4673 4688  
4674 4689  /*
4675 4690   * This function implements the policy for zone visibility.
4676 4691   *
4677 4692   * In standard Solaris, a non-global zone can only see itself.
            * A caller running in the global zone can see every zone.
4678 4693   *
4679 4694   * In Trusted Extensions, a labeled zone can look up any zone whose label
4680 4695   * it dominates. For this test, the label of the global zone is treated as
4681 4696   * admin_high so it is special-cased instead of being checked for dominance.
            * A zone flagged ZF_IS_SCRATCH is never made visible by the label rule.
4682 4697   *
4683 4698   * Returns B_TRUE if zone attributes are viewable, B_FALSE otherwise.
4684 4699   */
4685 4700  static boolean_t
4686 4701  zone_list_access(zone_t *zone)
4687 4702  {
4688 4703  
                   /* The global zone sees everything; any zone sees itself. */
4689 4704          if (curproc->p_zone == global_zone ||
4690 4705              curproc->p_zone == zone) {
4691 4706                  return (B_TRUE);
4692 4707          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4693 4708                  bslabel_t *curproc_label;
4694 4709                  bslabel_t *zone_label;
4695 4710  
4696 4711                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4697 4712                  zone_label = label2bslabel(zone->zone_slabel);
4698 4713  
                           /*
                            * The global zone is excluded here rather than compared by
                            * label; any other zone is visible only if the caller's
                            * label dominates it.
                            */
4699 4714                  if (zone->zone_id != GLOBAL_ZONEID &&
4700 4715                      bldominates(curproc_label, zone_label)) {
4701 4716                          return (B_TRUE);
4702 4717                  } else {
4703 4718                          return (B_FALSE);
4704 4719                  }
4705 4720          } else {
4706 4721                  return (B_FALSE);
4707 4722          }
4708 4723  }
4709 4724  
4710 4725  /*
4711 4726   * Systemcall to start the zone's halt sequence.  By the time this
4712 4727   * function successfully returns, all user processes and kernel threads
4713 4728   * executing in it will have exited, ZSD shutdown callbacks executed,
4714 4729   * and the zone status set to ZONE_IS_DOWN.
4715 4730   *
4716 4731   * It is possible that the call will interrupt itself if the caller is the
4717 4732   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
            *
            * Returns 0 on success, including the case where the zone had already
            * reached ZONE_IS_DOWN or later.  Otherwise fails through set_errno() with:
            *   EPERM   secpolicy_zone_config() rejected the caller's credentials
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], no zone
            *           with that ID was found, or the zone has not yet reached
            *           ZONE_IS_READY
            *   EINTR   block_mounts() returned 0, pool_lock_intr() failed, or
            *           zone_status_wait_sig() returned 0
            *   other   whatever nonzero value zone_empty() returned
4718 4733   */
4719 4734  static int
4720 4735  zone_shutdown(zoneid_t zoneid)
4721 4736  {
4722 4737          int error;
4723 4738          zone_t *zone;
4724 4739          zone_status_t status;
4725 4740  
4726 4741          if (secpolicy_zone_config(CRED()) != 0)
4727 4742                  return (set_errno(EPERM));
4728 4743          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4729 4744                  return (set_errno(EINVAL));
4730 4745  
4731 4746          mutex_enter(&zonehash_lock);
4732 4747          /*
4733 4748           * Look for zone under hash lock to prevent races with other
4734 4749           * calls to zone_shutdown and zone_destroy.
4735 4750           */
4736 4751          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4737 4752                  mutex_exit(&zonehash_lock);
4738 4753                  return (set_errno(EINVAL));
4739 4754          }
4740 4755  
4741 4756          /*
4742 4757           * We have to drop zonehash_lock before calling block_mounts.
4743 4758           * Hold the zone so we can continue to use the zone_t.
4744 4759           */
4745 4760          zone_hold(zone);
4746 4761          mutex_exit(&zonehash_lock);
4747 4762  
4748 4763          /*
4749 4764           * Block mounts so that VFS_MOUNT() can get an accurate view of
4750 4765           * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4751 4766           *
4752 4767           * e.g. NFS can fail the mount if it determines that the zone
4753 4768           * has already begun the shutdown sequence.
4754 4769           *
                    * A zero return from block_mounts() is reported as EINTR.
4755 4770           */
4756 4771          if (block_mounts(zone) == 0) {
4757 4772                  zone_rele(zone);
4758 4773                  return (set_errno(EINTR));
4759 4774          }
4760 4775  
                   /*
                    * From here until resume_mounts() the status is examined and
                    * changed with zonehash_lock and zone_status_lock held, taken in
                    * that order.
                    */
4761 4776          mutex_enter(&zonehash_lock);
4762 4777          mutex_enter(&zone_status_lock);
4763 4778          status = zone_status_get(zone);
4764 4779          /*
4765 4780           * Fail if the zone isn't fully initialized yet.
4766 4781           */
4767 4782          if (status < ZONE_IS_READY) {
4768 4783                  mutex_exit(&zone_status_lock);
4769 4784                  mutex_exit(&zonehash_lock);
4770 4785                  resume_mounts(zone);
4771 4786                  zone_rele(zone);
4772 4787                  return (set_errno(EINVAL));
4773 4788          }
4774 4789          /*
4775 4790           * If conditions required for zone_shutdown() to return have been met,
4776 4791           * return success.
4777 4792           */
4778 4793          if (status >= ZONE_IS_DOWN) {
4779 4794                  mutex_exit(&zone_status_lock);
4780 4795                  mutex_exit(&zonehash_lock);
4781 4796                  resume_mounts(zone);
4782 4797                  zone_rele(zone);
4783 4798                  return (0);
4784 4799          }
4785 4800          /*
4786 4801           * If zone_shutdown() hasn't been called before, go through the motions.
4787 4802           * If it has, there's nothing to do but wait for the kernel threads to
4788 4803           * drain.
4789 4804           */
4790 4805          if (status < ZONE_IS_EMPTY) {
4791 4806                  uint_t ntasks;
4792 4807  
                           /* zone_ntasks is sampled under zone_lock. */
4793 4808                  mutex_enter(&zone->zone_lock);
4794 4809                  if ((ntasks = zone->zone_ntasks) != 1) {
4795 4810                          /*
4796 4811                           * There's still stuff running.
4797 4812                           */
4798 4813                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4799 4814                  }
4800 4815                  mutex_exit(&zone->zone_lock);
4801 4816                  if (ntasks == 1) {
4802 4817                          /*
4803 4818                           * The only way to create another task is through
4804 4819                           * zone_enter(), which will block until we drop
4805 4820                           * zonehash_lock.  The zone is empty.
4806 4821                           */
4807 4822                          if (zone->zone_kthreads == NULL) {
4808 4823                                  /*
4809 4824                                   * Skip ahead to ZONE_IS_DOWN
4810 4825                                   */
4811 4826                                  zone_status_set(zone, ZONE_IS_DOWN);
4812 4827                          } else {
4813 4828                                  zone_status_set(zone, ZONE_IS_EMPTY);
4814 4829                          }
4815 4830                  }
4816 4831          }
4817 4832          mutex_exit(&zone_status_lock);
4818 4833          mutex_exit(&zonehash_lock);
4819 4834          resume_mounts(zone);
4820 4835  
                   /*
                    * The assignment inside the condition is intentional: a nonzero
                    * return from zone_empty() is the errno to report.
                    */
4821 4836          if (error = zone_empty(zone)) {
4822 4837                  zone_rele(zone);
4823 4838                  return (set_errno(error));
4824 4839          }
4825 4840          /*
4826 4841           * After the zone status goes to ZONE_IS_DOWN this zone will no
4827 4842           * longer be notified of changes to the pools configuration, so
4828 4843           * in order to not end up with a stale pool pointer, we point
4829 4844           * ourselves at the default pool and remove all resource
4830 4845           * visibility.  This is especially important as the zone_t may
4831 4846           * languish on the deathrow for a very long time waiting for
4832 4847           * cred's to drain out.
4833 4848           *
4834 4849           * This rebinding of the zone can happen multiple times
4835 4850           * (presumably due to interrupted or parallel systemcalls)
4836 4851           * without any adverse effects.
4837 4852           */
4838 4853          if (pool_lock_intr() != 0) {
4839 4854                  zone_rele(zone);
4840 4855                  return (set_errno(EINTR));
4841 4856          }
4842 4857          if (pool_state == POOL_ENABLED) {
4843 4858                  mutex_enter(&cpu_lock);
4844 4859                  zone_pool_set(zone, pool_default);
4845 4860                  /*
4846 4861                   * The zone no longer needs to be able to see any cpus.
4847 4862                   */
4848 4863                  zone_pset_set(zone, ZONE_PS_INVAL);
4849 4864                  mutex_exit(&cpu_lock);
4850 4865          }
4851 4866          pool_unlock();
4852 4867  
4853 4868          /*
4854 4869           * ZSD shutdown callbacks can be executed multiple times, hence
4855 4870           * it is safe to not be holding any locks across this call.
4856 4871           */
4857 4872          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4858 4873  
                   /*
                    * With no kernel threads left, move the zone to ZONE_IS_DOWN
                    * ourselves unless it has already reached that state or later.
                    */
4859 4874          mutex_enter(&zone_status_lock);
4860 4875          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4861 4876                  zone_status_set(zone, ZONE_IS_DOWN);
4862 4877          mutex_exit(&zone_status_lock);
4863 4878  
4864 4879          /*
4865 4880           * Wait for kernel threads to drain.
4866 4881           */
4867 4882          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4868 4883                  zone_rele(zone);
4869 4884                  return (set_errno(EINTR));
4870 4885          }
4871 4886  
4872 4887          /*
4873 4888           * Zone can become down/destroyable even if the above wait
4874 4889           * returns EINTR, so any code added here may never execute.
4875 4890           * (i.e. don't add code here)
4876 4891           */
4877 4892  
4878 4893          zone_rele(zone);
4879 4894          return (0);
4880 4895  }
4881 4896  
4882 4897  /*
4883 4898   * Log the specified zone's reference counts.  The caller should not be
4884 4899   * holding the zone's zone_lock.
            *
            * The message is emitted with strlog() using SL_CONSOLE | SL_NOTE and
            * reports the zone's name, ID, zone_ref and zone_cred_ref, followed by a
            * bracketed list of any nonzero zone_subsys_ref[] entries.  As a side
            * effect, ZF_REFCOUNTS_LOGGED is set in the zone's zone_flags.
4885 4900   */
4886 4901  static void
4887 4902  zone_log_refcounts(zone_t *zone)
4888 4903  {
4889 4904          char *buffer;
4890 4905          char *buffer_position;
4891 4906          uint32_t buffer_size;
4892 4907          uint32_t index;
4893 4908          uint_t ref;
4894 4909          uint_t cred_ref;
4895 4910  
4896 4911          /*
4897 4912           * Construct a string representing the subsystem-specific reference
4898 4913           * counts.  The counts are printed in ascending order by index into the
4899 4914           * zone_t::zone_subsys_ref array.  The list will be surrounded by
4900 4915           * square brackets [] and will only contain nonzero reference counts.
4901 4916           *
4902 4917           * The buffer will hold two square bracket characters plus ten digits,
4903 4918           * one colon, one space, one comma, and some characters for a
4904 4919           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
4905 4920           * bit integers have at most ten decimal digits.)  The last
4906 4921           * reference count's comma is replaced by the closing square
4907 4922           * bracket and a NUL character to terminate the string.
4908 4923           *
4909 4924           * NOTE: We have to grab the zone's zone_lock to create a consistent
4910 4925           * snapshot of the zone's reference counters.
4911 4926           *
4912 4927           * First, figure out how much space the string buffer will need.
4913 4928           * The buffer's size is stored in buffer_size.
4914 4929           */
4915 4930          buffer_size = 2;                        /* for the square brackets */
4916 4931          mutex_enter(&zone->zone_lock);
4917 4932          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
4918 4933          ref = zone->zone_ref;
4919 4934          cred_ref = zone->zone_cred_ref;
                   /* 13 = ten digits + one colon + one space + one comma */
4920 4935          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
4921 4936                  if (zone->zone_subsys_ref[index] != 0)
4922 4937                          buffer_size += strlen(zone_ref_subsys_names[index]) +
4923 4938                              13;
4924 4939          if (buffer_size == 2) {
4925 4940                  /*
4926 4941                   * No subsystems had nonzero reference counts.  Don't bother
4927 4942                   * with allocating a buffer; just log the general-purpose and
4928 4943                   * credential reference counts.
4929 4944                   */
4930 4945                  mutex_exit(&zone->zone_lock);
4931 4946                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4932 4947                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
4933 4948                      "references and %u credential references are still extant",
4934 4949                      zone->zone_name, zone->zone_id, ref, cred_ref);
4935 4950                  return;
4936 4951          }
4937 4952  
4938 4953          /*
4939 4954           * buffer_size is large enough for the worst case, in which every
4940 4955           * nonzero count needs all ten digits.  Allocate the buffer and fill
4941 4956           * it with nonzero subsystem-specific reference counts.  Surround the
4942 4957           * results with square brackets afterwards.
                    *
                    * zone_lock is still held from the sizing pass above, so the
                    * counts formatted here are the ones the buffer was sized for.
4943 4958           */
4944 4959          buffer = kmem_alloc(buffer_size, KM_SLEEP);
4945 4960          buffer_position = &buffer[1];
4946 4961          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
4947 4962                  /*
4948 4963                   * NOTE: The DDI's version of sprintf() returns a pointer to
4949 4964                   * the modified buffer rather than the number of bytes written
4950 4965                   * (as in snprintf(3C)).  This is unfortunate and annoying.
4951 4966                   * Therefore, we'll use snprintf() with INT_MAX to get the
4952 4967                   * number of bytes written.  Using INT_MAX is safe because
4953 4968                   * the buffer is sized for the largest possible output: we'll
4954 4969                   * never overrun the buffer.
4955 4970                   */
4956 4971                  if (zone->zone_subsys_ref[index] != 0)
4957 4972                          buffer_position += snprintf(buffer_position, INT_MAX,
4958 4973                              "%s: %u,", zone_ref_subsys_names[index],
4959 4974                              zone->zone_subsys_ref[index]);
4960 4975          }
4961 4976          mutex_exit(&zone->zone_lock);
                   /*
                    * buffer_position now points at the terminating NUL that follows
                    * the last entry's comma; that comma becomes the closing bracket.
                    */
4962 4977          buffer[0] = '[';
4963 4978          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
4964 4979          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
4965 4980          buffer_position[-1] = ']';
4966 4981  
4967 4982          /*
4968 4983           * Log the reference counts and free the message buffer.
4969 4984           */
4970 4985          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
4971 4986              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
4972 4987              "%u credential references are still extant %s", zone->zone_name,
4973 4988              zone->zone_id, ref, cred_ref, buffer);
4974 4989          kmem_free(buffer, buffer_size);
4975 4990  }
4976 4991  
4977 4992  /*
4978 4993   * Systemcall entry point to finalize the zone halt process.  The caller
4979 4994   * must have already successfully called zone_shutdown().
4980 4995   *
4981 4996   * Upon successful completion, the zone will have been fully destroyed:
4982 4997   * zsched will have exited, destructor callbacks executed, and the zone
4983 4998   * removed from the list of active zones.
4984 4999   */
4985 5000  static int
4986 5001  zone_destroy(zoneid_t zoneid)
4987 5002  {
4988 5003          uint64_t uniqid;
4989 5004          zone_t *zone;
4990 5005          zone_status_t status;
4991 5006          clock_t wait_time;
4992 5007          boolean_t log_refcounts;
4993 5008  
4994 5009          if (secpolicy_zone_config(CRED()) != 0)
4995 5010                  return (set_errno(EPERM));
4996 5011          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4997 5012                  return (set_errno(EINVAL));
4998 5013  
4999 5014          mutex_enter(&zonehash_lock);
5000 5015          /*
5001 5016           * Look for zone under hash lock to prevent races with other
5002 5017           * calls to zone_destroy.
5003 5018           */
5004 5019          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5005 5020                  mutex_exit(&zonehash_lock);
5006 5021                  return (set_errno(EINVAL));
5007 5022          }
5008 5023  
5009 5024          if (zone_mount_count(zone->zone_rootpath) != 0) {
5010 5025                  mutex_exit(&zonehash_lock);
5011 5026                  return (set_errno(EBUSY));
5012 5027          }
5013 5028          mutex_enter(&zone_status_lock);
5014 5029          status = zone_status_get(zone);
5015 5030          if (status < ZONE_IS_DOWN) {
5016 5031                  mutex_exit(&zone_status_lock);
5017 5032                  mutex_exit(&zonehash_lock);
5018 5033                  return (set_errno(EBUSY));
5019 5034          } else if (status == ZONE_IS_DOWN) {
5020 5035                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5021 5036          }
5022 5037          mutex_exit(&zone_status_lock);
5023 5038          zone_hold(zone);
5024 5039          mutex_exit(&zonehash_lock);
5025 5040  
5026 5041          /*
5027 5042           * wait for zsched to exit
5028 5043           */
5029 5044          zone_status_wait(zone, ZONE_IS_DEAD);
5030 5045          zone_zsd_callbacks(zone, ZSD_DESTROY);
5031 5046          zone->zone_netstack = NULL;
5032 5047          uniqid = zone->zone_uniqid;
5033 5048          zone_rele(zone);
5034 5049          zone = NULL;    /* potentially free'd */
5035 5050  
5036 5051          log_refcounts = B_FALSE;
5037 5052          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5038 5053          mutex_enter(&zonehash_lock);
5039 5054          for (; /* ever */; ) {
5040 5055                  boolean_t unref;
5041 5056                  boolean_t refs_have_been_logged;
5042 5057  
5043 5058                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5044 5059                      zone->zone_uniqid != uniqid) {
5045 5060                          /*
5046 5061                           * The zone has gone away.  Necessary conditions
5047 5062                           * are met, so we return success.
5048 5063                           */
5049 5064                          mutex_exit(&zonehash_lock);
5050 5065                          return (0);
5051 5066                  }
5052 5067                  mutex_enter(&zone->zone_lock);
5053 5068                  unref = ZONE_IS_UNREF(zone);
5054 5069                  refs_have_been_logged = (zone->zone_flags &
5055 5070                      ZF_REFCOUNTS_LOGGED);
5056 5071                  mutex_exit(&zone->zone_lock);
5057 5072                  if (unref) {
5058 5073                          /*
5059 5074                           * There is only one reference to the zone -- that
5060 5075                           * added when the zone was added to the hashtables --
5061 5076                           * and things will remain this way until we drop
5062 5077                           * zonehash_lock... we can go ahead and cleanup the
5063 5078                           * zone.
5064 5079                           */
5065 5080                          break;
5066 5081                  }
5067 5082  
5068 5083                  /*
5069 5084                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5070 5085                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5071 5086                   * some zone's general-purpose reference count reaches one.
5072 5087                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5073 5088                   * on zone_destroy_cv, then log the zone's reference counts and
5074 5089                   * continue to wait for zone_rele() and zone_cred_rele().
5075 5090                   */
5076 5091                  if (!refs_have_been_logged) {
5077 5092                          if (!log_refcounts) {
5078 5093                                  /*
5079 5094                                   * This thread hasn't timed out waiting on
5080 5095                                   * zone_destroy_cv yet.  Wait wait_time clock
5081 5096                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5082 5097                                   * seconds) for the zone's references to clear.
5083 5098                                   */
5084 5099                                  ASSERT(wait_time > 0);
5085 5100                                  wait_time = cv_reltimedwait_sig(
5086 5101                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5087 5102                                      TR_SEC);
5088 5103                                  if (wait_time > 0) {
5089 5104                                          /*
5090 5105                                           * A thread in zone_rele() or
5091 5106                                           * zone_cred_rele() signaled
5092 5107                                           * zone_destroy_cv before this thread's
5093 5108                                           * wait timed out.  The zone might have
5094 5109                                           * only one reference left; find out!
5095 5110                                           */
5096 5111                                          continue;
5097 5112                                  } else if (wait_time == 0) {
5098 5113                                          /* The thread's process was signaled. */
5099 5114                                          mutex_exit(&zonehash_lock);
5100 5115                                          return (set_errno(EINTR));
5101 5116                                  }
5102 5117  
5103 5118                                  /*
5104 5119                                   * The thread timed out while waiting on
5105 5120                                   * zone_destroy_cv.  Even though the thread
5106 5121                                   * timed out, it has to check whether another
5107 5122                                   * thread woke up from zone_destroy_cv and
5108 5123                                   * destroyed the zone.
5109 5124                                   *
5110 5125                                   * If the zone still exists and has more than
5111 5126                                   * one unreleased general-purpose reference,
5112 5127                                   * then log the zone's reference counts.
5113 5128                                   */
5114 5129                                  log_refcounts = B_TRUE;
5115 5130                                  continue;
5116 5131                          }
5117 5132  
5118 5133                          /*
5119 5134                           * The thread already timed out on zone_destroy_cv while
5120 5135                           * waiting for subsystems to release the zone's last
5121 5136                           * general-purpose references.  Log the zone's reference
5122 5137                           * counts and wait indefinitely on zone_destroy_cv.
5123 5138                           */
5124 5139                          zone_log_refcounts(zone);
5125 5140                  }
5126 5141                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5127 5142                          /* The thread's process was signaled. */
5128 5143                          mutex_exit(&zonehash_lock);
5129 5144                          return (set_errno(EINTR));
5130 5145                  }
5131 5146          }
5132 5147  
5133 5148          /*
5134 5149           * Remove CPU cap for this zone now since we're not going to
5135 5150           * fail below this point.
5136 5151           */
5137 5152          cpucaps_zone_remove(zone);
5138 5153  
5139 5154          /* Get rid of the zone's kstats */
5140 5155          zone_kstat_delete(zone);
5141 5156  
5142 5157          /* remove the pfexecd doors */
5143 5158          if (zone->zone_pfexecd != NULL) {
5144 5159                  klpd_freelist(&zone->zone_pfexecd);
5145 5160                  zone->zone_pfexecd = NULL;
5146 5161          }
5147 5162  
5148 5163          /* free brand specific data */
5149 5164          if (ZONE_IS_BRANDED(zone))
5150 5165                  ZBROP(zone)->b_free_brand_data(zone);
5151 5166  
5152 5167          /* Say goodbye to brand framework. */
5153 5168          brand_unregister_zone(zone->zone_brand);
5154 5169  
5155 5170          /*
5156 5171           * It is now safe to let the zone be recreated; remove it from the
5157 5172           * lists.  The memory will not be freed until the last cred
5158 5173           * reference goes away.
5159 5174           */
5160 5175          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5161 5176          zonecount--;
5162 5177          /* remove from active list and hash tables */
5163 5178          list_remove(&zone_active, zone);
5164 5179          (void) mod_hash_destroy(zonehashbyname,
5165 5180              (mod_hash_key_t)zone->zone_name);
5166 5181          (void) mod_hash_destroy(zonehashbyid,
5167 5182              (mod_hash_key_t)(uintptr_t)zone->zone_id);
5168 5183          if (zone->zone_flags & ZF_HASHED_LABEL)
5169 5184                  (void) mod_hash_destroy(zonehashbylabel,
5170 5185                      (mod_hash_key_t)zone->zone_slabel);
5171 5186          mutex_exit(&zonehash_lock);
5172 5187  
5173 5188          /*
5174 5189           * Release the root vnode; we're not using it anymore.  Nor should any
5175 5190           * other thread that might access it exist.
5176 5191           */
5177 5192          if (zone->zone_rootvp != NULL) {
5178 5193                  VN_RELE(zone->zone_rootvp);
5179 5194                  zone->zone_rootvp = NULL;
5180 5195          }
5181 5196  
5182 5197          /* add to deathrow list */
5183 5198          mutex_enter(&zone_deathrow_lock);
5184 5199          list_insert_tail(&zone_deathrow, zone);
5185 5200          mutex_exit(&zone_deathrow_lock);
5186 5201  
5187 5202          /*
5188 5203           * Drop last reference (which was added by zsched()), this will
5189 5204           * free the zone unless there are outstanding cred references.
5190 5205           */
5191 5206          zone_rele(zone);
5192 5207          return (0);
5193 5208  }
5194 5209  
5195 5210  /*
5196 5211   * Systemcall entry point for zone_getattr(2).
5197 5212   */
5198 5213  static ssize_t
5199 5214  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5200 5215  {
5201 5216          size_t size;
5202 5217          int error = 0, err;
5203 5218          zone_t *zone;
5204 5219          char *zonepath;
5205 5220          char *outstr;
5206 5221          zone_status_t zone_status;
5207 5222          pid_t initpid;
5208 5223          boolean_t global = (curzone == global_zone);
5209 5224          boolean_t inzone = (curzone->zone_id == zoneid);
5210 5225          ushort_t flags;
5211 5226          zone_net_data_t *zbuf;
5212 5227  
5213 5228          mutex_enter(&zonehash_lock);
5214 5229          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5215 5230                  mutex_exit(&zonehash_lock);
5216 5231                  return (set_errno(EINVAL));
5217 5232          }
5218 5233          zone_status = zone_status_get(zone);
5219 5234          if (zone_status < ZONE_IS_INITIALIZED) {
5220 5235                  mutex_exit(&zonehash_lock);
5221 5236                  return (set_errno(EINVAL));
5222 5237          }
5223 5238          zone_hold(zone);
5224 5239          mutex_exit(&zonehash_lock);
5225 5240  
5226 5241          /*
5227 5242           * If not in the global zone, don't show information about other zones,
5228 5243           * unless the system is labeled and the local zone's label dominates
5229 5244           * the other zone.
5230 5245           */
5231 5246          if (!zone_list_access(zone)) {
5232 5247                  zone_rele(zone);
5233 5248                  return (set_errno(EINVAL));
5234 5249          }
5235 5250  
5236 5251          switch (attr) {
5237 5252          case ZONE_ATTR_ROOT:
5238 5253                  if (global) {
5239 5254                          /*
5240 5255                           * Copy the path to trim the trailing "/" (except for
5241 5256                           * the global zone).
5242 5257                           */
5243 5258                          if (zone != global_zone)
5244 5259                                  size = zone->zone_rootpathlen - 1;
5245 5260                          else
5246 5261                                  size = zone->zone_rootpathlen;
5247 5262                          zonepath = kmem_alloc(size, KM_SLEEP);
5248 5263                          bcopy(zone->zone_rootpath, zonepath, size);
5249 5264                          zonepath[size - 1] = '\0';
5250 5265                  } else {
5251 5266                          if (inzone || !is_system_labeled()) {
5252 5267                                  /*
5253 5268                                   * Caller is not in the global zone.
5254 5269                                   * if the query is on the current zone
5255 5270                                   * or the system is not labeled,
5256 5271                                   * just return faked-up path for current zone.
5257 5272                                   */
5258 5273                                  zonepath = "/";
5259 5274                                  size = 2;
5260 5275                          } else {
5261 5276                                  /*
5262 5277                                   * Return related path for current zone.
5263 5278                                   */
5264 5279                                  int prefix_len = strlen(zone_prefix);
5265 5280                                  int zname_len = strlen(zone->zone_name);
5266 5281  
5267 5282                                  size = prefix_len + zname_len + 1;
5268 5283                                  zonepath = kmem_alloc(size, KM_SLEEP);
5269 5284                                  bcopy(zone_prefix, zonepath, prefix_len);
5270 5285                                  bcopy(zone->zone_name, zonepath +
5271 5286                                      prefix_len, zname_len);
5272 5287                                  zonepath[size - 1] = '\0';
5273 5288                          }
5274 5289                  }
5275 5290                  if (bufsize > size)
5276 5291                          bufsize = size;
5277 5292                  if (buf != NULL) {
5278 5293                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5279 5294                          if (err != 0 && err != ENAMETOOLONG)
5280 5295                                  error = EFAULT;
5281 5296                  }
5282 5297                  if (global || (is_system_labeled() && !inzone))
5283 5298                          kmem_free(zonepath, size);
5284 5299                  break;
5285 5300  
5286 5301          case ZONE_ATTR_NAME:
5287 5302                  size = strlen(zone->zone_name) + 1;
5288 5303                  if (bufsize > size)
5289 5304                          bufsize = size;
5290 5305                  if (buf != NULL) {
5291 5306                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5292 5307                          if (err != 0 && err != ENAMETOOLONG)
5293 5308                                  error = EFAULT;
5294 5309                  }
5295 5310                  break;
5296 5311  
5297 5312          case ZONE_ATTR_STATUS:
5298 5313                  /*
5299 5314                   * Since we're not holding zonehash_lock, the zone status
5300 5315                   * may be anything; leave it up to userland to sort it out.
5301 5316                   */
5302 5317                  size = sizeof (zone_status);
5303 5318                  if (bufsize > size)
5304 5319                          bufsize = size;
5305 5320                  zone_status = zone_status_get(zone);
5306 5321                  if (buf != NULL &&
5307 5322                      copyout(&zone_status, buf, bufsize) != 0)
5308 5323                          error = EFAULT;
5309 5324                  break;
5310 5325          case ZONE_ATTR_FLAGS:
5311 5326                  size = sizeof (zone->zone_flags);
5312 5327                  if (bufsize > size)
5313 5328                          bufsize = size;
5314 5329                  flags = zone->zone_flags;
5315 5330                  if (buf != NULL &&
5316 5331                      copyout(&flags, buf, bufsize) != 0)
5317 5332                          error = EFAULT;
5318 5333                  break;
5319 5334          case ZONE_ATTR_PRIVSET:
5320 5335                  size = sizeof (priv_set_t);
5321 5336                  if (bufsize > size)
5322 5337                          bufsize = size;
5323 5338                  if (buf != NULL &&
5324 5339                      copyout(zone->zone_privset, buf, bufsize) != 0)
5325 5340                          error = EFAULT;
5326 5341                  break;
5327 5342          case ZONE_ATTR_UNIQID:
5328 5343                  size = sizeof (zone->zone_uniqid);
5329 5344                  if (bufsize > size)
5330 5345                          bufsize = size;
5331 5346                  if (buf != NULL &&
5332 5347                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5333 5348                          error = EFAULT;
5334 5349                  break;
5335 5350          case ZONE_ATTR_POOLID:
5336 5351                  {
5337 5352                          pool_t *pool;
5338 5353                          poolid_t poolid;
5339 5354  
5340 5355                          if (pool_lock_intr() != 0) {
5341 5356                                  error = EINTR;
5342 5357                                  break;
5343 5358                          }
5344 5359                          pool = zone_pool_get(zone);
5345 5360                          poolid = pool->pool_id;
5346 5361                          pool_unlock();
5347 5362                          size = sizeof (poolid);
5348 5363                          if (bufsize > size)
5349 5364                                  bufsize = size;
5350 5365                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5351 5366                                  error = EFAULT;
5352 5367                  }
5353 5368                  break;
5354 5369          case ZONE_ATTR_SLBL:
5355 5370                  size = sizeof (bslabel_t);
5356 5371                  if (bufsize > size)
5357 5372                          bufsize = size;
5358 5373                  if (zone->zone_slabel == NULL)
5359 5374                          error = EINVAL;
5360 5375                  else if (buf != NULL &&
5361 5376                      copyout(label2bslabel(zone->zone_slabel), buf,
5362 5377                      bufsize) != 0)
5363 5378                          error = EFAULT;
5364 5379                  break;
5365 5380          case ZONE_ATTR_INITPID:
5366 5381                  size = sizeof (initpid);
5367 5382                  if (bufsize > size)
5368 5383                          bufsize = size;
5369 5384                  initpid = zone->zone_proc_initpid;
5370 5385                  if (initpid == -1) {
5371 5386                          error = ESRCH;
5372 5387                          break;
5373 5388                  }
5374 5389                  if (buf != NULL &&
5375 5390                      copyout(&initpid, buf, bufsize) != 0)
5376 5391                          error = EFAULT;
5377 5392                  break;
5378 5393          case ZONE_ATTR_BRAND:
5379 5394                  size = strlen(zone->zone_brand->b_name) + 1;
5380 5395  
5381 5396                  if (bufsize > size)
5382 5397                          bufsize = size;
5383 5398                  if (buf != NULL) {
5384 5399                          err = copyoutstr(zone->zone_brand->b_name, buf,
5385 5400                              bufsize, NULL);
5386 5401                          if (err != 0 && err != ENAMETOOLONG)
5387 5402                                  error = EFAULT;
5388 5403                  }
5389 5404                  break;
5390 5405          case ZONE_ATTR_INITNAME:
5391 5406                  size = strlen(zone->zone_initname) + 1;
5392 5407                  if (bufsize > size)
5393 5408                          bufsize = size;
5394 5409                  if (buf != NULL) {
5395 5410                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5396 5411                              NULL);
5397 5412                          if (err != 0 && err != ENAMETOOLONG)
5398 5413                                  error = EFAULT;
5399 5414                  }
5400 5415                  break;
5401 5416          case ZONE_ATTR_BOOTARGS:
5402 5417                  if (zone->zone_bootargs == NULL)
5403 5418                          outstr = "";
5404 5419                  else
5405 5420                          outstr = zone->zone_bootargs;
5406 5421                  size = strlen(outstr) + 1;
5407 5422                  if (bufsize > size)
5408 5423                          bufsize = size;
5409 5424                  if (buf != NULL) {
5410 5425                          err = copyoutstr(outstr, buf, bufsize, NULL);
5411 5426                          if (err != 0 && err != ENAMETOOLONG)
5412 5427                                  error = EFAULT;
5413 5428                  }
5414 5429                  break;
5415 5430          case ZONE_ATTR_PHYS_MCAP:
5416 5431                  size = sizeof (zone->zone_phys_mcap);
5417 5432                  if (bufsize > size)
5418 5433                          bufsize = size;
5419 5434                  if (buf != NULL &&
5420 5435                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5421 5436                          error = EFAULT;
5422 5437                  break;
5423 5438          case ZONE_ATTR_SCHED_CLASS:
5424 5439                  mutex_enter(&class_lock);
5425 5440  
5426 5441                  if (zone->zone_defaultcid >= loaded_classes)
5427 5442                          outstr = "";
5428 5443                  else
5429 5444                          outstr = sclass[zone->zone_defaultcid].cl_name;
5430 5445                  size = strlen(outstr) + 1;
5431 5446                  if (bufsize > size)
5432 5447                          bufsize = size;
5433 5448                  if (buf != NULL) {
5434 5449                          err = copyoutstr(outstr, buf, bufsize, NULL);
5435 5450                          if (err != 0 && err != ENAMETOOLONG)
5436 5451                                  error = EFAULT;
5437 5452                  }
5438 5453  
5439 5454                  mutex_exit(&class_lock);
5440 5455                  break;
5441 5456          case ZONE_ATTR_HOSTID:
5442 5457                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5443 5458                      bufsize == sizeof (zone->zone_hostid)) {
5444 5459                          size = sizeof (zone->zone_hostid);
5445 5460                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5446 5461                              bufsize) != 0)
5447 5462                                  error = EFAULT;
5448 5463                  } else {
5449 5464                          error = EINVAL;
5450 5465                  }
5451 5466                  break;
5452 5467          case ZONE_ATTR_FS_ALLOWED:
5453 5468                  if (zone->zone_fs_allowed == NULL)
5454 5469                          outstr = "";
5455 5470                  else
5456 5471                          outstr = zone->zone_fs_allowed;
5457 5472                  size = strlen(outstr) + 1;
5458 5473                  if (bufsize > size)
5459 5474                          bufsize = size;
5460 5475                  if (buf != NULL) {
5461 5476                          err = copyoutstr(outstr, buf, bufsize, NULL);
5462 5477                          if (err != 0 && err != ENAMETOOLONG)
5463 5478                                  error = EFAULT;
5464 5479                  }
5465 5480                  break;
5466 5481          case ZONE_ATTR_NETWORK:
5467 5482                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5468 5483                  if (copyin(buf, zbuf, bufsize) != 0) {
5469 5484                          error = EFAULT;
5470 5485                  } else {
5471 5486                          error = zone_get_network(zoneid, zbuf);
5472 5487                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5473 5488                                  error = EFAULT;
5474 5489                  }
5475 5490                  kmem_free(zbuf, bufsize);
5476 5491                  break;
5477 5492          default:
5478 5493                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5479 5494                          size = bufsize;
5480 5495                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5481 5496                  } else {
5482 5497                          error = EINVAL;
5483 5498                  }
5484 5499          }
5485 5500          zone_rele(zone);
5486 5501  
5487 5502          if (error)
5488 5503                  return (set_errno(error));
5489 5504          return ((ssize_t)size);
5490 5505  }
5491 5506  
5492 5507  /*
5493 5508   * Systemcall entry point for zone_setattr(2).
5494 5509   */
5495 5510  /*ARGSUSED*/
5496 5511  static int
5497 5512  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5498 5513  {
5499 5514          zone_t *zone;
5500 5515          zone_status_t zone_status;
5501 5516          int err = -1;
5502 5517          zone_net_data_t *zbuf;
5503 5518  
5504 5519          if (secpolicy_zone_config(CRED()) != 0)
5505 5520                  return (set_errno(EPERM));
5506 5521  
5507 5522          /*
5508 5523           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5509 5524           * global zone.
5510 5525           */
5511 5526          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5512 5527                  return (set_errno(EINVAL));
5513 5528          }
5514 5529  
5515 5530          mutex_enter(&zonehash_lock);
5516 5531          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5517 5532                  mutex_exit(&zonehash_lock);
5518 5533                  return (set_errno(EINVAL));
5519 5534          }
5520 5535          zone_hold(zone);
5521 5536          mutex_exit(&zonehash_lock);
5522 5537  
5523 5538          /*
5524 5539           * At present most attributes can only be set on non-running,
5525 5540           * non-global zones.
5526 5541           */
5527 5542          zone_status = zone_status_get(zone);
5528 5543          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5529 5544                  err = EINVAL;
5530 5545                  goto done;
5531 5546          }
5532 5547  
5533 5548          switch (attr) {
5534 5549          case ZONE_ATTR_INITNAME:
5535 5550                  err = zone_set_initname(zone, (const char *)buf);
5536 5551                  break;
5537 5552          case ZONE_ATTR_INITNORESTART:
5538 5553                  zone->zone_restart_init = B_FALSE;
5539 5554                  err = 0;
5540 5555                  break;
5541 5556          case ZONE_ATTR_BOOTARGS:
5542 5557                  err = zone_set_bootargs(zone, (const char *)buf);
5543 5558                  break;
5544 5559          case ZONE_ATTR_BRAND:
5545 5560                  err = zone_set_brand(zone, (const char *)buf);
5546 5561                  break;
5547 5562          case ZONE_ATTR_FS_ALLOWED:
5548 5563                  err = zone_set_fs_allowed(zone, (const char *)buf);
5549 5564                  break;
5550 5565          case ZONE_ATTR_PHYS_MCAP:
5551 5566                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5552 5567                  break;
5553 5568          case ZONE_ATTR_SCHED_CLASS:
5554 5569                  err = zone_set_sched_class(zone, (const char *)buf);
5555 5570                  break;
5556 5571          case ZONE_ATTR_HOSTID:
5557 5572                  if (bufsize == sizeof (zone->zone_hostid)) {
5558 5573                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5559 5574                                  err = 0;
5560 5575                          else
5561 5576                                  err = EFAULT;
5562 5577                  } else {
5563 5578                          err = EINVAL;
5564 5579                  }
5565 5580                  break;
5566 5581          case ZONE_ATTR_NETWORK:
5567 5582                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5568 5583                          err = EINVAL;
5569 5584                          break;
5570 5585                  }
5571 5586                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5572 5587                  if (copyin(buf, zbuf, bufsize) != 0) {
5573 5588                          kmem_free(zbuf, bufsize);
5574 5589                          err = EFAULT;
5575 5590                          break;
5576 5591                  }
5577 5592                  err = zone_set_network(zoneid, zbuf);
5578 5593                  kmem_free(zbuf, bufsize);
5579 5594                  break;
5580 5595          default:
5581 5596                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5582 5597                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5583 5598                  else
5584 5599                          err = EINVAL;
5585 5600          }
5586 5601  
5587 5602  done:
5588 5603          zone_rele(zone);
5589 5604          ASSERT(err != -1);
5590 5605          return (err != 0 ? set_errno(err) : 0);
5591 5606  }
5592 5607  
5593 5608  /*
5594 5609   * Return zero if the process has at least one vnode mapped in to its
5595 5610   * address space which shouldn't be allowed to change zones.
5596 5611   *
5597 5612   * Also return zero if the process has any shared mappings which reserve
5598 5613   * swap.  This is because the counting for zone.max-swap does not allow swap
5599 5614   * reservation to be shared between zones.  zone swap reservation is counted
5600 5615   * on zone->zone_max_swap.
5601 5616   */
5602 5617  static int
5603 5618  as_can_change_zones(void)
5604 5619  {
5605 5620          proc_t *pp = curproc;
5606 5621          struct seg *seg;
5607 5622          struct as *as = pp->p_as;
5608 5623          vnode_t *vp;
5609 5624          int allow = 1;
5610 5625  
5611 5626          ASSERT(pp->p_as != &kas);
5612 5627          AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5613 5628          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5614 5629  
5615 5630                  /*
5616 5631                   * Cannot enter zone with shared anon memory which
5617 5632                   * reserves swap.  See comment above.
5618 5633                   */
5619 5634                  if (seg_can_change_zones(seg) == B_FALSE) {
5620 5635                          allow = 0;
5621 5636                          break;
5622 5637                  }
5623 5638                  /*
5624 5639                   * if we can't get a backing vnode for this segment then skip
5625 5640                   * it.
5626 5641                   */
5627 5642                  vp = NULL;
5628 5643                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5629 5644                          continue;
5630 5645                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5631 5646                          allow = 0;
5632 5647                          break;
5633 5648                  }
5634 5649          }
5635 5650          AS_LOCK_EXIT(as, &as->a_lock);
5636 5651          return (allow);
5637 5652  }
5638 5653  
5639 5654  /*
5640 5655   * Count swap reserved by curproc's address space
5641 5656   */
5642 5657  static size_t
5643 5658  as_swresv(void)
5644 5659  {
5645 5660          proc_t *pp = curproc;
5646 5661          struct seg *seg;
5647 5662          struct as *as = pp->p_as;
5648 5663          size_t swap = 0;
5649 5664  
5650 5665          ASSERT(pp->p_as != &kas);
5651 5666          ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5652 5667          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5653 5668                  swap += seg_swresv(seg);
5654 5669  
5655 5670          return (swap);
5656 5671  }
5657 5672  
5658 5673  /*
5659 5674   * Systemcall entry point for zone_enter().
5660 5675   *
5661 5676   * The current process is injected into said zone.  In the process
5662 5677   * it will change its project membership, privileges, rootdir/cwd,
5663 5678   * zone-wide rctls, and pool association to match those of the zone.
5664 5679   *
5665 5680   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5666 5681   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5667 5682   * enter a zone that is "ready" or "running".
5668 5683   */
5669 5684  static int
5670 5685  zone_enter(zoneid_t zoneid)
5671 5686  {
5672 5687          zone_t *zone;
5673 5688          vnode_t *vp;
5674 5689          proc_t *pp = curproc;
5675 5690          contract_t *ct;
5676 5691          cont_process_t *ctp;
5677 5692          task_t *tk, *oldtk;
5678 5693          kproject_t *zone_proj0;
5679 5694          cred_t *cr, *newcr;
5680 5695          pool_t *oldpool, *newpool;
5681 5696          sess_t *sp;
5682 5697          uid_t uid;
5683 5698          zone_status_t status;
5684 5699          int err = 0;
5685 5700          rctl_entity_p_t e;
5686 5701          size_t swap;
5687 5702          kthread_id_t t;
5688 5703  
5689 5704          if (secpolicy_zone_config(CRED()) != 0)
5690 5705                  return (set_errno(EPERM));
5691 5706          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5692 5707                  return (set_errno(EINVAL));
5693 5708  
5694 5709          /*
5695 5710           * Stop all lwps so we don't need to hold a lock to look at
5696 5711           * curproc->p_zone.  This needs to happen before we grab any
5697 5712           * locks to avoid deadlock (another lwp in the process could
5698 5713           * be waiting for the held lock).
5699 5714           */
5700 5715          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5701 5716                  return (set_errno(EINTR));
5702 5717  
5703 5718          /*
5704 5719           * Make sure we're not changing zones with files open or mapped in
5705 5720           * to our address space which shouldn't be changing zones.
5706 5721           */
5707 5722          if (!files_can_change_zones()) {
5708 5723                  err = EBADF;
5709 5724                  goto out;
5710 5725          }
5711 5726          if (!as_can_change_zones()) {
5712 5727                  err = EFAULT;
5713 5728                  goto out;
5714 5729          }
5715 5730  
5716 5731          mutex_enter(&zonehash_lock);
5717 5732          if (pp->p_zone != global_zone) {
5718 5733                  mutex_exit(&zonehash_lock);
5719 5734                  err = EINVAL;
5720 5735                  goto out;
5721 5736          }
5722 5737  
5723 5738          zone = zone_find_all_by_id(zoneid);
5724 5739          if (zone == NULL) {
5725 5740                  mutex_exit(&zonehash_lock);
5726 5741                  err = EINVAL;
5727 5742                  goto out;
5728 5743          }
5729 5744  
5730 5745          /*
5731 5746           * To prevent processes in a zone from holding contracts on
5732 5747           * extrazonal resources, and to avoid process contract
5733 5748           * memberships which span zones, contract holders and processes
5734 5749           * which aren't the sole members of their encapsulating process
5735 5750           * contracts are not allowed to zone_enter.
5736 5751           */
5737 5752          ctp = pp->p_ct_process;
5738 5753          ct = &ctp->conp_contract;
5739 5754          mutex_enter(&ct->ct_lock);
5740 5755          mutex_enter(&pp->p_lock);
5741 5756          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5742 5757                  mutex_exit(&pp->p_lock);
5743 5758                  mutex_exit(&ct->ct_lock);
5744 5759                  mutex_exit(&zonehash_lock);
5745 5760                  err = EINVAL;
5746 5761                  goto out;
5747 5762          }
5748 5763  
5749 5764          /*
5750 5765           * Moreover, we don't allow processes whose encapsulating
5751 5766           * process contracts have inherited extrazonal contracts.
5752 5767           * While it would be easier to eliminate all process contracts
5753 5768           * with inherited contracts, we need to be able to give a
5754 5769           * restarted init (or other zone-penetrating process) its
5755 5770           * predecessor's contracts.
5756 5771           */
5757 5772          if (ctp->conp_ninherited != 0) {
5758 5773                  contract_t *next;
5759 5774                  for (next = list_head(&ctp->conp_inherited); next;
5760 5775                      next = list_next(&ctp->conp_inherited, next)) {
5761 5776                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5762 5777                                  mutex_exit(&pp->p_lock);
5763 5778                                  mutex_exit(&ct->ct_lock);
5764 5779                                  mutex_exit(&zonehash_lock);
5765 5780                                  err = EINVAL;
5766 5781                                  goto out;
5767 5782                          }
5768 5783                  }
5769 5784          }
5770 5785  
5771 5786          mutex_exit(&pp->p_lock);
5772 5787          mutex_exit(&ct->ct_lock);
5773 5788  
5774 5789          status = zone_status_get(zone);
5775 5790          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5776 5791                  /*
5777 5792                   * Can't join
5778 5793                   */
5779 5794                  mutex_exit(&zonehash_lock);
5780 5795                  err = EINVAL;
5781 5796                  goto out;
5782 5797          }
5783 5798  
5784 5799          /*
5785 5800           * Make sure new priv set is within the permitted set for caller
5786 5801           */
5787 5802          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5788 5803                  mutex_exit(&zonehash_lock);
5789 5804                  err = EPERM;
5790 5805                  goto out;
5791 5806          }
5792 5807          /*
5793 5808           * We want to momentarily drop zonehash_lock while we optimistically
5794 5809           * bind curproc to the pool it should be running in.  This is safe
5795 5810           * since the zone can't disappear (we have a hold on it).
5796 5811           */
5797 5812          zone_hold(zone);
5798 5813          mutex_exit(&zonehash_lock);
5799 5814  
5800 5815          /*
5801 5816           * Grab pool_lock to keep the pools configuration from changing
5802 5817           * and to stop ourselves from getting rebound to another pool
5803 5818           * until we join the zone.
5804 5819           */
5805 5820          if (pool_lock_intr() != 0) {
5806 5821                  zone_rele(zone);
5807 5822                  err = EINTR;
5808 5823                  goto out;
5809 5824          }
5810 5825          ASSERT(secpolicy_pool(CRED()) == 0);
5811 5826          /*
5812 5827           * Bind ourselves to the pool currently associated with the zone.
5813 5828           */
5814 5829          oldpool = curproc->p_pool;
5815 5830          newpool = zone_pool_get(zone);
5816 5831          if (pool_state == POOL_ENABLED && newpool != oldpool &&
5817 5832              (err = pool_do_bind(newpool, P_PID, P_MYID,
5818 5833              POOL_BIND_ALL)) != 0) {
5819 5834                  pool_unlock();
5820 5835                  zone_rele(zone);
5821 5836                  goto out;
5822 5837          }
5823 5838  
5824 5839          /*
5825 5840           * Grab cpu_lock now; we'll need it later when we call
5826 5841           * task_join().
5827 5842           */
5828 5843          mutex_enter(&cpu_lock);
5829 5844          mutex_enter(&zonehash_lock);
5830 5845          /*
5831 5846           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5832 5847           */
5833 5848          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5834 5849                  /*
5835 5850                   * Can't join anymore.
5836 5851                   */
5837 5852                  mutex_exit(&zonehash_lock);
5838 5853                  mutex_exit(&cpu_lock);
5839 5854                  if (pool_state == POOL_ENABLED &&
5840 5855                      newpool != oldpool)
5841 5856                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
5842 5857                              POOL_BIND_ALL);
5843 5858                  pool_unlock();
5844 5859                  zone_rele(zone);
5845 5860                  err = EINVAL;
5846 5861                  goto out;
5847 5862          }
5848 5863  
5849 5864          /*
5850 5865           * a_lock must be held while transfering locked memory and swap
5851 5866           * reservation from the global zone to the non global zone because
5852 5867           * asynchronous faults on the processes' address space can lock
5853 5868           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5854 5869           * segments respectively.
5855 5870           */
5856 5871          AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
5857 5872          swap = as_swresv();
5858 5873          mutex_enter(&pp->p_lock);
5859 5874          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5860 5875          /* verify that we do not exceed and task or lwp limits */
5861 5876          mutex_enter(&zone->zone_nlwps_lock);
5862 5877          /* add new lwps to zone and zone's proj0 */
5863 5878          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5864 5879          zone->zone_nlwps += pp->p_lwpcnt;
5865 5880          /* add 1 task to zone's proj0 */
5866 5881          zone_proj0->kpj_ntasks += 1;
5867 5882  
5868 5883          zone_proj0->kpj_nprocs++;
5869 5884          zone->zone_nprocs++;
5870 5885          mutex_exit(&zone->zone_nlwps_lock);
5871 5886  
5872 5887          mutex_enter(&zone->zone_mem_lock);
5873 5888          zone->zone_locked_mem += pp->p_locked_mem;
5874 5889          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5875 5890          zone->zone_max_swap += swap;
5876 5891          mutex_exit(&zone->zone_mem_lock);
5877 5892  
5878 5893          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5879 5894          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5880 5895          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5881 5896  
5882 5897          /* remove lwps and process from proc's old zone and old project */
5883 5898          mutex_enter(&pp->p_zone->zone_nlwps_lock);
5884 5899          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5885 5900          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5886 5901          pp->p_task->tk_proj->kpj_nprocs--;
5887 5902          pp->p_zone->zone_nprocs--;
5888 5903          mutex_exit(&pp->p_zone->zone_nlwps_lock);
5889 5904  
5890 5905          mutex_enter(&pp->p_zone->zone_mem_lock);
5891 5906          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5892 5907          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5893 5908          pp->p_zone->zone_max_swap -= swap;
5894 5909          mutex_exit(&pp->p_zone->zone_mem_lock);
5895 5910  
5896 5911          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5897 5912          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5898 5913          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5899 5914  
5900 5915          pp->p_flag |= SZONETOP;
5901 5916          pp->p_zone = zone;
5902 5917          mutex_exit(&pp->p_lock);
5903 5918          AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5904 5919  
5905 5920          /*
5906 5921           * Joining the zone cannot fail from now on.
5907 5922           *
5908 5923           * This means that a lot of the following code can be commonized and
5909 5924           * shared with zsched().
5910 5925           */
5911 5926  
5912 5927          /*
5913 5928           * If the process contract fmri was inherited, we need to
5914 5929           * flag this so that any contract status will not leak
5915 5930           * extra zone information, svc_fmri in this case
5916 5931           */
5917 5932          if (ctp->conp_svc_ctid != ct->ct_id) {
5918 5933                  mutex_enter(&ct->ct_lock);
5919 5934                  ctp->conp_svc_zone_enter = ct->ct_id;
5920 5935                  mutex_exit(&ct->ct_lock);
5921 5936          }
5922 5937  
5923 5938          /*
5924 5939           * Reset the encapsulating process contract's zone.
5925 5940           */
5926 5941          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5927 5942          contract_setzuniqid(ct, zone->zone_uniqid);
5928 5943  
5929 5944          /*
5930 5945           * Create a new task and associate the process with the project keyed
5931 5946           * by (projid,zoneid).
5932 5947           *
5933 5948           * We might as well be in project 0; the global zone's projid doesn't
5934 5949           * make much sense in a zone anyhow.
5935 5950           *
5936 5951           * This also increments zone_ntasks, and returns with p_lock held.
5937 5952           */
5938 5953          tk = task_create(0, zone);
5939 5954          oldtk = task_join(tk, 0);
5940 5955          mutex_exit(&cpu_lock);
5941 5956  
5942 5957          /*
5943 5958           * call RCTLOP_SET functions on this proc
5944 5959           */
5945 5960          e.rcep_p.zone = zone;
5946 5961          e.rcep_t = RCENTITY_ZONE;
5947 5962          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5948 5963              RCD_CALLBACK);
5949 5964          mutex_exit(&pp->p_lock);
5950 5965  
5951 5966          /*
5952 5967           * We don't need to hold any of zsched's locks here; not only do we know
5953 5968           * the process and zone aren't going away, we know its session isn't
5954 5969           * changing either.
5955 5970           *
5956 5971           * By joining zsched's session here, we mimic the behavior in the
5957 5972           * global zone of init's sid being the pid of sched.  We extend this
5958 5973           * to all zlogin-like zone_enter()'ing processes as well.
5959 5974           */
5960 5975          mutex_enter(&pidlock);
5961 5976          sp = zone->zone_zsched->p_sessp;
5962 5977          sess_hold(zone->zone_zsched);
5963 5978          mutex_enter(&pp->p_lock);
5964 5979          pgexit(pp);
5965 5980          sess_rele(pp->p_sessp, B_TRUE);
5966 5981          pp->p_sessp = sp;
5967 5982          pgjoin(pp, zone->zone_zsched->p_pidp);
5968 5983  
5969 5984          /*
5970 5985           * If any threads are scheduled to be placed on zone wait queue they
5971 5986           * should abandon the idea since the wait queue is changing.
5972 5987           * We need to be holding pidlock & p_lock to do this.
5973 5988           */
5974 5989          if ((t = pp->p_tlist) != NULL) {
5975 5990                  do {
5976 5991                          thread_lock(t);
5977 5992                          /*
5978 5993                           * Kick this thread so that he doesn't sit
5979 5994                           * on a wrong wait queue.
5980 5995                           */
5981 5996                          if (ISWAITING(t))
5982 5997                                  setrun_locked(t);
5983 5998  
5984 5999                          if (t->t_schedflag & TS_ANYWAITQ)
5985 6000                                  t->t_schedflag &= ~ TS_ANYWAITQ;
5986 6001  
5987 6002                          thread_unlock(t);
5988 6003                  } while ((t = t->t_forw) != pp->p_tlist);
5989 6004          }
5990 6005  
5991 6006          /*
5992 6007           * If there is a default scheduling class for the zone and it is not
5993 6008           * the class we are currently in, change all of the threads in the
5994 6009           * process to the new class.  We need to be holding pidlock & p_lock
5995 6010           * when we call parmsset so this is a good place to do it.
5996 6011           */
5997 6012          if (zone->zone_defaultcid > 0 &&
5998 6013              zone->zone_defaultcid != curthread->t_cid) {
5999 6014                  pcparms_t pcparms;
6000 6015  
6001 6016                  pcparms.pc_cid = zone->zone_defaultcid;
6002 6017                  pcparms.pc_clparms[0] = 0;
6003 6018  
6004 6019                  /*
6005 6020                   * If setting the class fails, we still want to enter the zone.
6006 6021                   */
6007 6022                  if ((t = pp->p_tlist) != NULL) {
6008 6023                          do {
6009 6024                                  (void) parmsset(&pcparms, t);
6010 6025                          } while ((t = t->t_forw) != pp->p_tlist);
6011 6026                  }
6012 6027          }
6013 6028  
6014 6029          mutex_exit(&pp->p_lock);
6015 6030          mutex_exit(&pidlock);
6016 6031  
6017 6032          mutex_exit(&zonehash_lock);
6018 6033          /*
6019 6034           * We're firmly in the zone; let pools progress.
6020 6035           */
6021 6036          pool_unlock();
6022 6037          task_rele(oldtk);
6023 6038          /*
6024 6039           * We don't need to retain a hold on the zone since we already
6025 6040           * incremented zone_ntasks, so the zone isn't going anywhere.
6026 6041           */
6027 6042          zone_rele(zone);
6028 6043  
6029 6044          /*
6030 6045           * Chroot
6031 6046           */
6032 6047          vp = zone->zone_rootvp;
6033 6048          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6034 6049          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6035 6050  
6036 6051          /*
6037 6052           * Change process credentials
6038 6053           */
6039 6054          newcr = cralloc();
6040 6055          mutex_enter(&pp->p_crlock);
6041 6056          cr = pp->p_cred;
6042 6057          crcopy_to(cr, newcr);
6043 6058          crsetzone(newcr, zone);
6044 6059          pp->p_cred = newcr;
6045 6060  
6046 6061          /*
6047 6062           * Restrict all process privilege sets to zone limit
6048 6063           */
6049 6064          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6050 6065          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6051 6066          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6052 6067          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6053 6068          mutex_exit(&pp->p_crlock);
6054 6069          crset(pp, newcr);
6055 6070  
6056 6071          /*
6057 6072           * Adjust upcount to reflect zone entry.
6058 6073           */
6059 6074          uid = crgetruid(newcr);
6060 6075          mutex_enter(&pidlock);
6061 6076          upcount_dec(uid, GLOBAL_ZONEID);
6062 6077          upcount_inc(uid, zoneid);
6063 6078          mutex_exit(&pidlock);
6064 6079  
6065 6080          /*
6066 6081           * Set up core file path and content.
6067 6082           */
6068 6083          set_core_defaults();
6069 6084  
6070 6085  out:
6071 6086          /*
6072 6087           * Let the other lwps continue.
6073 6088           */
6074 6089          mutex_enter(&pp->p_lock);
6075 6090          if (curthread != pp->p_agenttp)
6076 6091                  continuelwps(pp);
6077 6092          mutex_exit(&pp->p_lock);
6078 6093  
6079 6094          return (err != 0 ? set_errno(err) : 0);
6080 6095  }
6081 6096  
6082 6097  /*
6083 6098   * Systemcall entry point for zone_list(2).
6084 6099   *
6085 6100   * Processes running in a (non-global) zone only see themselves.
6086 6101   * On labeled systems, they see all zones whose label they dominate.
6087 6102   */
6088 6103  static int
6089 6104  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6090 6105  {
6091 6106          zoneid_t *zoneids;
6092 6107          zone_t *zone, *myzone;
6093 6108          uint_t user_nzones, real_nzones;
6094 6109          uint_t domi_nzones;
6095 6110          int error;
6096 6111  
6097 6112          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6098 6113                  return (set_errno(EFAULT));
6099 6114  
6100 6115          myzone = curproc->p_zone;
6101 6116          if (myzone != global_zone) {
6102 6117                  bslabel_t *mybslab;
6103 6118  
6104 6119                  if (!is_system_labeled()) {
6105 6120                          /* just return current zone */
6106 6121                          real_nzones = domi_nzones = 1;
6107 6122                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6108 6123                          zoneids[0] = myzone->zone_id;
6109 6124                  } else {
6110 6125                          /* return all zones that are dominated */
6111 6126                          mutex_enter(&zonehash_lock);
6112 6127                          real_nzones = zonecount;
6113 6128                          domi_nzones = 0;
6114 6129                          if (real_nzones > 0) {
6115 6130                                  zoneids = kmem_alloc(real_nzones *
6116 6131                                      sizeof (zoneid_t), KM_SLEEP);
6117 6132                                  mybslab = label2bslabel(myzone->zone_slabel);
6118 6133                                  for (zone = list_head(&zone_active);
6119 6134                                      zone != NULL;
6120 6135                                      zone = list_next(&zone_active, zone)) {
6121 6136                                          if (zone->zone_id == GLOBAL_ZONEID)
6122 6137                                                  continue;
6123 6138                                          if (zone != myzone &&
6124 6139                                              (zone->zone_flags & ZF_IS_SCRATCH))
6125 6140                                                  continue;
6126 6141                                          /*
6127 6142                                           * Note that a label always dominates
6128 6143                                           * itself, so myzone is always included
6129 6144                                           * in the list.
6130 6145                                           */
6131 6146                                          if (bldominates(mybslab,
6132 6147                                              label2bslabel(zone->zone_slabel))) {
6133 6148                                                  zoneids[domi_nzones++] =
6134 6149                                                      zone->zone_id;
6135 6150                                          }
6136 6151                                  }
6137 6152                          }
6138 6153                          mutex_exit(&zonehash_lock);
6139 6154                  }
6140 6155          } else {
6141 6156                  mutex_enter(&zonehash_lock);
6142 6157                  real_nzones = zonecount;
6143 6158                  domi_nzones = 0;
6144 6159                  if (real_nzones > 0) {
6145 6160                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6146 6161                              KM_SLEEP);
6147 6162                          for (zone = list_head(&zone_active); zone != NULL;
6148 6163                              zone = list_next(&zone_active, zone))
6149 6164                                  zoneids[domi_nzones++] = zone->zone_id;
6150 6165                          ASSERT(domi_nzones == real_nzones);
6151 6166                  }
6152 6167                  mutex_exit(&zonehash_lock);
6153 6168          }
6154 6169  
6155 6170          /*
6156 6171           * If user has allocated space for fewer entries than we found, then
6157 6172           * return only up to his limit.  Either way, tell him exactly how many
6158 6173           * we found.
6159 6174           */
6160 6175          if (domi_nzones < user_nzones)
6161 6176                  user_nzones = domi_nzones;
6162 6177          error = 0;
6163 6178          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6164 6179                  error = EFAULT;
6165 6180          } else if (zoneidlist != NULL && user_nzones != 0) {
6166 6181                  if (copyout(zoneids, zoneidlist,
6167 6182                      user_nzones * sizeof (zoneid_t)) != 0)
6168 6183                          error = EFAULT;
6169 6184          }
6170 6185  
6171 6186          if (real_nzones > 0)
6172 6187                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6173 6188  
6174 6189          if (error != 0)
6175 6190                  return (set_errno(error));
6176 6191          else
6177 6192                  return (0);
6178 6193  }
6179 6194  
6180 6195  /*
6181 6196   * Systemcall entry point for zone_lookup(2).
6182 6197   *
6183 6198   * Non-global zones are only able to see themselves and (on labeled systems)
6184 6199   * the zones they dominate.
6185 6200   */
6186 6201  static zoneid_t
6187 6202  zone_lookup(const char *zone_name)
6188 6203  {
6189 6204          char *kname;
6190 6205          zone_t *zone;
6191 6206          zoneid_t zoneid;
6192 6207          int err;
6193 6208  
6194 6209          if (zone_name == NULL) {
6195 6210                  /* return caller's zone id */
6196 6211                  return (getzoneid());
6197 6212          }
6198 6213  
6199 6214          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6200 6215          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6201 6216                  kmem_free(kname, ZONENAME_MAX);
6202 6217                  return (set_errno(err));
6203 6218          }
6204 6219  
6205 6220          mutex_enter(&zonehash_lock);
6206 6221          zone = zone_find_all_by_name(kname);
6207 6222          kmem_free(kname, ZONENAME_MAX);
6208 6223          /*
6209 6224           * In a non-global zone, can only lookup global and own name.
6210 6225           * In Trusted Extensions zone label dominance rules apply.
6211 6226           */
6212 6227          if (zone == NULL ||
6213 6228              zone_status_get(zone) < ZONE_IS_READY ||
6214 6229              !zone_list_access(zone)) {
6215 6230                  mutex_exit(&zonehash_lock);
6216 6231                  return (set_errno(EINVAL));
6217 6232          } else {
6218 6233                  zoneid = zone->zone_id;
6219 6234                  mutex_exit(&zonehash_lock);
6220 6235                  return (zoneid);
6221 6236          }
6222 6237  }
6223 6238  
6224 6239  static int
6225 6240  zone_version(int *version_arg)
6226 6241  {
6227 6242          int version = ZONE_SYSCALL_API_VERSION;
6228 6243  
6229 6244          if (copyout(&version, version_arg, sizeof (int)) != 0)
6230 6245                  return (set_errno(EFAULT));
6231 6246          return (0);
6232 6247  }
6233 6248  
6234 6249  /* ARGSUSED */
6235 6250  long
6236 6251  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6237 6252  {
6238 6253          zone_def zs;
6239 6254          int err;
6240 6255  
6241 6256          switch (cmd) {
6242 6257          case ZONE_CREATE:
6243 6258                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6244 6259                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6245 6260                                  return (set_errno(EFAULT));
6246 6261                          }
6247 6262                  } else {
6248 6263  #ifdef _SYSCALL32_IMPL
6249 6264                          zone_def32 zs32;
6250 6265  
6251 6266                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6252 6267                                  return (set_errno(EFAULT));
6253 6268                          }
6254 6269                          zs.zone_name =
6255 6270                              (const char *)(unsigned long)zs32.zone_name;
6256 6271                          zs.zone_root =
6257 6272                              (const char *)(unsigned long)zs32.zone_root;
6258 6273                          zs.zone_privs =
6259 6274                              (const struct priv_set *)
6260 6275                              (unsigned long)zs32.zone_privs;
6261 6276                          zs.zone_privssz = zs32.zone_privssz;
6262 6277                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6263 6278                          zs.rctlbufsz = zs32.rctlbufsz;
6264 6279                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6265 6280                          zs.zfsbufsz = zs32.zfsbufsz;
6266 6281                          zs.extended_error =
6267 6282                              (int *)(unsigned long)zs32.extended_error;
6268 6283                          zs.match = zs32.match;
6269 6284                          zs.doi = zs32.doi;
6270 6285                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6271 6286                          zs.flags = zs32.flags;
6272 6287  #else
6273 6288                          panic("get_udatamodel() returned bogus result\n");
6274 6289  #endif
6275 6290                  }
6276 6291  
6277 6292                  return (zone_create(zs.zone_name, zs.zone_root,
6278 6293                      zs.zone_privs, zs.zone_privssz,
6279 6294                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6280 6295                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6281 6296                      zs.extended_error, zs.match, zs.doi,
6282 6297                      zs.label, zs.flags));
6283 6298          case ZONE_BOOT:
6284 6299                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6285 6300          case ZONE_DESTROY:
6286 6301                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6287 6302          case ZONE_GETATTR:
6288 6303                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6289 6304                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6290 6305          case ZONE_SETATTR:
6291 6306                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6292 6307                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6293 6308          case ZONE_ENTER:
6294 6309                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6295 6310          case ZONE_LIST:
6296 6311                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6297 6312          case ZONE_SHUTDOWN:
6298 6313                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6299 6314          case ZONE_LOOKUP:
6300 6315                  return (zone_lookup((const char *)arg1));
6301 6316          case ZONE_VERSION:
6302 6317                  return (zone_version((int *)arg1));
6303 6318          case ZONE_ADD_DATALINK:
6304 6319                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6305 6320                      (datalink_id_t)(uintptr_t)arg2));
6306 6321          case ZONE_DEL_DATALINK:
6307 6322                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6308 6323                      (datalink_id_t)(uintptr_t)arg2));
6309 6324          case ZONE_CHECK_DATALINK: {
6310 6325                  zoneid_t        zoneid;
6311 6326                  boolean_t       need_copyout;
6312 6327  
6313 6328                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6314 6329                          return (EFAULT);
6315 6330                  need_copyout = (zoneid == ALL_ZONES);
6316 6331                  err = zone_check_datalink(&zoneid,
6317 6332                      (datalink_id_t)(uintptr_t)arg2);
6318 6333                  if (err == 0 && need_copyout) {
6319 6334                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6320 6335                                  err = EFAULT;
6321 6336                  }
6322 6337                  return (err == 0 ? 0 : set_errno(err));
6323 6338          }
6324 6339          case ZONE_LIST_DATALINK:
6325 6340                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6326 6341                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6327 6342          default:
6328 6343                  return (set_errno(EINVAL));
6329 6344          }
6330 6345  }
6331 6346  
/*
 * Argument bundle passed from zone_kadmin() to the zone_ki_call_zoneadmd()
 * kernel thread.  zone_kadmin() allocates it; the thread copies both members
 * out and frees it.
 */
struct zarg {
	zone_t *zone;		/* target zone; zone_kadmin() takes a hold */
	zone_cmd_arg_t arg;	/* command block sent through zoneadmd's door */
};
6336 6351  
6337 6352  static int
6338 6353  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6339 6354  {
6340 6355          char *buf;
6341 6356          size_t buflen;
6342 6357          int error;
6343 6358  
6344 6359          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6345 6360          buf = kmem_alloc(buflen, KM_SLEEP);
6346 6361          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6347 6362          error = door_ki_open(buf, doorp);
6348 6363          kmem_free(buf, buflen);
6349 6364          return (error);
6350 6365  }
6351 6366  
6352 6367  static void
6353 6368  zone_release_door(door_handle_t *doorp)
6354 6369  {
6355 6370          door_ki_rele(*doorp);
6356 6371          *doorp = NULL;
6357 6372  }
6358 6373  
/*
 * Body of the kernel thread started by zone_kadmin().
 *
 * Copies the zone pointer and command out of zargp and frees zargp, empties
 * the zone, drops the zone hold that zone_kadmin() took, and then keeps
 * trying to deliver the command to zoneadmd through its door.  The loop ends
 * when the upcall succeeds, when it fails with an error other than EINTR,
 * EAGAIN or EBADF, or when the zone with the saved id and uniqid can no
 * longer be found.  The function does not return; it ends in thread_exit().
 */
static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;	/* loop counter; incremented but not otherwise read */

	/* Take private copies so the caller's allocation can be freed now. */
	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	/*
	 * Save everything we need from the zone_t (name, id, uniqid) before
	 * the hold is dropped below.
	 */
	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	/* The same buffer carries the request and receives the reply. */
	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	/* Pristine copy, used to reset darg before every retry. */
	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get a
	 * chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		/* (Re)open the door if we do not currently have one. */
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall_limited(door, &darg, NULL,
		    SIZE_MAX, 0)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			/*
			 * Drop the stale handle and try to reopen it.  Both
			 * outcomes fall through to the retry path below.
			 */
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back to
				 * life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in mind,
		 * then this is the same as if two kadmin requests come in at
		 * the same time: the first one wins.  This means we lose, so we
		 * bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	/* Common cleanup for both loop exits and the unexpected-error path. */
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}
6478 6493  
6479 6494  /*
6480 6495   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6481 6496   * kadmin().  The caller is a process in the zone.
6482 6497   *
6483 6498   * In order to shutdown the zone, we will hand off control to zoneadmd
6484 6499   * (running in the global zone) via a door.  We do a half-hearted job at
6485 6500   * killing all processes in the zone, create a kernel thread to contact
6486 6501   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6487 6502   * a form of generation number used to let zoneadmd (as well as
6488 6503   * zone_destroy()) know exactly which zone they're re talking about.
6489 6504   */
6490 6505  int
6491 6506  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6492 6507  {
6493 6508          struct zarg *zargp;
6494 6509          zone_cmd_t zcmd;
6495 6510          zone_t *zone;
6496 6511  
6497 6512          zone = curproc->p_zone;
6498 6513          ASSERT(getzoneid() != GLOBAL_ZONEID);
6499 6514  
6500 6515          switch (cmd) {
6501 6516          case A_SHUTDOWN:
6502 6517                  switch (fcn) {
6503 6518                  case AD_HALT:
6504 6519                  case AD_POWEROFF:
6505 6520                          zcmd = Z_HALT;
6506 6521                          break;
6507 6522                  case AD_BOOT:
6508 6523                          zcmd = Z_REBOOT;
6509 6524                          break;
6510 6525                  case AD_IBOOT:
6511 6526                  case AD_SBOOT:
6512 6527                  case AD_SIBOOT:
6513 6528                  case AD_NOSYNC:
6514 6529                          return (ENOTSUP);
6515 6530                  default:
6516 6531                          return (EINVAL);
6517 6532                  }
6518 6533                  break;
6519 6534          case A_REBOOT:
6520 6535                  zcmd = Z_REBOOT;
6521 6536                  break;
6522 6537          case A_FTRACE:
6523 6538          case A_REMOUNT:
6524 6539          case A_FREEZE:
6525 6540          case A_DUMP:
6526 6541          case A_CONFIG:
6527 6542                  return (ENOTSUP);
6528 6543          default:
6529 6544                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6530 6545                  return (EINVAL);
6531 6546          }
6532 6547  
6533 6548          if (secpolicy_zone_admin(credp, B_FALSE))
6534 6549                  return (EPERM);
6535 6550          mutex_enter(&zone_status_lock);
6536 6551  
6537 6552          /*
6538 6553           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6539 6554           * is in the zone.
6540 6555           */
6541 6556          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6542 6557          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6543 6558                  /*
6544 6559                   * This zone is already on its way down.
6545 6560                   */
6546 6561                  mutex_exit(&zone_status_lock);
6547 6562                  return (0);
6548 6563          }
6549 6564          /*
6550 6565           * Prevent future zone_enter()s
6551 6566           */
6552 6567          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6553 6568          mutex_exit(&zone_status_lock);
6554 6569  
6555 6570          /*
6556 6571           * Kill everyone now and call zoneadmd later.
6557 6572           * zone_ki_call_zoneadmd() will do a more thorough job of this
6558 6573           * later.
6559 6574           */
6560 6575          killall(zone->zone_id);
6561 6576          /*
6562 6577           * Now, create the thread to contact zoneadmd and do the rest of the
6563 6578           * work.  This thread can't be created in our zone otherwise
6564 6579           * zone_destroy() would deadlock.
6565 6580           */
6566 6581          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6567 6582          zargp->arg.cmd = zcmd;
6568 6583          zargp->arg.uniqid = zone->zone_uniqid;
6569 6584          zargp->zone = zone;
6570 6585          (void) strcpy(zargp->arg.locale, "C");
6571 6586          /* mdep was already copied in for us by uadmin */
6572 6587          if (mdep != NULL)
6573 6588                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6574 6589                      sizeof (zargp->arg.bootbuf));
6575 6590          zone_hold(zone);
6576 6591  
6577 6592          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6578 6593              TS_RUN, minclsyspri);
6579 6594          exit(CLD_EXITED, 0);
6580 6595  
6581 6596          return (EINVAL);
6582 6597  }
6583 6598  
6584 6599  /*
6585 6600   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6586 6601   * status to ZONE_IS_SHUTTING_DOWN.
6587 6602   *
6588 6603   * This function also shuts down all running zones to ensure that they won't
6589 6604   * fork new processes.
6590 6605   */
6591 6606  void
6592 6607  zone_shutdown_global(void)
6593 6608  {
6594 6609          zone_t *current_zonep;
6595 6610  
6596 6611          ASSERT(INGLOBALZONE(curproc));
6597 6612          mutex_enter(&zonehash_lock);
6598 6613          mutex_enter(&zone_status_lock);
6599 6614  
6600 6615          /* Modify the global zone's status first. */
6601 6616          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6602 6617          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6603 6618  
6604 6619          /*
6605 6620           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6606 6621           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6607 6622           * could cause assertions to fail (e.g., assertions about a zone's
6608 6623           * state during initialization, readying, or booting) or produce races.
6609 6624           * We'll let threads continue to initialize and ready new zones: they'll
6610 6625           * fail to boot the new zones when they see that the global zone is
6611 6626           * shutting down.
6612 6627           */
6613 6628          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6614 6629              current_zonep = list_next(&zone_active, current_zonep)) {
6615 6630                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6616 6631                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6617 6632          }
6618 6633          mutex_exit(&zone_status_lock);
6619 6634          mutex_exit(&zonehash_lock);
6620 6635  }
6621 6636  
6622 6637  /*
6623 6638   * Returns true if the named dataset is visible in the current zone.
6624 6639   * The 'write' parameter is set to 1 if the dataset is also writable.
6625 6640   */
6626 6641  int
6627 6642  zone_dataset_visible(const char *dataset, int *write)
6628 6643  {
6629 6644          static int zfstype = -1;
6630 6645          zone_dataset_t *zd;
6631 6646          size_t len;
6632 6647          zone_t *zone = curproc->p_zone;
6633 6648          const char *name = NULL;
6634 6649          vfs_t *vfsp = NULL;
6635 6650  
6636 6651          if (dataset[0] == '\0')
6637 6652                  return (0);
6638 6653  
6639 6654          /*
6640 6655           * Walk the list once, looking for datasets which match exactly, or
6641 6656           * specify a dataset underneath an exported dataset.  If found, return
6642 6657           * true and note that it is writable.
6643 6658           */
6644 6659          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6645 6660              zd = list_next(&zone->zone_datasets, zd)) {
6646 6661  
6647 6662                  len = strlen(zd->zd_dataset);
6648 6663                  if (strlen(dataset) >= len &&
6649 6664                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6650 6665                      (dataset[len] == '\0' || dataset[len] == '/' ||
6651 6666                      dataset[len] == '@')) {
6652 6667                          if (write)
6653 6668                                  *write = 1;
6654 6669                          return (1);
6655 6670                  }
6656 6671          }
6657 6672  
6658 6673          /*
6659 6674           * Walk the list a second time, searching for datasets which are parents
6660 6675           * of exported datasets.  These should be visible, but read-only.
6661 6676           *
6662 6677           * Note that we also have to support forms such as 'pool/dataset/', with
6663 6678           * a trailing slash.
6664 6679           */
6665 6680          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6666 6681              zd = list_next(&zone->zone_datasets, zd)) {
6667 6682  
6668 6683                  len = strlen(dataset);
6669 6684                  if (dataset[len - 1] == '/')
6670 6685                          len--;  /* Ignore trailing slash */
6671 6686                  if (len < strlen(zd->zd_dataset) &&
6672 6687                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6673 6688                      zd->zd_dataset[len] == '/') {
6674 6689                          if (write)
6675 6690                                  *write = 0;
6676 6691                          return (1);
6677 6692                  }
6678 6693          }
6679 6694  
6680 6695          /*
6681 6696           * We reach here if the given dataset is not found in the zone_dataset
6682 6697           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6683 6698           * instead of delegation. For this we search for the dataset in the
6684 6699           * zone_vfslist of this zone. If found, return true and note that it is
6685 6700           * not writable.
6686 6701           */
6687 6702  
6688 6703          /*
6689 6704           * Initialize zfstype if it is not initialized yet.
6690 6705           */
6691 6706          if (zfstype == -1) {
6692 6707                  struct vfssw *vswp = vfs_getvfssw("zfs");
6693 6708                  zfstype = vswp - vfssw;
6694 6709                  vfs_unrefvfssw(vswp);
6695 6710          }
6696 6711  
6697 6712          vfs_list_read_lock();
6698 6713          vfsp = zone->zone_vfslist;
6699 6714          do {
6700 6715                  ASSERT(vfsp);
6701 6716                  if (vfsp->vfs_fstype == zfstype) {
6702 6717                          name = refstr_value(vfsp->vfs_resource);
6703 6718  
6704 6719                          /*
6705 6720                           * Check if we have an exact match.
6706 6721                           */
6707 6722                          if (strcmp(dataset, name) == 0) {
6708 6723                                  vfs_list_unlock();
6709 6724                                  if (write)
6710 6725                                          *write = 0;
6711 6726                                  return (1);
6712 6727                          }
6713 6728                          /*
6714 6729                           * We need to check if we are looking for parents of
6715 6730                           * a dataset. These should be visible, but read-only.
6716 6731                           */
6717 6732                          len = strlen(dataset);
6718 6733                          if (dataset[len - 1] == '/')
6719 6734                                  len--;
6720 6735  
6721 6736                          if (len < strlen(name) &&
6722 6737                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6723 6738                                  vfs_list_unlock();
6724 6739                                  if (write)
6725 6740                                          *write = 0;
6726 6741                                  return (1);
6727 6742                          }
6728 6743                  }
6729 6744                  vfsp = vfsp->vfs_zone_next;
6730 6745          } while (vfsp != zone->zone_vfslist);
6731 6746  
6732 6747          vfs_list_unlock();
6733 6748          return (0);
6734 6749  }
6735 6750  
6736 6751  /*
6737 6752   * zone_find_by_any_path() -
6738 6753   *
6739 6754   * kernel-private routine similar to zone_find_by_path(), but which
6740 6755   * effectively compares against zone paths rather than zonerootpath
6741 6756   * (i.e., the last component of zonerootpaths, which should be "root/",
6742 6757   * are not compared.)  This is done in order to accurately identify all
6743 6758   * paths, whether zone-visible or not, including those which are parallel
6744 6759   * to /root/, such as /dev/, /home/, etc...
6745 6760   *
6746 6761   * If the specified path does not fall under any zone path then global
6747 6762   * zone is returned.
6748 6763   *
6749 6764   * The treat_abs parameter indicates whether the path should be treated as
6750 6765   * an absolute path although it does not begin with "/".  (This supports
6751 6766   * nfs mount syntax such as host:any/path.)
6752 6767   *
6753 6768   * The caller is responsible for zone_rele of the returned zone.
6754 6769   */
6755 6770  zone_t *
6756 6771  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6757 6772  {
6758 6773          zone_t *zone;
6759 6774          int path_offset = 0;
6760 6775  
6761 6776          if (path == NULL) {
6762 6777                  zone_hold(global_zone);
6763 6778                  return (global_zone);
6764 6779          }
6765 6780  
6766 6781          if (*path != '/') {
6767 6782                  ASSERT(treat_abs);
6768 6783                  path_offset = 1;
6769 6784          }
6770 6785  
6771 6786          mutex_enter(&zonehash_lock);
6772 6787          for (zone = list_head(&zone_active); zone != NULL;
6773 6788              zone = list_next(&zone_active, zone)) {
6774 6789                  char    *c;
6775 6790                  size_t  pathlen;
6776 6791                  char *rootpath_start;
6777 6792  
6778 6793                  if (zone == global_zone)        /* skip global zone */
6779 6794                          continue;
6780 6795  
6781 6796                  /* scan backwards to find start of last component */
6782 6797                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6783 6798                  do {
6784 6799                          c--;
6785 6800                  } while (*c != '/');
6786 6801  
6787 6802                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
6788 6803                  rootpath_start = (zone->zone_rootpath + path_offset);
6789 6804                  if (strncmp(path, rootpath_start, pathlen) == 0)
6790 6805                          break;
6791 6806          }
6792 6807          if (zone == NULL)
6793 6808                  zone = global_zone;
6794 6809          zone_hold(zone);
6795 6810          mutex_exit(&zonehash_lock);
6796 6811          return (zone);
6797 6812  }
6798 6813  
6799 6814  /*
6800 6815   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6801 6816   * zone_dl_t pointer if found, and NULL otherwise.
6802 6817   */
6803 6818  static zone_dl_t *
6804 6819  zone_find_dl(zone_t *zone, datalink_id_t linkid)
6805 6820  {
6806 6821          zone_dl_t *zdl;
6807 6822  
6808 6823          ASSERT(mutex_owned(&zone->zone_lock));
6809 6824          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6810 6825              zdl = list_next(&zone->zone_dl_list, zdl)) {
6811 6826                  if (zdl->zdl_id == linkid)
6812 6827                          break;
6813 6828          }
6814 6829          return (zdl);
6815 6830  }
6816 6831  
6817 6832  static boolean_t
6818 6833  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6819 6834  {
6820 6835          boolean_t exists;
6821 6836  
6822 6837          mutex_enter(&zone->zone_lock);
6823 6838          exists = (zone_find_dl(zone, linkid) != NULL);
6824 6839          mutex_exit(&zone->zone_lock);
6825 6840          return (exists);
6826 6841  }
6827 6842  
6828 6843  /*
6829 6844   * Add an data link name for the zone.
6830 6845   */
6831 6846  static int
6832 6847  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6833 6848  {
6834 6849          zone_dl_t *zdl;
6835 6850          zone_t *zone;
6836 6851          zone_t *thiszone;
6837 6852  
6838 6853          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6839 6854                  return (set_errno(ENXIO));
6840 6855  
6841 6856          /* Verify that the datalink ID doesn't already belong to a zone. */
6842 6857          mutex_enter(&zonehash_lock);
6843 6858          for (zone = list_head(&zone_active); zone != NULL;
6844 6859              zone = list_next(&zone_active, zone)) {
6845 6860                  if (zone_dl_exists(zone, linkid)) {
6846 6861                          mutex_exit(&zonehash_lock);
6847 6862                          zone_rele(thiszone);
6848 6863                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6849 6864                  }
6850 6865          }
6851 6866  
6852 6867          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6853 6868          zdl->zdl_id = linkid;
6854 6869          zdl->zdl_net = NULL;
6855 6870          mutex_enter(&thiszone->zone_lock);
6856 6871          list_insert_head(&thiszone->zone_dl_list, zdl);
6857 6872          mutex_exit(&thiszone->zone_lock);
6858 6873          mutex_exit(&zonehash_lock);
6859 6874          zone_rele(thiszone);
6860 6875          return (0);
6861 6876  }
6862 6877  
6863 6878  static int
6864 6879  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6865 6880  {
6866 6881          zone_dl_t *zdl;
6867 6882          zone_t *zone;
6868 6883          int err = 0;
6869 6884  
6870 6885          if ((zone = zone_find_by_id(zoneid)) == NULL)
6871 6886                  return (set_errno(EINVAL));
6872 6887  
6873 6888          mutex_enter(&zone->zone_lock);
6874 6889          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6875 6890                  err = ENXIO;
6876 6891          } else {
6877 6892                  list_remove(&zone->zone_dl_list, zdl);
6878 6893                  if (zdl->zdl_net != NULL)
6879 6894                          nvlist_free(zdl->zdl_net);
6880 6895                  kmem_free(zdl, sizeof (zone_dl_t));
6881 6896          }
6882 6897          mutex_exit(&zone->zone_lock);
6883 6898          zone_rele(zone);
6884 6899          return (err == 0 ? 0 : set_errno(err));
6885 6900  }
6886 6901  
6887 6902  /*
6888 6903   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6889 6904   * the linkid.  Otherwise we just check if the specified zoneidp has been
6890 6905   * assigned the supplied linkid.
6891 6906   */
6892 6907  int
6893 6908  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6894 6909  {
6895 6910          zone_t *zone;
6896 6911          int err = ENXIO;
6897 6912  
6898 6913          if (*zoneidp != ALL_ZONES) {
6899 6914                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6900 6915                          if (zone_dl_exists(zone, linkid))
6901 6916                                  err = 0;
6902 6917                          zone_rele(zone);
6903 6918                  }
6904 6919                  return (err);
6905 6920          }
6906 6921  
6907 6922          mutex_enter(&zonehash_lock);
6908 6923          for (zone = list_head(&zone_active); zone != NULL;
6909 6924              zone = list_next(&zone_active, zone)) {
6910 6925                  if (zone_dl_exists(zone, linkid)) {
6911 6926                          *zoneidp = zone->zone_id;
6912 6927                          err = 0;
6913 6928                          break;
6914 6929                  }
6915 6930          }
6916 6931          mutex_exit(&zonehash_lock);
6917 6932          return (err);
6918 6933  }
6919 6934  
6920 6935  /*
6921 6936   * Get the list of datalink IDs assigned to a zone.
6922 6937   *
6923 6938   * On input, *nump is the number of datalink IDs that can fit in the supplied
6924 6939   * idarray.  Upon return, *nump is either set to the number of datalink IDs
6925 6940   * that were placed in the array if the array was large enough, or to the
6926 6941   * number of datalink IDs that the function needs to place in the array if the
6927 6942   * array is too small.
6928 6943   */
6929 6944  static int
6930 6945  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6931 6946  {
6932 6947          uint_t num, dlcount;
6933 6948          zone_t *zone;
6934 6949          zone_dl_t *zdl;
6935 6950          datalink_id_t *idptr = idarray;
6936 6951  
6937 6952          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6938 6953                  return (set_errno(EFAULT));
6939 6954          if ((zone = zone_find_by_id(zoneid)) == NULL)
6940 6955                  return (set_errno(ENXIO));
6941 6956  
6942 6957          num = 0;
6943 6958          mutex_enter(&zone->zone_lock);
6944 6959          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6945 6960              zdl = list_next(&zone->zone_dl_list, zdl)) {
6946 6961                  /*
6947 6962                   * If the list is bigger than what the caller supplied, just
6948 6963                   * count, don't do copyout.
6949 6964                   */
6950 6965                  if (++num > dlcount)
6951 6966                          continue;
6952 6967                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6953 6968                          mutex_exit(&zone->zone_lock);
6954 6969                          zone_rele(zone);
6955 6970                          return (set_errno(EFAULT));
6956 6971                  }
6957 6972                  idptr++;
6958 6973          }
6959 6974          mutex_exit(&zone->zone_lock);
6960 6975          zone_rele(zone);
6961 6976  
6962 6977          /* Increased or decreased, caller should be notified. */
6963 6978          if (num != dlcount) {
6964 6979                  if (copyout(&num, nump, sizeof (num)) != 0)
6965 6980                          return (set_errno(EFAULT));
6966 6981          }
6967 6982          return (0);
6968 6983  }
6969 6984  
6970 6985  /*
6971 6986   * Public interface for looking up a zone by zoneid. It's a customized version
6972 6987   * for netstack_zone_create(). It can only be called from the zsd create
6973 6988   * callbacks, since it doesn't have reference on the zone structure hence if
6974 6989   * it is called elsewhere the zone could disappear after the zonehash_lock
6975 6990   * is dropped.
6976 6991   *
6977 6992   * Furthermore it
6978 6993   * 1. Doesn't check the status of the zone.
6979 6994   * 2. It will be called even before zone_init is called, in that case the
6980 6995   *    address of zone0 is returned directly, and netstack_zone_create()
6981 6996   *    will only assign a value to zone0.zone_netstack, won't break anything.
6982 6997   * 3. Returns without the zone being held.
6983 6998   */
6984 6999  zone_t *
6985 7000  zone_find_by_id_nolock(zoneid_t zoneid)
6986 7001  {
6987 7002          zone_t *zone;
6988 7003  
6989 7004          mutex_enter(&zonehash_lock);
6990 7005          if (zonehashbyid == NULL)
6991 7006                  zone = &zone0;
6992 7007          else
6993 7008                  zone = zone_find_all_by_id(zoneid);
6994 7009          mutex_exit(&zonehash_lock);
6995 7010          return (zone);
6996 7011  }
6997 7012  
6998 7013  /*
6999 7014   * Walk the datalinks for a given zone
7000 7015   */
7001 7016  int
7002 7017  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7003 7018      void *data)
7004 7019  {
7005 7020          zone_t          *zone;
7006 7021          zone_dl_t       *zdl;
7007 7022          datalink_id_t   *idarray;
7008 7023          uint_t          idcount = 0;
7009 7024          int             i, ret = 0;
7010 7025  
7011 7026          if ((zone = zone_find_by_id(zoneid)) == NULL)
7012 7027                  return (ENOENT);
7013 7028  
7014 7029          /*
7015 7030           * We first build an array of linkid's so that we can walk these and
7016 7031           * execute the callback with the zone_lock dropped.
7017 7032           */
7018 7033          mutex_enter(&zone->zone_lock);
7019 7034          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7020 7035              zdl = list_next(&zone->zone_dl_list, zdl)) {
7021 7036                  idcount++;
7022 7037          }
7023 7038  
7024 7039          if (idcount == 0) {
7025 7040                  mutex_exit(&zone->zone_lock);
7026 7041                  zone_rele(zone);
7027 7042                  return (0);
7028 7043          }
7029 7044  
7030 7045          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7031 7046          if (idarray == NULL) {
7032 7047                  mutex_exit(&zone->zone_lock);
7033 7048                  zone_rele(zone);
7034 7049                  return (ENOMEM);
7035 7050          }
7036 7051  
7037 7052          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7038 7053              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7039 7054                  idarray[i] = zdl->zdl_id;
7040 7055          }
7041 7056  
7042 7057          mutex_exit(&zone->zone_lock);
7043 7058  
7044 7059          for (i = 0; i < idcount && ret == 0; i++) {
7045 7060                  if ((ret = (*cb)(idarray[i], data)) != 0)
7046 7061                          break;
7047 7062          }
7048 7063  
7049 7064          zone_rele(zone);
7050 7065          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7051 7066          return (ret);
7052 7067  }
7053 7068  
7054 7069  static char *
7055 7070  zone_net_type2name(int type)
7056 7071  {
7057 7072          switch (type) {
7058 7073          case ZONE_NETWORK_ADDRESS:
7059 7074                  return (ZONE_NET_ADDRNAME);
7060 7075          case ZONE_NETWORK_DEFROUTER:
7061 7076                  return (ZONE_NET_RTRNAME);
7062 7077          default:
7063 7078                  return (NULL);
7064 7079          }
7065 7080  }
7066 7081  
7067 7082  static int
7068 7083  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7069 7084  {
7070 7085          zone_t *zone;
7071 7086          zone_dl_t *zdl;
7072 7087          nvlist_t *nvl;
7073 7088          int err = 0;
7074 7089          uint8_t *new = NULL;
7075 7090          char *nvname;
7076 7091          int bufsize;
7077 7092          datalink_id_t linkid = znbuf->zn_linkid;
7078 7093  
7079 7094          if (secpolicy_zone_config(CRED()) != 0)
7080 7095                  return (set_errno(EPERM));
7081 7096  
7082 7097          if (zoneid == GLOBAL_ZONEID)
7083 7098                  return (set_errno(EINVAL));
7084 7099  
7085 7100          nvname = zone_net_type2name(znbuf->zn_type);
7086 7101          bufsize = znbuf->zn_len;
7087 7102          new = znbuf->zn_val;
7088 7103          if (nvname == NULL)
7089 7104                  return (set_errno(EINVAL));
7090 7105  
7091 7106          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7092 7107                  return (set_errno(EINVAL));
7093 7108          }
7094 7109  
7095 7110          mutex_enter(&zone->zone_lock);
7096 7111          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7097 7112                  err = ENXIO;
7098 7113                  goto done;
7099 7114          }
7100 7115          if ((nvl = zdl->zdl_net) == NULL) {
7101 7116                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7102 7117                          err = ENOMEM;
7103 7118                          goto done;
7104 7119                  } else {
7105 7120                          zdl->zdl_net = nvl;
7106 7121                  }
7107 7122          }
7108 7123          if (nvlist_exists(nvl, nvname)) {
7109 7124                  err = EINVAL;
7110 7125                  goto done;
7111 7126          }
7112 7127          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7113 7128          ASSERT(err == 0);
7114 7129  done:
7115 7130          mutex_exit(&zone->zone_lock);
7116 7131          zone_rele(zone);
7117 7132          if (err != 0)
7118 7133                  return (set_errno(err));
7119 7134          else
7120 7135                  return (0);
7121 7136  }
7122 7137  
7123 7138  static int
7124 7139  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7125 7140  {
7126 7141          zone_t *zone;
7127 7142          zone_dl_t *zdl;
7128 7143          nvlist_t *nvl;
7129 7144          uint8_t *ptr;
7130 7145          uint_t psize;
7131 7146          int err = 0;
7132 7147          char *nvname;
7133 7148          int bufsize;
7134 7149          void *buf;
7135 7150          datalink_id_t linkid = znbuf->zn_linkid;
7136 7151  
7137 7152          if (zoneid == GLOBAL_ZONEID)
7138 7153                  return (set_errno(EINVAL));
7139 7154  
7140 7155          nvname = zone_net_type2name(znbuf->zn_type);
7141 7156          bufsize = znbuf->zn_len;
7142 7157          buf = znbuf->zn_val;
7143 7158  
7144 7159          if (nvname == NULL)
7145 7160                  return (set_errno(EINVAL));
7146 7161          if ((zone = zone_find_by_id(zoneid)) == NULL)
7147 7162                  return (set_errno(EINVAL));
7148 7163  
7149 7164          mutex_enter(&zone->zone_lock);
7150 7165          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7151 7166                  err = ENXIO;
7152 7167                  goto done;
7153 7168          }
7154 7169          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7155 7170                  err = ENOENT;
7156 7171                  goto done;
7157 7172          }
7158 7173          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7159 7174          ASSERT(err == 0);
7160 7175  
7161 7176          if (psize > bufsize) {
7162 7177                  err = ENOBUFS;
7163 7178                  goto done;
7164 7179          }
7165 7180          znbuf->zn_len = psize;
7166 7181          bcopy(ptr, buf, psize);
7167 7182  done:
7168 7183          mutex_exit(&zone->zone_lock);
7169 7184          zone_rele(zone);
7170 7185          if (err != 0)
7171 7186                  return (set_errno(err));
7172 7187          else
7173 7188                  return (0);
7174 7189  }

↓ open down ↓

4882 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX