Print this page
    
Add VZONEROOT flag because not all zone roots have VROOT set.
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015, Joyent Inc. All rights reserved.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27   27   */
  28   28  
  29   29  /*
  30   30   * Zones
  31   31   *
  32   32   *   A zone is a named collection of processes, namespace constraints,
  33   33   *   and other system resources which comprise a secure and manageable
  34   34   *   application containment facility.
  35   35   *
  36   36   *   Zones (represented by the reference counted zone_t) are tracked in
  37   37   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38   38   *   (zoneid_t) are used to track zone association.  Zone IDs are
  39   39   *   dynamically generated when the zone is created; if a persistent
  40   40   *   identifier is needed (core files, accounting logs, audit trail,
  41   41   *   etc.), the zone name should be used.
  42   42   *
  43   43   *
  44   44   *   Global Zone:
  45   45   *
  46   46   *   The global zone (zoneid 0) is automatically associated with all
  47   47   *   system resources that have not been bound to a user-created zone.
  48   48   *   This means that even systems where zones are not in active use
  49   49   *   have a global zone, and all processes, mounts, etc. are
  50   50   *   associated with that zone.  The global zone is generally
  51   51   *   unconstrained in terms of privileges and access, though the usual
  52   52   *   credential and privilege based restrictions apply.
  53   53   *
  54   54   *
  55   55   *   Zone States:
  56   56   *
  57   57   *   The states a zone may be in, and the transitions between them, are
  58   58   *   as follows:
  59   59   *
  60   60   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61   61   *   initialized zone is added to the list of active zones on the system but
  62   62   *   isn't accessible.
  63   63   *
  64   64   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65   65   *   not yet completed. Not possible to enter the zone, but attributes can
  66   66   *   be retrieved.
  67   67   *
  68   68   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69   69   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70   70   *   executed.  A zone remains in this state until it transitions into
  71   71   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72   72   *
  73   73   *   ZONE_IS_BOOTING: in this shortlived-state, zsched attempts to start
  74   74   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75   75   *   state.
  76   76   *
  77   77   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78   78   *   successfully started init.   A zone remains in this state until
  79   79   *   zone_shutdown() is called.
  80   80   *
  81   81   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82   82   *   killing all processes running in the zone. The zone remains
  83   83   *   in this state until there are no more user processes running in the zone.
  84   84   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85   85   *   Since zone_shutdown() is restartable, it may be called successfully
  86   86   *   multiple times for the same zone_t.  Setting of the zone's state to
  87   87   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88   88   *   the zone's status without worrying about it being a moving target.
  89   89   *
  90   90   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91   91   *   are no more user processes in the zone.  The zone remains in this
  92   92   *   state until there are no more kernel threads associated with the
  93   93   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94   94   *   fail.
  95   95   *
  96   96   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97   97   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98   98   *   join the zone or create kernel threads therein.
  99   99   *
 100  100   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  101   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  102   *   return NULL from now on.
 103  103   *
 104  104   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  105   *   processes or threads doing work on behalf of the zone.  The zone is
 106  106   *   removed from the list of active zones.  zone_destroy() returns, and
 107  107   *   the zone can be recreated.
 108  108   *
 109  109   *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 110  110   *   callbacks are executed, and all memory associated with the zone is
 111  111   *   freed.
 112  112   *
 113  113   *   Threads can wait for the zone to enter a requested state by using
 114  114   *   zone_status_wait() or zone_status_timedwait() with the desired
 115  115   *   state passed in as an argument.  Zone state transitions are
 116  116   *   uni-directional; it is not possible to move back to an earlier state.
 117  117   *
 118  118   *
 119  119   *   Zone-Specific Data:
 120  120   *
 121  121   *   Subsystems needing to maintain zone-specific data can store that
 122  122   *   data using the ZSD mechanism.  This provides a zone-specific data
 123  123   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 124  124   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 125  125   *   to register callbacks to be invoked when a zone is created, shut
 126  126   *   down, or destroyed.  This can be used to initialize zone-specific
 127  127   *   data for new zones and to clean up when zones go away.
 128  128   *
 129  129   *
 130  130   *   Data Structures:
 131  131   *
 132  132   *   The per-zone structure (zone_t) is reference counted, and freed
 133  133   *   when all references are released.  zone_hold and zone_rele can be
 134  134   *   used to adjust the reference count.  In addition, reference counts
 135  135   *   associated with the cred_t structure are tracked separately using
 136  136   *   zone_cred_hold and zone_cred_rele.
 137  137   *
 138  138   *   Pointers to active zone_t's are stored in two hash tables; one
 139  139   *   for searching by id, the other for searching by name.  Lookups
 140  140   *   can be performed on either basis, using zone_find_by_id and
 141  141   *   zone_find_by_name.  Both return zone_t pointers with the zone
 142  142   *   held, so zone_rele should be called when the pointer is no longer
 143  143   *   needed.  Zones can also be searched by path; zone_find_by_path
 144  144   *   returns the zone with which a path name is associated (global
 145  145   *   zone if the path is not within some other zone's file system
 146  146   *   hierarchy).  This currently requires iterating through each zone,
 147  147   *   so it is slower than an id or name search via a hash table.
 148  148   *
 149  149   *
 150  150   *   Locking:
 151  151   *
 152  152   *   zonehash_lock: This is a top-level global lock used to protect the
 153  153   *       zone hash tables and lists.  Zones cannot be created or destroyed
 154  154   *       while this lock is held.
 155  155   *   zone_status_lock: This is a global lock protecting zone state.
 156  156   *       Zones cannot change state while this lock is held.  It also
 157  157   *       protects the list of kernel threads associated with a zone.
 158  158   *   zone_lock: This is a per-zone lock used to protect several fields of
 159  159   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 160  160   *       this lock means that the zone cannot go away.
 161  161   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 162  162   *       related to the zone.max-lwps rctl.
 163  163   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 164  164   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 165  165   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 166  166   *       currently just max_lofi
 167  167   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 168  168   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 169  169   *       list (a list of zones in the ZONE_IS_DEAD state).
 170  170   *
 171  171   *   Ordering requirements:
 172  172   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 173  173   *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 174  174   *
 175  175   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 176  176   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 177  177   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 178  178   *
 179  179   *   Blocking memory allocations are permitted while holding any of the
 180  180   *   zone locks.
 181  181   *
 182  182   *
 183  183   *   System Call Interface:
 184  184   *
 185  185   *   The zone subsystem can be managed and queried from user level with
 186  186   *   the following system calls (all subcodes of the primary "zone"
 187  187   *   system call):
 188  188   *   - zone_create: creates a zone with selected attributes (name,
 189  189   *     root path, privileges, resource controls, ZFS datasets)
 190  190   *   - zone_enter: allows the current process to enter a zone
 191  191   *   - zone_getattr: reports attributes of a zone
 192  192   *   - zone_setattr: set attributes of a zone
 193  193   *   - zone_boot: set 'init' running for the zone
 194  194   *   - zone_list: lists all zones active in the system
 195  195   *   - zone_lookup: looks up zone id based on name
 196  196   *   - zone_shutdown: initiates shutdown process (see states above)
 197  197   *   - zone_destroy: completes shutdown process (see states above)
 198  198   *
 199  199   */
 200  200  
 201  201  #include <sys/priv_impl.h>
 202  202  #include <sys/cred.h>
 203  203  #include <c2/audit.h>
 204  204  #include <sys/debug.h>
 205  205  #include <sys/file.h>
 206  206  #include <sys/kmem.h>
 207  207  #include <sys/kstat.h>
 208  208  #include <sys/mutex.h>
 209  209  #include <sys/note.h>
 210  210  #include <sys/pathname.h>
 211  211  #include <sys/proc.h>
 212  212  #include <sys/project.h>
 213  213  #include <sys/sysevent.h>
 214  214  #include <sys/task.h>
 215  215  #include <sys/systm.h>
 216  216  #include <sys/types.h>
 217  217  #include <sys/utsname.h>
 218  218  #include <sys/vnode.h>
 219  219  #include <sys/vfs.h>
 220  220  #include <sys/systeminfo.h>
 221  221  #include <sys/policy.h>
 222  222  #include <sys/cred_impl.h>
 223  223  #include <sys/contract_impl.h>
 224  224  #include <sys/contract/process_impl.h>
 225  225  #include <sys/class.h>
 226  226  #include <sys/pool.h>
 227  227  #include <sys/pool_pset.h>
 228  228  #include <sys/pset.h>
 229  229  #include <sys/strlog.h>
 230  230  #include <sys/sysmacros.h>
 231  231  #include <sys/callb.h>
 232  232  #include <sys/vmparam.h>
 233  233  #include <sys/corectl.h>
 234  234  #include <sys/ipc_impl.h>
 235  235  #include <sys/klpd.h>
 236  236  
 237  237  #include <sys/door.h>
 238  238  #include <sys/cpuvar.h>
 239  239  #include <sys/sdt.h>
 240  240  
 241  241  #include <sys/uadmin.h>
 242  242  #include <sys/session.h>
 243  243  #include <sys/cmn_err.h>
 244  244  #include <sys/modhash.h>
 245  245  #include <sys/sunddi.h>
 246  246  #include <sys/nvpair.h>
 247  247  #include <sys/rctl.h>
 248  248  #include <sys/fss.h>
 249  249  #include <sys/brand.h>
 250  250  #include <sys/zone.h>
 251  251  #include <net/if.h>
 252  252  #include <sys/cpucaps.h>
 253  253  #include <vm/seg.h>
 254  254  #include <sys/mac.h>
 255  255  
 256  256  /*
 257  257   * This constant specifies the number of seconds that threads waiting for
 258  258   * subsystems to release a zone's general-purpose references will wait before
 259  259   * they log the zone's reference counts.  The constant's value shouldn't
 260  260   * be so small that reference counts are unnecessarily reported for zones
 261  261   * whose references are slowly released.  On the other hand, it shouldn't be so
 262  262   * large that users reboot their systems out of frustration over hung zones
 263  263   * before the system logs the zones' reference counts.
 264  264   */
 265  265  #define ZONE_DESTROY_TIMEOUT_SECS       60
 266  266  
 267  267  /* One element of the list of data link IDs accessible from the zone */
 268  268  typedef struct zone_dl {
 269  269          datalink_id_t   zdl_id;         /* ID of the data link */
 270  270          nvlist_t        *zdl_net;       /* TODO confirm: link net data */
 271  271          list_node_t     zdl_linkage;    /* list node for this element */
 272  272  } zone_dl_t;
 273  273  
 274  274  /*
 275  275   * cv used to signal that all references to the zone have been released.  This
 276  276   * needs to be global since there may be multiple waiters, and the first to
 277  277   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 278  278   */
 279  279  static kcondvar_t zone_destroy_cv;
 280  280  /*
 281  281   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 282  282   * but then we'd need another lock for zone_destroy_cv, and why bother?
 283  283   */
 284  284  static kmutex_t zone_status_lock;
 285  285  
 286  286  /*
 287  287   * ZSD-related global variables.
 288  288   */
 289  289  static kmutex_t zsd_key_lock;   /* protects the following two */
 290  290  /*
 291  291   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 292  292   */
 293  293  static zone_key_t zsd_keyval = 0;
 294  294  /*
 295  295   * Global list of registered keys.  We use this when a new zone is created.
 296  296   */
 297  297  static list_t zsd_registered_keys;
 298  298  
 299  299  int zone_hash_size = 256;
 300  300  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 301  301  static kmutex_t zonehash_lock;
 302  302  static uint_t zonecount;
 303  303  static id_space_t *zoneid_space;
 304  304  
 305  305  /*
 306  306   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 307  307   * kernel proper runs, and which manages all other zones.
 308  308   *
 309  309   * Although not declared as static, the variable "zone0" should not be used
 310  310   * except for by code that needs to reference the global zone early on in boot,
 311  311   * before it is fully initialized.  All other consumers should use
 312  312   * 'global_zone'.
 313  313   */
 314  314  zone_t zone0;
 315  315  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 316  316  
 317  317  /*
 318  318   * List of active zones, protected by zonehash_lock.
 319  319   */
 320  320  static list_t zone_active;
 321  321  
 322  322  /*
 323  323   * List of destroyed zones that still have outstanding cred references.
 324  324   * Used for debugging.  Uses a separate lock to avoid lock ordering
 325  325   * problems in zone_free.
 326  326   */
 327  327  static list_t zone_deathrow;
 328  328  static kmutex_t zone_deathrow_lock;
 329  329  
 330  330  /* number of zones is limited by virtual interface limit in IP */
 331  331  uint_t maxzones = 8192;
 332  332  
 333  333  /* Event channel used to send zone state change notifications */
 334  334  evchan_t *zone_event_chan;
 335  335  
 336  336  /*
 337  337   * This table holds the mapping from kernel zone states to
 338  338   * states visible in the state notification API.
 339  339   * The idea is that we only expose "obvious" states and
 340  340   * do not expose states which are just implementation details.
            *
            * Several kernel states therefore share one event name: "booting"
            * is reported as ZONE_EVENT_READY; "empty", "down" and "dying" are
            * reported as ZONE_EVENT_SHUTTING_DOWN; and "dead" is reported as
            * ZONE_EVENT_UNINITIALIZED.  The trailing comment on each entry
            * names the kernel state that the entry stands for.
 341  341   */
 342  342  const char  *zone_status_table[] = {
 343  343          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 344  344          ZONE_EVENT_INITIALIZED,         /* initialized */
 345  345          ZONE_EVENT_READY,               /* ready */
 346  346          ZONE_EVENT_READY,               /* booting */
 347  347          ZONE_EVENT_RUNNING,             /* running */
 348  348          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 349  349          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 350  350          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 351  351          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 352  352          ZONE_EVENT_UNINITIALIZED,       /* dead */
 353  353  };
 354  354  
 355  355  /*
 356  356   * This array contains the names of the subsystems listed in zone_ref_subsys_t
 357  357   * (see sys/zone.h).
            *
            * Each string is annotated with the ZONE_REF_* constant it names.
            * NOTE(review): the entries appear to follow the order of
            * zone_ref_subsys_t; confirm against sys/zone.h before adding or
            * reordering entries.
 358  358   */
 359  359  static char *zone_ref_subsys_names[] = {
 360  360          "NFS",          /* ZONE_REF_NFS */
 361  361          "NFSv4",        /* ZONE_REF_NFSV4 */
 362  362          "SMBFS",        /* ZONE_REF_SMBFS */
 363  363          "MNTFS",        /* ZONE_REF_MNTFS */
 364  364          "LOFI",         /* ZONE_REF_LOFI */
 365  365          "VFS",          /* ZONE_REF_VFS */
 366  366          "IPC"           /* ZONE_REF_IPC */
 367  367  };
 368  368  
 369  369  /*
 370  370   * This isn't static so lint doesn't complain.
 371  371   */
 372  372  rctl_hndl_t rc_zone_cpu_shares;
 373  373  rctl_hndl_t rc_zone_locked_mem;
 374  374  rctl_hndl_t rc_zone_max_swap;
 375  375  rctl_hndl_t rc_zone_max_lofi;
 376  376  rctl_hndl_t rc_zone_cpu_cap;
 377  377  rctl_hndl_t rc_zone_nlwps;
 378  378  rctl_hndl_t rc_zone_nprocs;
 379  379  rctl_hndl_t rc_zone_shmmax;
 380  380  rctl_hndl_t rc_zone_shmmni;
 381  381  rctl_hndl_t rc_zone_semmni;
 382  382  rctl_hndl_t rc_zone_msgmni;
 383  383  
 384  384  const char * const zone_default_initname = "/sbin/init";
 385  385  static char * const zone_prefix = "/zone/";
 386  386  static int zone_shutdown(zoneid_t zoneid);
 387  387  static int zone_add_datalink(zoneid_t, datalink_id_t);
 388  388  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 389  389  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 390  390  static int zone_set_network(zoneid_t, zone_net_data_t *);
 391  391  static int zone_get_network(zoneid_t, zone_net_data_t *);
 392  392  
 393  393  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 394  394  
 395  395  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 396  396  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 397  397  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398  398  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 399  399      zone_key_t);
 400  400  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 401  401  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 402  402      kmutex_t *);
 403  403  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 404  404      kmutex_t *);
 405  405  
 406  406  /*
 407  407   * Bump this number when you alter the zone syscall interfaces; this is
 408  408   * because we need to have support for previous API versions in libc
 409  409   * to support patching; libc calls into the kernel to determine this number.
 410  410   *
 411  411   * Version 1 of the API is the version originally shipped with Solaris 10
 412  412   * Version 2 alters the zone_create system call in order to support more
 413  413   *     arguments by moving the args into a structure; and to do better
 414  414   *     error reporting when zone_create() fails.
 415  415   * Version 3 alters the zone_create system call in order to support the
 416  416   *     import of ZFS datasets to zones.
 417  417   * Version 4 alters the zone_create system call in order to support
 418  418   *     Trusted Extensions.
 419  419   * Version 5 alters the zone_boot system call, and converts its old
 420  420   *     bootargs parameter to be set by the zone_setattr API instead.
 421  421   * Version 6 adds the flag argument to zone_create.
 422  422   */
 423  423  static const int ZONE_SYSCALL_API_VERSION = 6;
 424  424  
 425  425  /*
 426  426   * Certain filesystems (such as NFS and autofs) need to know which zone
 427  427   * the mount is being placed in.  Because of this, we need to be able to
 428  428   * ensure that a zone isn't in the process of being created/destroyed such
 429  429   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 430  430   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 431  431   * mount list. Since a zone can't reside on an NFS file system, we don't
 432  432   * have to worry about the zonepath itself.
 433  433   *
 434  434   * The following functions: block_mounts()/resume_mounts() and
 435  435   * mount_in_progress()/mount_completed() are used by zones and the VFS
 436  436   * layer (respectively) to synchronize zone state transitions and new
 437  437   * mounts within a zone. This synchronization is on a per-zone basis, so
 438  438   * activity for one zone will not interfere with activity for another zone.
 439  439   *
 440  440   * The semantics are like a reader-reader lock such that there may
 441  441   * either be multiple mounts (or zone state transitions, if that weren't
 442  442   * serialized by zonehash_lock) in progress at the same time, but not
 443  443   * both.
 444  444   *
 445  445   * We use cv's so the user can ctrl-C out of the operation if it's
 446  446   * taking too long.
 447  447   *
 448  448   * The semantics are such that there is unfair bias towards the
 449  449   * "current" operation.  This means that zone halt may starve if
 450  450   * there is a rapid succession of new mounts coming in to the zone.
 451  451   */
 452  452  /*
 453  453   * Prevent new mounts from progressing to the point of calling
 454  454   * VFS_MOUNT().  If there are already mounts in this "region", wait for
 455  455   * them to complete.
            *
            * The wait is done with cv_wait_sig() on zone_mount_cv, so it can be
            * abandoned: when cv_wait_sig() returns 0 the function gives up and
            * leaves zone_mounts_in_progress untouched.
            *
            * Returns 1 once this caller has been counted as blocking mounts
            * (zone_mounts_in_progress decremented; undone by resume_mounts()),
            * or 0 if the wait was abandoned and mounts were not blocked.
 456  456   */
 457  457  static int
 458  458  block_mounts(zone_t *zp)
 459  459  {
 460  460          int retval = 0;
 461  461  
 462  462          /*
 463  463           * Since it may block for a long time, block_mounts() shouldn't be
 464  464           * called with zonehash_lock held.
 465  465           */
 466  466          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 467  467          mutex_enter(&zp->zone_mount_lock);
 468  468          while (zp->zone_mounts_in_progress > 0) {
 469  469                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 470  470                          goto signaled;
 471  471          }
 472  472          /*
 473  473           * A negative value of mounts_in_progress indicates that mounts
 474  474           * have been blocked by (-mounts_in_progress) different callers
 475  475           * (remotely possible if two threads enter zone_shutdown at the same
 476  476           * time).
 477  477           */
 478  478          zp->zone_mounts_in_progress--;
 479  479          retval = 1;
 480  480  signaled:
 481  481          mutex_exit(&zp->zone_mount_lock);
 482  482          return (retval);
 483  483  }
 484  484  
 485  485  /*
 486  486   * The VFS layer may progress with new mounts as far as we're concerned.
 487  487   * Allow them to progress if we were the last obstacle.
            *
            * Increments zone_mounts_in_progress under zone_mount_lock, undoing
            * the decrement made by one successful block_mounts() call.  Waiters
            * on zone_mount_cv are woken only when the count reaches zero, i.e.
            * when no other caller is still blocking mounts.
 488  488   */
 489  489  static void
 490  490  resume_mounts(zone_t *zp)
 491  491  {
 492  492          mutex_enter(&zp->zone_mount_lock);
 493  493          if (++zp->zone_mounts_in_progress == 0)
 494  494                  cv_broadcast(&zp->zone_mount_cv);
 495  495          mutex_exit(&zp->zone_mount_lock);
 496  496  }
 497  497  
 498  498  /*
 499  499   * The VFS layer is busy with a mount; this zone should wait until all
 500  500   * of its mounts are completed to progress.
            *
            * While zone_mounts_in_progress is negative (mounts are blocked by
            * block_mounts()), this waits on zone_mount_cv using cv_wait(), which
            * unlike the cv_wait_sig() used in block_mounts() offers no early
            * exit here.  It then counts this mount by incrementing
            * zone_mounts_in_progress.  The matching decrement is done by
            * mount_completed().
 501  501   */
 502  502  void
 503  503  mount_in_progress(zone_t *zp)
 504  504  {
 505  505          mutex_enter(&zp->zone_mount_lock);
 506  506          while (zp->zone_mounts_in_progress < 0)
 507  507                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 508  508          zp->zone_mounts_in_progress++;
 509  509          mutex_exit(&zp->zone_mount_lock);
 510  510  }
 511  511  
 512  512  /*
 513  513   * VFS is done with one mount; wake up any waiting block_mounts()
 514  514   * callers if this is the last mount.
            *
            * Decrements zone_mounts_in_progress under zone_mount_lock, undoing
            * the increment made by mount_in_progress(), and broadcasts on
            * zone_mount_cv when the count drops to zero.
 515  515   */
 516  516  void
 517  517  mount_completed(zone_t *zp)
 518  518  {
 519  519          mutex_enter(&zp->zone_mount_lock);
 520  520          if (--zp->zone_mounts_in_progress == 0)
 521  521                  cv_broadcast(&zp->zone_mount_cv);
 522  522          mutex_exit(&zp->zone_mount_lock);
 523  523  }
 524  524  
 525  525  /*
 526  526   * ZSD routines.
 527  527   *
 528  528   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 529  529   * defined by the pthread_key_create() and related interfaces.
 530  530   *
 531  531   * Kernel subsystems may register one or more data items and/or
 532  532   * callbacks to be executed when a zone is created, shutdown, or
 533  533   * destroyed.
 534  534   *
 535  535   * Unlike the thread counterpart, destructor callbacks will be executed
 536  536   * even if the data pointer is NULL and/or there are no constructor
 537  537   * callbacks, so it is the responsibility of such callbacks to check for
 538  538   * NULL data values if necessary.
 539  539   *
 540  540   * The locking strategy and overall picture is as follows:
 541  541   *
 542  542   * When someone calls zone_key_create(), a template ZSD entry is added to the
 543  543   * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 544  544   * holding that lock all the existing zones are marked as
 545  545   * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 546  546   * zone_zsd list (protected by zone_lock). The global list is updated first
 547  547   * (under zsd_key_lock) to make sure that newly created zones use the
 548  548   * most recent list of keys. Then under zonehash_lock we walk the zones
 549  549   * and mark them.  Similar locking is used in zone_key_delete().
 550  550   *
 551  551   * The actual create, shutdown, and destroy callbacks are done without
 552  552   * holding any lock.  The zsd_flags are used to track which operations
 553  553   * have completed, so that by the time zone_key_create (or zone_create)
 554  554   * or zone_key_delete (or zone_destroy) returns, all the necessary
 555  555   * callbacks have completed.
 556  556   *
 557  557   * When new zones are created constructor callbacks for all registered ZSD
 558  558   * entries will be called. That also uses the above two phases of marking
 559  559   * what needs to be done, and then running the callbacks without holding
 560  560   * any locks.
 561  561   *
 562  562   * The framework does not provide any locking around zone_getspecific() and
 563  563   * zone_setspecific() apart from that needed for internal consistency, so
 564  564   * callers interested in atomic "test-and-set" semantics will need to provide
 565  565   * their own locking.
 566  566   */
 567  567  
 568  568  /*
 569  569   * Helper function to find the zsd_entry associated with the key in the
 570  570   * given list.
            *
            * Walks the list from its head and returns the first entry whose
            * zsd_key equals key, or NULL if no entry matches.  The list itself
            * is not modified.
            *
            * NOTE(review): no lock is taken here; presumably the caller holds
            * whichever lock protects the list being searched -- confirm.
 571  571   */
 572  572  static struct zsd_entry *
 573  573  zsd_find(list_t *l, zone_key_t key)
 574  574  {
 575  575          struct zsd_entry *zsd;
 576  576  
 577  577          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 578  578                  if (zsd->zsd_key == key) {
 579  579                          return (zsd);
 580  580                  }
 581  581          }
 582  582          return (NULL);
 583  583  }
 584  584  
 585  585  /*
 586  586   * Helper function to find the zsd_entry associated with the key in the
 587  587   * given list. Move it to the front of the list.
            *
            * Returns the first entry whose zsd_key equals key, or NULL if no
            * entry matches (in which case the list is left unchanged).  Unlike
            * zsd_find(), a successful lookup may reorder the list, so the
            * caller needs write access to it.
            *
            * NOTE(review): no lock is taken here; presumably the caller holds
            * whichever lock protects the list being searched -- confirm.
 588  588   */
 589  589  static struct zsd_entry *
 590  590  zsd_find_mru(list_t *l, zone_key_t key)
 591  591  {
 592  592          struct zsd_entry *zsd;
 593  593  
 594  594          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 595  595                  if (zsd->zsd_key == key) {
 596  596                          /*
 597  597                           * Move to head of list to keep list in MRU order.
 598  598                           */
 599  599                          if (zsd != list_head(l)) {
 600  600                                  list_remove(l, zsd);
 601  601                                  list_insert_head(l, zsd);
 602  602                          }
 603  603                          return (zsd);
 604  604                  }
 605  605          }
 606  606          return (NULL);
 607  607  }
 608  608  
 609  609  void
 610  610  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 611  611      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 612  612  {
 613  613          struct zsd_entry *zsdp;
 614  614          struct zsd_entry *t;
 615  615          struct zone *zone;
 616  616          zone_key_t  key;
 617  617  
 618  618          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 619  619          zsdp->zsd_data = NULL;
 620  620          zsdp->zsd_create = create;
 621  621          zsdp->zsd_shutdown = shutdown;
 622  622          zsdp->zsd_destroy = destroy;
 623  623  
 624  624          /*
 625  625           * Insert in global list of callbacks. Makes future zone creations
 626  626           * see it.
 627  627           */
 628  628          mutex_enter(&zsd_key_lock);
 629  629          key = zsdp->zsd_key = ++zsd_keyval;
 630  630          ASSERT(zsd_keyval != 0);
 631  631          list_insert_tail(&zsd_registered_keys, zsdp);
 632  632          mutex_exit(&zsd_key_lock);
 633  633  
 634  634          /*
 635  635           * Insert for all existing zones and mark them as needing
 636  636           * a create callback.
 637  637           */
 638  638          mutex_enter(&zonehash_lock);    /* stop the world */
 639  639          for (zone = list_head(&zone_active); zone != NULL;
 640  640              zone = list_next(&zone_active, zone)) {
 641  641                  zone_status_t status;
 642  642  
 643  643                  mutex_enter(&zone->zone_lock);
 644  644  
 645  645                  /* Skip zones that are on the way down or not yet up */
 646  646                  status = zone_status_get(zone);
 647  647                  if (status >= ZONE_IS_DOWN ||
 648  648                      status == ZONE_IS_UNINITIALIZED) {
 649  649                          mutex_exit(&zone->zone_lock);
 650  650                          continue;
 651  651                  }
 652  652  
 653  653                  t = zsd_find_mru(&zone->zone_zsd, key);
 654  654                  if (t != NULL) {
 655  655                          /*
 656  656                           * A zsd_configure already inserted it after
 657  657                           * we dropped zsd_key_lock above.
 658  658                           */
 659  659                          mutex_exit(&zone->zone_lock);
 660  660                          continue;
 661  661                  }
 662  662                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 663  663                  t->zsd_key = key;
 664  664                  t->zsd_create = create;
 665  665                  t->zsd_shutdown = shutdown;
 666  666                  t->zsd_destroy = destroy;
 667  667                  if (create != NULL) {
 668  668                          t->zsd_flags = ZSD_CREATE_NEEDED;
 669  669                          DTRACE_PROBE2(zsd__create__needed,
 670  670                              zone_t *, zone, zone_key_t, key);
 671  671                  }
 672  672                  list_insert_tail(&zone->zone_zsd, t);
 673  673                  mutex_exit(&zone->zone_lock);
 674  674          }
 675  675          mutex_exit(&zonehash_lock);
 676  676  
 677  677          if (create != NULL) {
 678  678                  /* Now call the create callback for this key */
 679  679                  zsd_apply_all_zones(zsd_apply_create, key);
 680  680          }
 681  681          /*
 682  682           * It is safe for consumers to use the key now, make it
 683  683           * globally visible. Specifically zone_getspecific() will
 684  684           * always successfully return the zone specific data associated
 685  685           * with the key.
 686  686           */
 687  687          *keyp = key;
 688  688  
 689  689  }
 690  690  
 691  691  /*
 692  692   * Function called when a module is being unloaded, or otherwise wishes
 693  693   * to unregister its ZSD key and callbacks.
 694  694   *
 695  695   * Remove from the global list and determine the functions that need to
 696  696   * be called under a global lock. Then call the functions without
 697  697   * holding any locks. Finally free up the zone_zsd entries. (The apply
 698  698   * functions need to access the zone_zsd entries to find zsd_data etc.)
 699  699   */
 700  700  int
 701  701  zone_key_delete(zone_key_t key)
 702  702  {
 703  703          struct zsd_entry *zsdp = NULL;
 704  704          zone_t *zone;
 705  705  
 706  706          mutex_enter(&zsd_key_lock);
 707  707          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 708  708          if (zsdp == NULL) {
 709  709                  mutex_exit(&zsd_key_lock);
 710  710                  return (-1);
 711  711          }
 712  712          list_remove(&zsd_registered_keys, zsdp);
 713  713          mutex_exit(&zsd_key_lock);
 714  714  
 715  715          mutex_enter(&zonehash_lock);
 716  716          for (zone = list_head(&zone_active); zone != NULL;
 717  717              zone = list_next(&zone_active, zone)) {
 718  718                  struct zsd_entry *del;
 719  719  
 720  720                  mutex_enter(&zone->zone_lock);
 721  721                  del = zsd_find_mru(&zone->zone_zsd, key);
 722  722                  if (del == NULL) {
 723  723                          /*
 724  724                           * Somebody else got here first e.g the zone going
 725  725                           * away.
 726  726                           */
 727  727                          mutex_exit(&zone->zone_lock);
 728  728                          continue;
 729  729                  }
 730  730                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 731  731                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 732  732                  if (del->zsd_shutdown != NULL &&
 733  733                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 734  734                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 735  735                          DTRACE_PROBE2(zsd__shutdown__needed,
 736  736                              zone_t *, zone, zone_key_t, key);
 737  737                  }
 738  738                  if (del->zsd_destroy != NULL &&
 739  739                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 740  740                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 741  741                          DTRACE_PROBE2(zsd__destroy__needed,
 742  742                              zone_t *, zone, zone_key_t, key);
 743  743                  }
 744  744                  mutex_exit(&zone->zone_lock);
 745  745          }
 746  746          mutex_exit(&zonehash_lock);
 747  747          kmem_free(zsdp, sizeof (*zsdp));
 748  748  
 749  749          /* Now call the shutdown and destroy callback for this key */
 750  750          zsd_apply_all_zones(zsd_apply_shutdown, key);
 751  751          zsd_apply_all_zones(zsd_apply_destroy, key);
 752  752  
 753  753          /* Now we can free up the zsdp structures in each zone */
 754  754          mutex_enter(&zonehash_lock);
 755  755          for (zone = list_head(&zone_active); zone != NULL;
 756  756              zone = list_next(&zone_active, zone)) {
 757  757                  struct zsd_entry *del;
 758  758  
 759  759                  mutex_enter(&zone->zone_lock);
 760  760                  del = zsd_find(&zone->zone_zsd, key);
 761  761                  if (del != NULL) {
 762  762                          list_remove(&zone->zone_zsd, del);
 763  763                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 764  764                          kmem_free(del, sizeof (*del));
 765  765                  }
 766  766                  mutex_exit(&zone->zone_lock);
 767  767          }
 768  768          mutex_exit(&zonehash_lock);
 769  769  
 770  770          return (0);
 771  771  }
 772  772  
 773  773  /*
 774  774   * ZSD counterpart of pthread_setspecific().
 775  775   *
 776  776   * Since all zsd callbacks, including those with no create function,
 777  777   * have an entry in zone_zsd, if the key is registered it is part of
 778  778   * the zone_zsd list.
 779  779   * Return an error if the key wasn't registered.
 780  780   */
 781  781  int
 782  782  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 783  783  {
 784  784          struct zsd_entry *t;
 785  785  
 786  786          mutex_enter(&zone->zone_lock);
 787  787          t = zsd_find_mru(&zone->zone_zsd, key);
 788  788          if (t != NULL) {
 789  789                  /*
 790  790                   * Replace old value with new
 791  791                   */
 792  792                  t->zsd_data = (void *)data;
 793  793                  mutex_exit(&zone->zone_lock);
 794  794                  return (0);
 795  795          }
 796  796          mutex_exit(&zone->zone_lock);
 797  797          return (-1);
 798  798  }
 799  799  
 800  800  /*
 801  801   * ZSD counterpart of pthread_getspecific().
 802  802   */
 803  803  void *
 804  804  zone_getspecific(zone_key_t key, zone_t *zone)
 805  805  {
 806  806          struct zsd_entry *t;
 807  807          void *data;
 808  808  
 809  809          mutex_enter(&zone->zone_lock);
 810  810          t = zsd_find_mru(&zone->zone_zsd, key);
 811  811          data = (t == NULL ? NULL : t->zsd_data);
 812  812          mutex_exit(&zone->zone_lock);
 813  813          return (data);
 814  814  }
 815  815  
 816  816  /*
 817  817   * Function used to initialize a zone's list of ZSD callbacks and data
 818  818   * when the zone is being created.  The callbacks are initialized from
 819  819   * the template list (zsd_registered_keys). The constructor callback is
 820  820   * executed later (once the zone exists and with locks dropped).
 821  821   */
 822  822  static void
 823  823  zone_zsd_configure(zone_t *zone)
 824  824  {
 825  825          struct zsd_entry *zsdp;
 826  826          struct zsd_entry *t;
 827  827  
 828  828          ASSERT(MUTEX_HELD(&zonehash_lock));
 829  829          ASSERT(list_head(&zone->zone_zsd) == NULL);
 830  830          mutex_enter(&zone->zone_lock);
 831  831          mutex_enter(&zsd_key_lock);
 832  832          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 833  833              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 834  834                  /*
 835  835                   * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
 836  836                   * should not have added anything to it.
 837  837                   */
 838  838                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 839  839  
 840  840                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 841  841                  t->zsd_key = zsdp->zsd_key;
 842  842                  t->zsd_create = zsdp->zsd_create;
 843  843                  t->zsd_shutdown = zsdp->zsd_shutdown;
 844  844                  t->zsd_destroy = zsdp->zsd_destroy;
 845  845                  if (zsdp->zsd_create != NULL) {
 846  846                          t->zsd_flags = ZSD_CREATE_NEEDED;
 847  847                          DTRACE_PROBE2(zsd__create__needed,
 848  848                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 849  849                  }
 850  850                  list_insert_tail(&zone->zone_zsd, t);
 851  851          }
 852  852          mutex_exit(&zsd_key_lock);
 853  853          mutex_exit(&zone->zone_lock);
 854  854  }
 855  855  
/*
 * Kinds of ZSD callback. zone_zsd_callbacks() takes one of these and
 * asserts that it is either ZSD_SHUTDOWN or ZSD_DESTROY.
 */
enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 857  857  
 858  858  /*
 859  859   * Helper function to execute shutdown or destructor callbacks.
 860  860   */
 861  861  static void
 862  862  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 863  863  {
 864  864          struct zsd_entry *t;
 865  865  
 866  866          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 867  867          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 868  868          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 869  869  
 870  870          /*
 871  871           * Run the callback solely based on what is registered for the zone
 872  872           * in zone_zsd. The global list can change independently of this
 873  873           * as keys are registered and unregistered and we don't register new
 874  874           * callbacks for a zone that is in the process of going away.
 875  875           */
 876  876          mutex_enter(&zone->zone_lock);
 877  877          for (t = list_head(&zone->zone_zsd); t != NULL;
 878  878              t = list_next(&zone->zone_zsd, t)) {
 879  879                  zone_key_t key = t->zsd_key;
 880  880  
 881  881                  /* Skip if no callbacks registered */
 882  882  
 883  883                  if (ct == ZSD_SHUTDOWN) {
 884  884                          if (t->zsd_shutdown != NULL &&
 885  885                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 886  886                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 887  887                                  DTRACE_PROBE2(zsd__shutdown__needed,
 888  888                                      zone_t *, zone, zone_key_t, key);
 889  889                          }
 890  890                  } else {
 891  891                          if (t->zsd_destroy != NULL &&
 892  892                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 893  893                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 894  894                                  DTRACE_PROBE2(zsd__destroy__needed,
 895  895                                      zone_t *, zone, zone_key_t, key);
 896  896                          }
 897  897                  }
 898  898          }
 899  899          mutex_exit(&zone->zone_lock);
 900  900  
 901  901          /* Now call the shutdown and destroy callback for this key */
 902  902          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 903  903          zsd_apply_all_keys(zsd_apply_destroy, zone);
 904  904  
 905  905  }
 906  906  
 907  907  /*
 908  908   * Called when the zone is going away; free ZSD-related memory, and
 909  909   * destroy the zone_zsd list.
 910  910   */
 911  911  static void
 912  912  zone_free_zsd(zone_t *zone)
 913  913  {
 914  914          struct zsd_entry *t, *next;
 915  915  
 916  916          /*
 917  917           * Free all the zsd_entry's we had on this zone.
 918  918           */
 919  919          mutex_enter(&zone->zone_lock);
 920  920          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 921  921                  next = list_next(&zone->zone_zsd, t);
 922  922                  list_remove(&zone->zone_zsd, t);
 923  923                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 924  924                  kmem_free(t, sizeof (*t));
 925  925          }
 926  926          list_destroy(&zone->zone_zsd);
 927  927          mutex_exit(&zone->zone_lock);
 928  928  
 929  929  }
 930  930  
 931  931  /*
 932  932   * Apply a function to all zones for particular key value.
 933  933   *
 934  934   * The applyfn has to drop zonehash_lock if it does some work, and
 935  935   * then reacquire it before it returns.
 936  936   * When the lock is dropped we don't follow list_next even
 937  937   * if it is possible to do so without any hazards. This is
 938  938   * because we want the design to allow for the list of zones
 939  939   * to change in any arbitrary way during the time the
 940  940   * lock was dropped.
 941  941   *
 942  942   * It is safe to restart the loop at list_head since the applyfn
 943  943   * changes the zsd_flags as it does work, so a subsequent
 944  944   * pass through will have no effect in applyfn, hence the loop will terminate
 945  945   * in at worst O(N^2).
 946  946   */
 947  947  static void
 948  948  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
 949  949  {
 950  950          zone_t *zone;
 951  951  
 952  952          mutex_enter(&zonehash_lock);
 953  953          zone = list_head(&zone_active);
 954  954          while (zone != NULL) {
 955  955                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
 956  956                          /* Lock dropped - restart at head */
 957  957                          zone = list_head(&zone_active);
 958  958                  } else {
 959  959                          zone = list_next(&zone_active, zone);
 960  960                  }
 961  961          }
 962  962          mutex_exit(&zonehash_lock);
 963  963  }
 964  964  
 965  965  /*
 966  966   * Apply a function to all keys for a particular zone.
 967  967   *
 968  968   * The applyfn has to drop zonehash_lock if it does some work, and
 969  969   * then reacquire it before it returns.
 970  970   * When the lock is dropped we don't follow list_next even
 971  971   * if it is possible to do so without any hazards. This is
 972  972   * because we want the design to allow for the list of zsd callbacks
 973  973   * to change in any arbitrary way during the time the
 974  974   * lock was dropped.
 975  975   *
 976  976   * It is safe to restart the loop at list_head since the applyfn
 977  977   * changes the zsd_flags as it does work, so a subsequent
 978  978   * pass through will have no effect in applyfn, hence the loop will terminate
 979  979   * in at worst O(N^2).
 980  980   */
 981  981  static void
 982  982  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
 983  983  {
 984  984          struct zsd_entry *t;
 985  985  
 986  986          mutex_enter(&zone->zone_lock);
 987  987          t = list_head(&zone->zone_zsd);
 988  988          while (t != NULL) {
 989  989                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
 990  990                          /* Lock dropped - restart at head */
 991  991                          t = list_head(&zone->zone_zsd);
 992  992                  } else {
 993  993                          t = list_next(&zone->zone_zsd, t);
 994  994                  }
 995  995          }
 996  996          mutex_exit(&zone->zone_lock);
 997  997  }
 998  998  
 999  999  /*
1000 1000   * Call the create function for the zone and key if CREATE_NEEDED
1001 1001   * is set.
1002 1002   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1003 1003   * we wait for that thread to complete so that we can ensure that
1004 1004   * all the callbacks are done when we've looped over all zones/keys.
1005 1005   *
1006 1006   * When we call the create function, we drop the global lock held by the
1007 1007   * caller, and return true to tell the caller it needs to re-evaluate the
1008 1008   * state.
1009 1009   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1010 1010   * remains held on exit.
1011 1011   */
1012 1012  static boolean_t
1013 1013  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1014 1014      zone_t *zone, zone_key_t key)
1015 1015  {
1016 1016          void *result;
1017 1017          struct zsd_entry *t;
1018 1018          boolean_t dropped;
1019 1019  
1020 1020          if (lockp != NULL) {
1021 1021                  ASSERT(MUTEX_HELD(lockp));
1022 1022          }
1023 1023          if (zone_lock_held) {
1024 1024                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1025 1025          } else {
1026 1026                  mutex_enter(&zone->zone_lock);
1027 1027          }
1028 1028  
1029 1029          t = zsd_find(&zone->zone_zsd, key);
1030 1030          if (t == NULL) {
1031 1031                  /*
1032 1032                   * Somebody else got here first e.g the zone going
1033 1033                   * away.
1034 1034                   */
1035 1035                  if (!zone_lock_held)
1036 1036                          mutex_exit(&zone->zone_lock);
1037 1037                  return (B_FALSE);
1038 1038          }
1039 1039          dropped = B_FALSE;
1040 1040          if (zsd_wait_for_inprogress(zone, t, lockp))
1041 1041                  dropped = B_TRUE;
1042 1042  
1043 1043          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1044 1044                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1045 1045                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1046 1046                  DTRACE_PROBE2(zsd__create__inprogress,
1047 1047                      zone_t *, zone, zone_key_t, key);
1048 1048                  mutex_exit(&zone->zone_lock);
1049 1049                  if (lockp != NULL)
1050 1050                          mutex_exit(lockp);
1051 1051  
1052 1052                  dropped = B_TRUE;
1053 1053                  ASSERT(t->zsd_create != NULL);
1054 1054                  DTRACE_PROBE2(zsd__create__start,
1055 1055                      zone_t *, zone, zone_key_t, key);
1056 1056  
1057 1057                  result = (*t->zsd_create)(zone->zone_id);
1058 1058  
1059 1059                  DTRACE_PROBE2(zsd__create__end,
1060 1060                      zone_t *, zone, voidn *, result);
1061 1061  
1062 1062                  ASSERT(result != NULL);
1063 1063                  if (lockp != NULL)
1064 1064                          mutex_enter(lockp);
1065 1065                  mutex_enter(&zone->zone_lock);
1066 1066                  t->zsd_data = result;
1067 1067                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1068 1068                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
1069 1069                  cv_broadcast(&t->zsd_cv);
1070 1070                  DTRACE_PROBE2(zsd__create__completed,
1071 1071                      zone_t *, zone, zone_key_t, key);
1072 1072          }
1073 1073          if (!zone_lock_held)
1074 1074                  mutex_exit(&zone->zone_lock);
1075 1075          return (dropped);
1076 1076  }
1077 1077  
1078 1078  /*
1079 1079   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1080 1080   * is set.
1081 1081   * If some other thread gets here first and sets *_INPROGRESS, then
1082 1082   * we wait for that thread to complete so that we can ensure that
1083 1083   * all the callbacks are done when we've looped over all zones/keys.
1084 1084   *
1085 1085   * When we call the shutdown function, we drop the global lock held by the
1086 1086   * caller, and return true to tell the caller it needs to re-evaluate the
1087 1087   * state.
1088 1088   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1089 1089   * remains held on exit.
1090 1090   */
1091 1091  static boolean_t
1092 1092  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1093 1093      zone_t *zone, zone_key_t key)
1094 1094  {
1095 1095          struct zsd_entry *t;
1096 1096          void *data;
1097 1097          boolean_t dropped;
1098 1098  
1099 1099          if (lockp != NULL) {
1100 1100                  ASSERT(MUTEX_HELD(lockp));
1101 1101          }
1102 1102          if (zone_lock_held) {
1103 1103                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1104 1104          } else {
1105 1105                  mutex_enter(&zone->zone_lock);
1106 1106          }
1107 1107  
1108 1108          t = zsd_find(&zone->zone_zsd, key);
1109 1109          if (t == NULL) {
1110 1110                  /*
1111 1111                   * Somebody else got here first e.g the zone going
1112 1112                   * away.
1113 1113                   */
1114 1114                  if (!zone_lock_held)
1115 1115                          mutex_exit(&zone->zone_lock);
1116 1116                  return (B_FALSE);
1117 1117          }
1118 1118          dropped = B_FALSE;
1119 1119          if (zsd_wait_for_creator(zone, t, lockp))
1120 1120                  dropped = B_TRUE;
1121 1121  
1122 1122          if (zsd_wait_for_inprogress(zone, t, lockp))
1123 1123                  dropped = B_TRUE;
1124 1124  
1125 1125          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1126 1126                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1127 1127                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1128 1128                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1129 1129                      zone_t *, zone, zone_key_t, key);
1130 1130                  mutex_exit(&zone->zone_lock);
1131 1131                  if (lockp != NULL)
1132 1132                          mutex_exit(lockp);
1133 1133                  dropped = B_TRUE;
1134 1134  
1135 1135                  ASSERT(t->zsd_shutdown != NULL);
1136 1136                  data = t->zsd_data;
1137 1137  
1138 1138                  DTRACE_PROBE2(zsd__shutdown__start,
1139 1139                      zone_t *, zone, zone_key_t, key);
1140 1140  
1141 1141                  (t->zsd_shutdown)(zone->zone_id, data);
1142 1142                  DTRACE_PROBE2(zsd__shutdown__end,
1143 1143                      zone_t *, zone, zone_key_t, key);
1144 1144  
1145 1145                  if (lockp != NULL)
1146 1146                          mutex_enter(lockp);
1147 1147                  mutex_enter(&zone->zone_lock);
1148 1148                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1149 1149                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1150 1150                  cv_broadcast(&t->zsd_cv);
1151 1151                  DTRACE_PROBE2(zsd__shutdown__completed,
1152 1152                      zone_t *, zone, zone_key_t, key);
1153 1153          }
1154 1154          if (!zone_lock_held)
1155 1155                  mutex_exit(&zone->zone_lock);
1156 1156          return (dropped);
1157 1157  }
1158 1158  
1159 1159  /*
1160 1160   * Call the destroy function for the zone and key if DESTROY_NEEDED
1161 1161   * is set.
1162 1162   * If some other thread gets here first and sets *_INPROGRESS, then
1163 1163   * we wait for that thread to complete so that we can ensure that
1164 1164   * all the callbacks are done when we've looped over all zones/keys.
1165 1165   *
1166 1166   * When we call the destroy function, we drop the global lock held by the
1167 1167   * caller, and return true to tell the caller it needs to re-evaluate the
1168 1168   * state.
1169 1169   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1170 1170   * remains held on exit.
1171 1171   */
1172 1172  static boolean_t
1173 1173  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1174 1174      zone_t *zone, zone_key_t key)
1175 1175  {
1176 1176          struct zsd_entry *t;
1177 1177          void *data;
1178 1178          boolean_t dropped;
1179 1179  
1180 1180          if (lockp != NULL) {
1181 1181                  ASSERT(MUTEX_HELD(lockp));
1182 1182          }
1183 1183          if (zone_lock_held) {
1184 1184                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1185 1185          } else {
1186 1186                  mutex_enter(&zone->zone_lock);
1187 1187          }
1188 1188  
1189 1189          t = zsd_find(&zone->zone_zsd, key);
1190 1190          if (t == NULL) {
1191 1191                  /*
1192 1192                   * Somebody else got here first e.g the zone going
1193 1193                   * away.
1194 1194                   */
1195 1195                  if (!zone_lock_held)
1196 1196                          mutex_exit(&zone->zone_lock);
1197 1197                  return (B_FALSE);
1198 1198          }
1199 1199          dropped = B_FALSE;
1200 1200          if (zsd_wait_for_creator(zone, t, lockp))
1201 1201                  dropped = B_TRUE;
1202 1202  
1203 1203          if (zsd_wait_for_inprogress(zone, t, lockp))
1204 1204                  dropped = B_TRUE;
1205 1205  
1206 1206          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1207 1207                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1208 1208                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1209 1209                  DTRACE_PROBE2(zsd__destroy__inprogress,
1210 1210                      zone_t *, zone, zone_key_t, key);
1211 1211                  mutex_exit(&zone->zone_lock);
1212 1212                  if (lockp != NULL)
1213 1213                          mutex_exit(lockp);
1214 1214                  dropped = B_TRUE;
1215 1215  
1216 1216                  ASSERT(t->zsd_destroy != NULL);
1217 1217                  data = t->zsd_data;
1218 1218                  DTRACE_PROBE2(zsd__destroy__start,
1219 1219                      zone_t *, zone, zone_key_t, key);
1220 1220  
1221 1221                  (t->zsd_destroy)(zone->zone_id, data);
1222 1222                  DTRACE_PROBE2(zsd__destroy__end,
1223 1223                      zone_t *, zone, zone_key_t, key);
1224 1224  
1225 1225                  if (lockp != NULL)
1226 1226                          mutex_enter(lockp);
1227 1227                  mutex_enter(&zone->zone_lock);
1228 1228                  t->zsd_data = NULL;
1229 1229                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1230 1230                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1231 1231                  cv_broadcast(&t->zsd_cv);
1232 1232                  DTRACE_PROBE2(zsd__destroy__completed,
1233 1233                      zone_t *, zone, zone_key_t, key);
1234 1234          }
1235 1235          if (!zone_lock_held)
1236 1236                  mutex_exit(&zone->zone_lock);
1237 1237          return (dropped);
1238 1238  }
1239 1239  
1240 1240  /*
1241 1241   * Wait for any CREATE_NEEDED flag to be cleared.
1242 1242   * Returns true if lockp was temporarily dropped while waiting.
1243 1243   */
1244 1244  static boolean_t
1245 1245  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1246 1246  {
1247 1247          boolean_t dropped = B_FALSE;
1248 1248  
1249 1249          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1250 1250                  DTRACE_PROBE2(zsd__wait__for__creator,
1251 1251                      zone_t *, zone, struct zsd_entry *, t);
1252 1252                  if (lockp != NULL) {
1253 1253                          dropped = B_TRUE;
1254 1254                          mutex_exit(lockp);
1255 1255                  }
1256 1256                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1257 1257                  if (lockp != NULL) {
1258 1258                          /* First drop zone_lock to preserve order */
1259 1259                          mutex_exit(&zone->zone_lock);
1260 1260                          mutex_enter(lockp);
1261 1261                          mutex_enter(&zone->zone_lock);
1262 1262                  }
1263 1263          }
1264 1264          return (dropped);
1265 1265  }
1266 1266  
1267 1267  /*
1268 1268   * Wait for any INPROGRESS flag to be cleared.
1269 1269   * Returns true if lockp was temporarily dropped while waiting.
1270 1270   */
1271 1271  static boolean_t
1272 1272  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1273 1273  {
1274 1274          boolean_t dropped = B_FALSE;
1275 1275  
1276 1276          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1277 1277                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1278 1278                      zone_t *, zone, struct zsd_entry *, t);
1279 1279                  if (lockp != NULL) {
1280 1280                          dropped = B_TRUE;
1281 1281                          mutex_exit(lockp);
1282 1282                  }
1283 1283                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1284 1284                  if (lockp != NULL) {
1285 1285                          /* First drop zone_lock to preserve order */
1286 1286                          mutex_exit(&zone->zone_lock);
1287 1287                          mutex_enter(lockp);
1288 1288                          mutex_enter(&zone->zone_lock);
1289 1289                  }
1290 1290          }
1291 1291          return (dropped);
1292 1292  }
1293 1293  
1294 1294  /*
1295 1295   * Frees memory associated with the zone dataset list.
1296 1296   */
1297 1297  static void
1298 1298  zone_free_datasets(zone_t *zone)
1299 1299  {
1300 1300          zone_dataset_t *t, *next;
1301 1301  
1302 1302          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1303 1303                  next = list_next(&zone->zone_datasets, t);
1304 1304                  list_remove(&zone->zone_datasets, t);
1305 1305                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1306 1306                  kmem_free(t, sizeof (*t));
1307 1307          }
1308 1308          list_destroy(&zone->zone_datasets);
1309 1309  }
1310 1310  
1311 1311  /*
1312 1312   * zone.cpu-shares resource control support.
1313 1313   */
1314 1314  /*ARGSUSED*/
1315 1315  static rctl_qty_t
1316 1316  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1317 1317  {
1318 1318          ASSERT(MUTEX_HELD(&p->p_lock));
1319 1319          return (p->p_zone->zone_shares);
1320 1320  }
1321 1321  
1322 1322  /*ARGSUSED*/
1323 1323  static int
1324 1324  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1325 1325      rctl_qty_t nv)
1326 1326  {
1327 1327          ASSERT(MUTEX_HELD(&p->p_lock));
1328 1328          ASSERT(e->rcep_t == RCENTITY_ZONE);
1329 1329          if (e->rcep_p.zone == NULL)
1330 1330                  return (0);
1331 1331  
1332 1332          e->rcep_p.zone->zone_shares = nv;
1333 1333          return (0);
1334 1334  }
1335 1335  
/*
 * Operations vector for the zone.cpu-shares resource control, wiring in
 * zone_cpu_shares_usage() and zone_cpu_shares_set().
 * NOTE(review): the initializer is positional; the slots are presumably
 * action, usage, set and test, in that order -- confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
1342 1342  
1343 1343  /*
1344 1344   * zone.cpu-cap resource control support.
1345 1345   */
1346 1346  /*ARGSUSED*/
1347 1347  static rctl_qty_t
1348 1348  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1349 1349  {
1350 1350          ASSERT(MUTEX_HELD(&p->p_lock));
1351 1351          return (cpucaps_zone_get(p->p_zone));
1352 1352  }
1353 1353  
1354 1354  /*ARGSUSED*/
1355 1355  static int
1356 1356  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1357 1357      rctl_qty_t nv)
1358 1358  {
1359 1359          zone_t *zone = e->rcep_p.zone;
1360 1360  
1361 1361          ASSERT(MUTEX_HELD(&p->p_lock));
1362 1362          ASSERT(e->rcep_t == RCENTITY_ZONE);
1363 1363  
1364 1364          if (zone == NULL)
1365 1365                  return (0);
1366 1366  
1367 1367          /*
1368 1368           * set cap to the new value.
1369 1369           */
1370 1370          return (cpucaps_zone_set(zone, nv));
1371 1371  }
1372 1372  
1373 1373  static rctl_ops_t zone_cpu_cap_ops = {
1374 1374          rcop_no_action,
1375 1375          zone_cpu_cap_get,
1376 1376          zone_cpu_cap_set,
1377 1377          rcop_no_test
1378 1378  };
1379 1379  
1380 1380  /*ARGSUSED*/
1381 1381  static rctl_qty_t
1382 1382  zone_lwps_usage(rctl_t *r, proc_t *p)
1383 1383  {
1384 1384          rctl_qty_t nlwps;
1385 1385          zone_t *zone = p->p_zone;
1386 1386  
1387 1387          ASSERT(MUTEX_HELD(&p->p_lock));
1388 1388  
1389 1389          mutex_enter(&zone->zone_nlwps_lock);
1390 1390          nlwps = zone->zone_nlwps;
1391 1391          mutex_exit(&zone->zone_nlwps_lock);
1392 1392  
1393 1393          return (nlwps);
1394 1394  }
1395 1395  
1396 1396  /*ARGSUSED*/
1397 1397  static int
1398 1398  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1399 1399      rctl_qty_t incr, uint_t flags)
1400 1400  {
1401 1401          rctl_qty_t nlwps;
1402 1402  
1403 1403          ASSERT(MUTEX_HELD(&p->p_lock));
1404 1404          ASSERT(e->rcep_t == RCENTITY_ZONE);
1405 1405          if (e->rcep_p.zone == NULL)
1406 1406                  return (0);
1407 1407          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1408 1408          nlwps = e->rcep_p.zone->zone_nlwps;
1409 1409  
1410 1410          if (nlwps + incr > rcntl->rcv_value)
1411 1411                  return (1);
1412 1412  
1413 1413          return (0);
1414 1414  }
1415 1415  
1416 1416  /*ARGSUSED*/
1417 1417  static int
1418 1418  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1419 1419  {
1420 1420          ASSERT(MUTEX_HELD(&p->p_lock));
1421 1421          ASSERT(e->rcep_t == RCENTITY_ZONE);
1422 1422          if (e->rcep_p.zone == NULL)
1423 1423                  return (0);
1424 1424          e->rcep_p.zone->zone_nlwps_ctl = nv;
1425 1425          return (0);
1426 1426  }
1427 1427  
1428 1428  static rctl_ops_t zone_lwps_ops = {
1429 1429          rcop_no_action,
1430 1430          zone_lwps_usage,
1431 1431          zone_lwps_set,
1432 1432          zone_lwps_test,
1433 1433  };
1434 1434  
1435 1435  /*ARGSUSED*/
1436 1436  static rctl_qty_t
1437 1437  zone_procs_usage(rctl_t *r, proc_t *p)
1438 1438  {
1439 1439          rctl_qty_t nprocs;
1440 1440          zone_t *zone = p->p_zone;
1441 1441  
1442 1442          ASSERT(MUTEX_HELD(&p->p_lock));
1443 1443  
1444 1444          mutex_enter(&zone->zone_nlwps_lock);
1445 1445          nprocs = zone->zone_nprocs;
1446 1446          mutex_exit(&zone->zone_nlwps_lock);
1447 1447  
1448 1448          return (nprocs);
1449 1449  }
1450 1450  
1451 1451  /*ARGSUSED*/
1452 1452  static int
1453 1453  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1454 1454      rctl_qty_t incr, uint_t flags)
1455 1455  {
1456 1456          rctl_qty_t nprocs;
1457 1457  
1458 1458          ASSERT(MUTEX_HELD(&p->p_lock));
1459 1459          ASSERT(e->rcep_t == RCENTITY_ZONE);
1460 1460          if (e->rcep_p.zone == NULL)
1461 1461                  return (0);
1462 1462          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1463 1463          nprocs = e->rcep_p.zone->zone_nprocs;
1464 1464  
1465 1465          if (nprocs + incr > rcntl->rcv_value)
1466 1466                  return (1);
1467 1467  
1468 1468          return (0);
1469 1469  }
1470 1470  
1471 1471  /*ARGSUSED*/
1472 1472  static int
1473 1473  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1474 1474  {
1475 1475          ASSERT(MUTEX_HELD(&p->p_lock));
1476 1476          ASSERT(e->rcep_t == RCENTITY_ZONE);
1477 1477          if (e->rcep_p.zone == NULL)
1478 1478                  return (0);
1479 1479          e->rcep_p.zone->zone_nprocs_ctl = nv;
1480 1480          return (0);
1481 1481  }
1482 1482  
1483 1483  static rctl_ops_t zone_procs_ops = {
1484 1484          rcop_no_action,
1485 1485          zone_procs_usage,
1486 1486          zone_procs_set,
1487 1487          zone_procs_test,
1488 1488  };
1489 1489  
1490 1490  /*ARGSUSED*/
1491 1491  static rctl_qty_t
1492 1492  zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1493 1493  {
1494 1494          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1495          return (p->p_zone->zone_shmmax);
1496 1496  }
1497 1497  
1498 1498  /*ARGSUSED*/
1499 1499  static int
1500 1500  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1501 1501      rctl_qty_t incr, uint_t flags)
1502 1502  {
1503 1503          rctl_qty_t v;
1504 1504          ASSERT(MUTEX_HELD(&p->p_lock));
1505 1505          ASSERT(e->rcep_t == RCENTITY_ZONE);
1506 1506          v = e->rcep_p.zone->zone_shmmax + incr;
1507 1507          if (v > rval->rcv_value)
1508 1508                  return (1);
1509 1509          return (0);
1510 1510  }
1511 1511  
1512 1512  static rctl_ops_t zone_shmmax_ops = {
1513 1513          rcop_no_action,
1514 1514          zone_shmmax_usage,
1515 1515          rcop_no_set,
1516 1516          zone_shmmax_test
1517 1517  };
1518 1518  
1519 1519  /*ARGSUSED*/
1520 1520  static rctl_qty_t
1521 1521  zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1522 1522  {
1523 1523          ASSERT(MUTEX_HELD(&p->p_lock));
1524 1524          return (p->p_zone->zone_ipc.ipcq_shmmni);
1525 1525  }
1526 1526  
1527 1527  /*ARGSUSED*/
1528 1528  static int
1529 1529  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1530 1530      rctl_qty_t incr, uint_t flags)
1531 1531  {
1532 1532          rctl_qty_t v;
1533 1533          ASSERT(MUTEX_HELD(&p->p_lock));
1534 1534          ASSERT(e->rcep_t == RCENTITY_ZONE);
1535 1535          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1536 1536          if (v > rval->rcv_value)
1537 1537                  return (1);
1538 1538          return (0);
1539 1539  }
1540 1540  
1541 1541  static rctl_ops_t zone_shmmni_ops = {
1542 1542          rcop_no_action,
1543 1543          zone_shmmni_usage,
1544 1544          rcop_no_set,
1545 1545          zone_shmmni_test
1546 1546  };
1547 1547  
1548 1548  /*ARGSUSED*/
1549 1549  static rctl_qty_t
1550 1550  zone_semmni_usage(rctl_t *rctl, struct proc *p)
1551 1551  {
1552 1552          ASSERT(MUTEX_HELD(&p->p_lock));
1553 1553          return (p->p_zone->zone_ipc.ipcq_semmni);
1554 1554  }
1555 1555  
1556 1556  /*ARGSUSED*/
1557 1557  static int
1558 1558  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1559 1559      rctl_qty_t incr, uint_t flags)
1560 1560  {
1561 1561          rctl_qty_t v;
1562 1562          ASSERT(MUTEX_HELD(&p->p_lock));
1563 1563          ASSERT(e->rcep_t == RCENTITY_ZONE);
1564 1564          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1565 1565          if (v > rval->rcv_value)
1566 1566                  return (1);
1567 1567          return (0);
1568 1568  }
1569 1569  
1570 1570  static rctl_ops_t zone_semmni_ops = {
1571 1571          rcop_no_action,
1572 1572          zone_semmni_usage,
1573 1573          rcop_no_set,
1574 1574          zone_semmni_test
1575 1575  };
1576 1576  
1577 1577  /*ARGSUSED*/
1578 1578  static rctl_qty_t
1579 1579  zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1580 1580  {
1581 1581          ASSERT(MUTEX_HELD(&p->p_lock));
1582 1582          return (p->p_zone->zone_ipc.ipcq_msgmni);
1583 1583  }
1584 1584  
1585 1585  /*ARGSUSED*/
1586 1586  static int
1587 1587  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1588 1588      rctl_qty_t incr, uint_t flags)
1589 1589  {
1590 1590          rctl_qty_t v;
1591 1591          ASSERT(MUTEX_HELD(&p->p_lock));
1592 1592          ASSERT(e->rcep_t == RCENTITY_ZONE);
1593 1593          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1594 1594          if (v > rval->rcv_value)
1595 1595                  return (1);
1596 1596          return (0);
1597 1597  }
1598 1598  
1599 1599  static rctl_ops_t zone_msgmni_ops = {
1600 1600          rcop_no_action,
1601 1601          zone_msgmni_usage,
1602 1602          rcop_no_set,
1603 1603          zone_msgmni_test
1604 1604  };
1605 1605  
1606 1606  /*ARGSUSED*/
1607 1607  static rctl_qty_t
1608 1608  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1609 1609  {
1610 1610          rctl_qty_t q;
1611 1611          ASSERT(MUTEX_HELD(&p->p_lock));
1612 1612          mutex_enter(&p->p_zone->zone_mem_lock);
1613 1613          q = p->p_zone->zone_locked_mem;
1614 1614          mutex_exit(&p->p_zone->zone_mem_lock);
1615 1615          return (q);
1616 1616  }
1617 1617  
1618 1618  /*ARGSUSED*/
1619 1619  static int
1620 1620  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1621 1621      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1622 1622  {
1623 1623          rctl_qty_t q;
1624 1624          zone_t *z;
1625 1625  
1626 1626          z = e->rcep_p.zone;
1627 1627          ASSERT(MUTEX_HELD(&p->p_lock));
1628 1628          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1629 1629          q = z->zone_locked_mem;
1630 1630          if (q + incr > rcntl->rcv_value)
1631 1631                  return (1);
1632 1632          return (0);
1633 1633  }
1634 1634  
1635 1635  /*ARGSUSED*/
1636 1636  static int
1637 1637  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1638 1638      rctl_qty_t nv)
1639 1639  {
1640 1640          ASSERT(MUTEX_HELD(&p->p_lock));
1641 1641          ASSERT(e->rcep_t == RCENTITY_ZONE);
1642 1642          if (e->rcep_p.zone == NULL)
1643 1643                  return (0);
1644 1644          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1645 1645          return (0);
1646 1646  }
1647 1647  
1648 1648  static rctl_ops_t zone_locked_mem_ops = {
1649 1649          rcop_no_action,
1650 1650          zone_locked_mem_usage,
1651 1651          zone_locked_mem_set,
1652 1652          zone_locked_mem_test
1653 1653  };
1654 1654  
1655 1655  /*ARGSUSED*/
1656 1656  static rctl_qty_t
1657 1657  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1658 1658  {
1659 1659          rctl_qty_t q;
1660 1660          zone_t *z = p->p_zone;
1661 1661  
1662 1662          ASSERT(MUTEX_HELD(&p->p_lock));
1663 1663          mutex_enter(&z->zone_mem_lock);
1664 1664          q = z->zone_max_swap;
1665 1665          mutex_exit(&z->zone_mem_lock);
1666 1666          return (q);
1667 1667  }
1668 1668  
1669 1669  /*ARGSUSED*/
1670 1670  static int
1671 1671  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1672 1672      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1673 1673  {
1674 1674          rctl_qty_t q;
1675 1675          zone_t *z;
1676 1676  
1677 1677          z = e->rcep_p.zone;
1678 1678          ASSERT(MUTEX_HELD(&p->p_lock));
1679 1679          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1680 1680          q = z->zone_max_swap;
1681 1681          if (q + incr > rcntl->rcv_value)
1682 1682                  return (1);
1683 1683          return (0);
1684 1684  }
1685 1685  
1686 1686  /*ARGSUSED*/
1687 1687  static int
1688 1688  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1689 1689      rctl_qty_t nv)
1690 1690  {
1691 1691          ASSERT(MUTEX_HELD(&p->p_lock));
1692 1692          ASSERT(e->rcep_t == RCENTITY_ZONE);
1693 1693          if (e->rcep_p.zone == NULL)
1694 1694                  return (0);
1695 1695          e->rcep_p.zone->zone_max_swap_ctl = nv;
1696 1696          return (0);
1697 1697  }
1698 1698  
1699 1699  static rctl_ops_t zone_max_swap_ops = {
1700 1700          rcop_no_action,
1701 1701          zone_max_swap_usage,
1702 1702          zone_max_swap_set,
1703 1703          zone_max_swap_test
1704 1704  };
1705 1705  
1706 1706  /*ARGSUSED*/
1707 1707  static rctl_qty_t
1708 1708  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1709 1709  {
1710 1710          rctl_qty_t q;
1711 1711          zone_t *z = p->p_zone;
1712 1712  
1713 1713          ASSERT(MUTEX_HELD(&p->p_lock));
1714 1714          mutex_enter(&z->zone_rctl_lock);
1715 1715          q = z->zone_max_lofi;
1716 1716          mutex_exit(&z->zone_rctl_lock);
1717 1717          return (q);
1718 1718  }
1719 1719  
1720 1720  /*ARGSUSED*/
1721 1721  static int
1722 1722  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1723 1723      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1724 1724  {
1725 1725          rctl_qty_t q;
1726 1726          zone_t *z;
1727 1727  
1728 1728          z = e->rcep_p.zone;
1729 1729          ASSERT(MUTEX_HELD(&p->p_lock));
1730 1730          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1731 1731          q = z->zone_max_lofi;
1732 1732          if (q + incr > rcntl->rcv_value)
1733 1733                  return (1);
1734 1734          return (0);
1735 1735  }
1736 1736  
1737 1737  /*ARGSUSED*/
1738 1738  static int
1739 1739  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1740 1740      rctl_qty_t nv)
1741 1741  {
1742 1742          ASSERT(MUTEX_HELD(&p->p_lock));
1743 1743          ASSERT(e->rcep_t == RCENTITY_ZONE);
1744 1744          if (e->rcep_p.zone == NULL)
1745 1745                  return (0);
1746 1746          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1747 1747          return (0);
1748 1748  }
1749 1749  
1750 1750  static rctl_ops_t zone_max_lofi_ops = {
1751 1751          rcop_no_action,
1752 1752          zone_max_lofi_usage,
1753 1753          zone_max_lofi_set,
1754 1754          zone_max_lofi_test
1755 1755  };
1756 1756  
1757 1757  /*
1758 1758   * Helper function to brand the zone with a unique ID.
1759 1759   */
1760 1760  static void
1761 1761  zone_uniqid(zone_t *zone)
1762 1762  {
1763 1763          static uint64_t uniqid = 0;
1764 1764  
1765 1765          ASSERT(MUTEX_HELD(&zonehash_lock));
1766 1766          zone->zone_uniqid = uniqid++;
1767 1767  }
1768 1768  
1769 1769  /*
1770 1770   * Returns a held pointer to the "kcred" for the specified zone.
1771 1771   */
1772 1772  struct cred *
1773 1773  zone_get_kcred(zoneid_t zoneid)
1774 1774  {
1775 1775          zone_t *zone;
1776 1776          cred_t *cr;
1777 1777  
1778 1778          if ((zone = zone_find_by_id(zoneid)) == NULL)
1779 1779                  return (NULL);
1780 1780          cr = zone->zone_kcred;
1781 1781          crhold(cr);
1782 1782          zone_rele(zone);
1783 1783          return (cr);
1784 1784  }
1785 1785  
1786 1786  static int
1787 1787  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1788 1788  {
1789 1789          zone_t *zone = ksp->ks_private;
1790 1790          zone_kstat_t *zk = ksp->ks_data;
1791 1791  
1792 1792          if (rw == KSTAT_WRITE)
1793 1793                  return (EACCES);
1794 1794  
1795 1795          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1796 1796          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1797 1797          return (0);
1798 1798  }
1799 1799  
1800 1800  static int
1801 1801  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1802 1802  {
1803 1803          zone_t *zone = ksp->ks_private;
1804 1804          zone_kstat_t *zk = ksp->ks_data;
1805 1805  
1806 1806          if (rw == KSTAT_WRITE)
1807 1807                  return (EACCES);
1808 1808  
1809 1809          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1810 1810          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1811 1811          return (0);
1812 1812  }
1813 1813  
1814 1814  static int
1815 1815  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1816 1816  {
1817 1817          zone_t *zone = ksp->ks_private;
1818 1818          zone_kstat_t *zk = ksp->ks_data;
1819 1819  
1820 1820          if (rw == KSTAT_WRITE)
1821 1821                  return (EACCES);
1822 1822  
1823 1823          zk->zk_usage.value.ui64 = zone->zone_max_swap;
1824 1824          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1825 1825          return (0);
1826 1826  }
1827 1827  
1828 1828  static kstat_t *
1829 1829  zone_kstat_create_common(zone_t *zone, char *name,
1830 1830      int (*updatefunc) (kstat_t *, int))
1831 1831  {
1832 1832          kstat_t *ksp;
1833 1833          zone_kstat_t *zk;
1834 1834  
1835 1835          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1836 1836              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1837 1837              KSTAT_FLAG_VIRTUAL);
1838 1838  
1839 1839          if (ksp == NULL)
1840 1840                  return (NULL);
1841 1841  
1842 1842          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1843 1843          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1844 1844          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1845 1845          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1846 1846          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1847 1847          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1848 1848          ksp->ks_update = updatefunc;
1849 1849          ksp->ks_private = zone;
1850 1850          kstat_install(ksp);
1851 1851          return (ksp);
1852 1852  }
1853 1853  
1854 1854  
1855 1855  static int
1856 1856  zone_mcap_kstat_update(kstat_t *ksp, int rw)
1857 1857  {
1858 1858          zone_t *zone = ksp->ks_private;
1859 1859          zone_mcap_kstat_t *zmp = ksp->ks_data;
1860 1860  
1861 1861          if (rw == KSTAT_WRITE)
1862 1862                  return (EACCES);
1863 1863  
1864 1864          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1865 1865          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1866 1866          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1867 1867          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1868 1868          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1869 1869  
1870 1870          return (0);
1871 1871  }
1872 1872  
1873 1873  static kstat_t *
1874 1874  zone_mcap_kstat_create(zone_t *zone)
1875 1875  {
1876 1876          kstat_t *ksp;
1877 1877          zone_mcap_kstat_t *zmp;
1878 1878  
1879 1879          if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1880 1880              zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1881 1881              sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1882 1882              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1883 1883                  return (NULL);
1884 1884  
1885 1885          if (zone->zone_id != GLOBAL_ZONEID)
1886 1886                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1887 1887  
1888 1888          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1889 1889          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1890 1890          ksp->ks_lock = &zone->zone_mcap_lock;
1891 1891          zone->zone_mcap_stats = zmp;
1892 1892  
1893 1893          /* The kstat "name" field is not large enough for a full zonename */
1894 1894          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1895 1895          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1896 1896          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1897 1897          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1898 1898          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1899 1899          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1900 1900          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1901 1901              KSTAT_DATA_UINT64);
1902 1902  
1903 1903          ksp->ks_update = zone_mcap_kstat_update;
1904 1904          ksp->ks_private = zone;
1905 1905  
1906 1906          kstat_install(ksp);
1907 1907          return (ksp);
1908 1908  }
1909 1909  
1910 1910  static int
1911 1911  zone_misc_kstat_update(kstat_t *ksp, int rw)
1912 1912  {
1913 1913          zone_t *zone = ksp->ks_private;
1914 1914          zone_misc_kstat_t *zmp = ksp->ks_data;
1915 1915          hrtime_t hrtime;
1916 1916          uint64_t tmp;
1917 1917  
1918 1918          if (rw == KSTAT_WRITE)
1919 1919                  return (EACCES);
1920 1920  
1921 1921          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
1922 1922          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1923 1923          scalehrtime(&hrtime);
1924 1924          zmp->zm_stime.value.ui64 = hrtime;
1925 1925  
1926 1926          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
1927 1927          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1928 1928          scalehrtime(&hrtime);
1929 1929          zmp->zm_utime.value.ui64 = hrtime;
1930 1930  
1931 1931          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
1932 1932          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
1933 1933          scalehrtime(&hrtime);
1934 1934          zmp->zm_wtime.value.ui64 = hrtime;
1935 1935  
1936 1936          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
1937 1937          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
1938 1938          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
1939 1939  
1940 1940          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
1941 1941          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
1942 1942          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
1943 1943          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
1944 1944  
1945 1945          zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
1946 1946  
1947 1947          zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
1948 1948          zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
1949 1949  
1950 1950          return (0);
1951 1951  }
1952 1952  
1953 1953  static kstat_t *
1954 1954  zone_misc_kstat_create(zone_t *zone)
1955 1955  {
1956 1956          kstat_t *ksp;
1957 1957          zone_misc_kstat_t *zmp;
1958 1958  
1959 1959          if ((ksp = kstat_create_zone("zones", zone->zone_id,
1960 1960              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
1961 1961              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
1962 1962              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1963 1963                  return (NULL);
1964 1964  
1965 1965          if (zone->zone_id != GLOBAL_ZONEID)
1966 1966                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1967 1967  
1968 1968          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
1969 1969          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1970 1970          ksp->ks_lock = &zone->zone_misc_lock;
1971 1971          zone->zone_misc_stats = zmp;
1972 1972  
1973 1973          /* The kstat "name" field is not large enough for a full zonename */
1974 1974          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1975 1975          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
1976 1976          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
1977 1977          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
1978 1978          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
1979 1979          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
1980 1980          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
1981 1981          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
1982 1982              KSTAT_DATA_UINT32);
1983 1983          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
1984 1984          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
1985 1985              KSTAT_DATA_UINT32);
1986 1986          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
1987 1987          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
1988 1988          kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
1989 1989              KSTAT_DATA_UINT32);
1990 1990          kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
1991 1991          kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
1992 1992  
1993 1993          ksp->ks_update = zone_misc_kstat_update;
1994 1994          ksp->ks_private = zone;
1995 1995  
1996 1996          kstat_install(ksp);
1997 1997          return (ksp);
1998 1998  }
1999 1999  
2000 2000  static void
2001 2001  zone_kstat_create(zone_t *zone)
2002 2002  {
2003 2003          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2004 2004              "lockedmem", zone_lockedmem_kstat_update);
2005 2005          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2006 2006              "swapresv", zone_swapresv_kstat_update);
2007 2007          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2008 2008              "nprocs", zone_nprocs_kstat_update);
2009 2009  
2010 2010          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2011 2011                  zone->zone_mcap_stats = kmem_zalloc(
2012 2012                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
2013 2013          }
2014 2014  
2015 2015          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2016 2016                  zone->zone_misc_stats = kmem_zalloc(
2017 2017                      sizeof (zone_misc_kstat_t), KM_SLEEP);
2018 2018          }
2019 2019  }
2020 2020  
2021 2021  static void
2022 2022  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2023 2023  {
2024 2024          void *data;
2025 2025  
2026 2026          if (*pkstat != NULL) {
2027 2027                  data = (*pkstat)->ks_data;
2028 2028                  kstat_delete(*pkstat);
2029 2029                  kmem_free(data, datasz);
2030 2030                  *pkstat = NULL;
2031 2031          }
2032 2032  }
2033 2033  
2034 2034  static void
2035 2035  zone_kstat_delete(zone_t *zone)
2036 2036  {
2037 2037          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2038 2038              sizeof (zone_kstat_t));
2039 2039          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2040 2040              sizeof (zone_kstat_t));
2041 2041          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2042 2042              sizeof (zone_kstat_t));
2043 2043          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2044 2044              sizeof (zone_mcap_kstat_t));
2045 2045          zone_kstat_delete_common(&zone->zone_misc_ksp,
2046 2046              sizeof (zone_misc_kstat_t));
2047 2047  }
2048 2048  
2049 2049  /*
2050 2050   * Called very early on in boot to initialize the ZSD list so that
2051 2051   * zone_key_create() can be called before zone_init().  It also initializes
2052 2052   * portions of zone0 which may be used before zone_init() is called.  The
2053 2053   * variable "global_zone" will be set when zone0 is fully initialized by
2054 2054   * zone_init().
2055 2055   */
2056 2056  void
2057 2057  zone_zsd_init(void)
2058 2058  {
2059 2059          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2060 2060          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2061 2061          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2062 2062              offsetof(struct zsd_entry, zsd_linkage));
2063 2063          list_create(&zone_active, sizeof (zone_t),
2064 2064              offsetof(zone_t, zone_linkage));
2065 2065          list_create(&zone_deathrow, sizeof (zone_t),
2066 2066              offsetof(zone_t, zone_linkage));
2067 2067  
2068 2068          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2069 2069          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2070 2070          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2071 2071          zone0.zone_shares = 1;
2072 2072          zone0.zone_nlwps = 0;
2073 2073          zone0.zone_nlwps_ctl = INT_MAX;
2074 2074          zone0.zone_nprocs = 0;
2075 2075          zone0.zone_nprocs_ctl = INT_MAX;
2076 2076          zone0.zone_locked_mem = 0;
2077 2077          zone0.zone_locked_mem_ctl = UINT64_MAX;
2078 2078          ASSERT(zone0.zone_max_swap == 0);
2079 2079          zone0.zone_max_swap_ctl = UINT64_MAX;
2080 2080          zone0.zone_max_lofi = 0;
2081 2081          zone0.zone_max_lofi_ctl = UINT64_MAX;
2082 2082          zone0.zone_shmmax = 0;
2083 2083          zone0.zone_ipc.ipcq_shmmni = 0;
2084 2084          zone0.zone_ipc.ipcq_semmni = 0;
2085 2085          zone0.zone_ipc.ipcq_msgmni = 0;
2086 2086          zone0.zone_name = GLOBAL_ZONENAME;
2087 2087          zone0.zone_nodename = utsname.nodename;
2088 2088          zone0.zone_domain = srpc_domain;
2089 2089          zone0.zone_hostid = HW_INVALID_HOSTID;
2090 2090          zone0.zone_fs_allowed = NULL;
2091 2091          psecflags_default(&zone0.zone_secflags);
2092 2092          zone0.zone_ref = 1;
2093 2093          zone0.zone_id = GLOBAL_ZONEID;
2094 2094          zone0.zone_status = ZONE_IS_RUNNING;
2095 2095          zone0.zone_rootpath = "/";
2096 2096          zone0.zone_rootpathlen = 2;
2097 2097          zone0.zone_psetid = ZONE_PS_INVAL;
2098 2098          zone0.zone_ncpus = 0;
2099 2099          zone0.zone_ncpus_online = 0;
2100 2100          zone0.zone_proc_initpid = 1;
2101 2101          zone0.zone_initname = initname;
2102 2102          zone0.zone_lockedmem_kstat = NULL;
2103 2103          zone0.zone_swapresv_kstat = NULL;
2104 2104          zone0.zone_nprocs_kstat = NULL;
2105 2105  
2106 2106          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2107 2107              offsetof(zone_ref_t, zref_linkage));
2108 2108          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2109 2109              offsetof(struct zsd_entry, zsd_linkage));
2110 2110          list_insert_head(&zone_active, &zone0);
2111 2111  
2112 2112          /*
2113 2113           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2114 2114           * to anything meaningful.  It is assigned to be 'rootdir' in
2115 2115           * vfs_mountroot().
2116 2116           */
2117 2117          zone0.zone_rootvp = NULL;
2118 2118          zone0.zone_vfslist = NULL;
2119 2119          zone0.zone_bootargs = initargs;
2120 2120          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2121 2121          /*
2122 2122           * The global zone has all privileges
2123 2123           */
2124 2124          priv_fillset(zone0.zone_privset);
2125 2125          /*
2126 2126           * Add p0 to the global zone
2127 2127           */
2128 2128          zone0.zone_zsched = &p0;
2129 2129          p0.p_zone = &zone0;
2130 2130  }
2131 2131  
2132 2132  /*
2133 2133   * Compute a hash value based on the contents of the label and the DOI.  The
2134 2134   * hash algorithm is somewhat arbitrary, but is based on the observation that
2135 2135   * humans will likely pick labels that differ by amounts that work out to be
2136 2136   * multiples of the number of hash chains, and thus stirring in some primes
2137 2137   * should help.
            *
            * hdata is unused.  key is interpreted as a pointer to a ts_label_t; its
            * tsl_label member is read as an array of 32-bit words, each of which is
            * mixed into the hash with a shift of 1 to 16 bits chosen by its position.
2138 2138   */
2139 2139  static uint_t
2140 2140  hash_bylabel(void *hdata, mod_hash_key_t key)
2141 2141  {
2142 2142          const ts_label_t *lab = (ts_label_t *)key;
2143 2143          const uint32_t *up, *ue;
2144 2144          uint_t hash;
2145 2145          int i;
2146 2146  
2147 2147          _NOTE(ARGUNUSED(hdata));
2148 2148  
                   /* seed the hash with 3 * DOI */
2149 2149          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2150 2150          /* we depend on alignment of label, but not representation */
2151 2151          up = (const uint32_t *)&lab->tsl_label;
2152 2152          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2153 2153          i = 1;
2154 2154          while (up < ue) {
2155 2155                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2156 2156                  hash += *up + (*up << ((i % 16) + 1));
2157 2157                  up++;
2158 2158                  i++;
2159 2159          }
2160 2160          return (hash);
2161 2161  }
2162 2162  
2163 2163  /*
2164 2164   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2165 2165   * equal).  This may need to be changed if less than / greater than is ever
2166 2166   * needed.
            *
            * Both keys are interpreted as ts_label_t pointers.  Returns 0 when
            * label_equal() reports them equal and 1 otherwise.
2167 2167   */
2168 2168  static int
2169 2169  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2170 2170  {
2171 2171          ts_label_t *lab1 = (ts_label_t *)key1;
2172 2172          ts_label_t *lab2 = (ts_label_t *)key2;
2173 2173  
2174 2174          return (label_equal(lab1, lab2) ? 0 : 1);
2175 2175  }
2176 2176  
2177 2177  /*
2178 2178   * Called by main() to initialize the zones framework.
            *
            * In order, this routine:
            *   - creates the zone ID space (IDs 1 .. MAX_ZONEID);
            *   - registers the zone.* resource controls and installs a privileged
            *     default value of 1 for zone.cpu-shares;
            *   - finishes setting up zone0 (rctl set, counters, brand, kstats,
            *     label, locks, zone_ustate);
            *   - creates the by-id and by-name hashes (and the by-label hash on
            *     labeled systems) and inserts zone0 into them;
            *   - sets zone0.zone_kcred and publishes zone0 as global_zone;
            *   - binds the zone status event channel, panicking on failure.
            *
            * It asserts that it is running as p0.
2179 2179   */
2180 2180  void
2181 2181  zone_init(void)
2182 2182  {
2183 2183          rctl_dict_entry_t *rde;
2184 2184          rctl_val_t *dval;
2185 2185          rctl_set_t *set;
2186 2186          rctl_alloc_gp_t *gp;
2187 2187          rctl_entity_p_t e;
2188 2188          int res;
2189 2189  
2190 2190          ASSERT(curproc == &p0);
2191 2191  
2192 2192          /*
2193 2193           * Create ID space for zone IDs.  ID 0 is reserved for the
2194 2194           * global zone.
2195 2195           */
2196 2196          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2197 2197  
2198 2198          /*
2199 2199           * Initialize generic zone resource controls, if any.
2200 2200           */
2201 2201          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2202 2202              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2203 2203              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2204 2204              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2205 2205  
2206 2206          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2207 2207              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2208 2208              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2209 2209              RCTL_GLOBAL_INFINITE,
2210 2210              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2211 2211  
2212 2212          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2213 2213              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2214 2214              INT_MAX, INT_MAX, &zone_lwps_ops);
2215 2215  
2216 2216          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2217 2217              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2218 2218              INT_MAX, INT_MAX, &zone_procs_ops);
2219 2219  
2220 2220          /*
2221 2221           * System V IPC resource controls
2222 2222           */
2223 2223          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2224 2224              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2225 2225              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2226 2226  
2227 2227          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2228 2228              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2229 2229              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2230 2230  
2231 2231          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2232 2232              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2233 2233              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2234 2234  
2235 2235          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2236 2236              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2237 2237              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2238 2238  
2239 2239          /*
2240 2240           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2241 2241           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2242 2242           */
2243 2243          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2244 2244          bzero(dval, sizeof (rctl_val_t));
2245 2245          dval->rcv_value = 1;
2246 2246          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2247 2247          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2248 2248          dval->rcv_action_recip_pid = -1;
2249 2249  
2250 2250          rde = rctl_dict_lookup("zone.cpu-shares");
2251 2251          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2252 2252  
2253 2253          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2254 2254              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2255 2255              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2256 2256              &zone_locked_mem_ops);
2257 2257  
2258 2258          rc_zone_max_swap = rctl_register("zone.max-swap",
2259 2259              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2260 2260              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2261 2261              &zone_max_swap_ops);
2262 2262  
2263 2263          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2264 2264              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2265 2265              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2266 2266              &zone_max_lofi_ops);
2267 2267  
2268 2268          /*
2269 2269           * Initialize the ``global zone''.
2270 2270           */
2271 2271          set = rctl_set_create();
2272 2272          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2273 2273          mutex_enter(&p0.p_lock);
2274 2274          e.rcep_p.zone = &zone0;
2275 2275          e.rcep_t = RCENTITY_ZONE;
2276 2276          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2277 2277              gp);
2278 2278  
2279 2279          zone0.zone_nlwps = p0.p_lwpcnt;
2280 2280          zone0.zone_nprocs = 1;
2281 2281          zone0.zone_ntasks = 1;
2282 2282          mutex_exit(&p0.p_lock);
2283 2283          zone0.zone_restart_init = B_TRUE;
2284 2284          zone0.zone_brand = &native_brand;
2285 2285          rctl_prealloc_destroy(gp);
2286 2286          /*
2287 2287           * pool_default hasn't been initialized yet, so we let pool_init()
2288 2288           * take care of making sure the global zone is in the default pool.
2289 2289           */
2290 2290  
2291 2291          /*
2292 2292           * Initialize global zone kstats
2293 2293           */
2294 2294          zone_kstat_create(&zone0);
2295 2295  
2296 2296          /*
2297 2297           * Initialize zone label.
2298 2298           * mlp are initialized when tnzonecfg is loaded.
2299 2299           */
2300 2300          zone0.zone_slabel = l_admin_low;
2301 2301          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2302 2302          label_hold(l_admin_low);
2303 2303  
2304 2304          /*
2305 2305           * Initialise the lock for the database structure used by mntfs.
2306 2306           */
2307 2307          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2308 2308  
2309 2309          zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2310 2310  
2311 2311          mutex_enter(&zonehash_lock);
2312 2312          zone_uniqid(&zone0);
2313 2313          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2314 2314  
2315 2315          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2316 2316              mod_hash_null_valdtor);
2317 2317          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2318 2318              zone_hash_size, mod_hash_null_valdtor);
2319 2319          /*
2320 2320           * maintain zonehashbylabel only for labeled systems
2321 2321           */
2322 2322          if (is_system_labeled())
2323 2323                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2324 2324                      zone_hash_size, mod_hash_null_keydtor,
2325 2325                      mod_hash_null_valdtor, hash_bylabel, NULL,
2326 2326                      hash_labelkey_cmp, KM_SLEEP);
                   /* zone0 is the only zone inserted into the hashes below */
2327 2327          zonecount = 1;
2328 2328  
2329 2329          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2330 2330              (mod_hash_val_t)&zone0);
2331 2331          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2332 2332              (mod_hash_val_t)&zone0);
2333 2333          if (is_system_labeled()) {
2334 2334                  zone0.zone_flags |= ZF_HASHED_LABEL;
2335 2335                  (void) mod_hash_insert(zonehashbylabel,
2336 2336                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2337 2337          }
2338 2338          mutex_exit(&zonehash_lock);
2339 2339  
2340 2340          /*
2341 2341           * We avoid setting zone_kcred until now, since kcred is initialized
2342 2342           * sometime after zone_zsd_init() and before zone_init().
2343 2343           */
2344 2344          zone0.zone_kcred = kcred;
2345 2345          /*
2346 2346           * The global zone is fully initialized (except for zone_rootvp which
2347 2347           * will be set when the root filesystem is mounted).
2348 2348           */
2349 2349          global_zone = &zone0;
2350 2350  
2351 2351          /*
2352 2352           * Setup an event channel to send zone status change notifications on
2353 2353           */
2354 2354          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2355 2355              EVCH_CREAT);
2356 2356  
                   /* a non-zero return from sysevent_evc_bind() is treated as fatal */
2357 2357          if (res)
2358 2358                  panic("Sysevent_evc_bind failed during zone setup.\n");
2359 2359  
2360 2360  }
2361 2361  
/*
 * Tear down a zone_t and free it.
 *
 * The zone must not be the global zone.  The ASSERTs below spell out the
 * expected state on entry: no tasks, LWPs, processes or cred references,
 * zone_kcred already NULL, status ZONE_IS_DEAD or ZONE_IS_UNINITIALIZED, and
 * an empty zone_ref_list.  A ZONE_IS_DEAD zone is also unlinked from
 * zone_deathrow.
 *
 * In the new revision shown in this listing, VZONEROOT is cleared on the
 * zone's root vnode (under v_lock) before the vnode hold is released.
 */
2362 2362  static void
2363 2363  zone_free(zone_t *zone)
2364 2364  {
2365 2365          ASSERT(zone != global_zone);
2366 2366          ASSERT(zone->zone_ntasks == 0);
2367 2367          ASSERT(zone->zone_nlwps == 0);
2368 2368          ASSERT(zone->zone_nprocs == 0);
2369 2369          ASSERT(zone->zone_cred_ref == 0);
2370 2370          ASSERT(zone->zone_kcred == NULL);
2371 2371          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2372 2372              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2373 2373          ASSERT(list_is_empty(&zone->zone_ref_list));
2374 2374  
2375 2375          /*
2376 2376           * Remove any zone caps.
2377 2377           */
2378 2378          cpucaps_zone_remove(zone);
2379 2379  
2380 2380          ASSERT(zone->zone_cpucap == NULL);
2381 2381  
2382 2382          /* remove from deathrow list */
2383 2383          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2384 2384                  ASSERT(zone->zone_ref == 0);
2385 2385                  mutex_enter(&zone_deathrow_lock);
2386 2386                  list_remove(&zone_deathrow, zone);
  
    | 
      ↓ open down ↓ | 
    2386 lines elided | 
    
      ↑ open up ↑ | 
  
2387 2387                  mutex_exit(&zone_deathrow_lock);
2388 2388          }
2389 2389  
2390 2390          list_destroy(&zone->zone_ref_list);
2391 2391          zone_free_zsd(zone);
2392 2392          zone_free_datasets(zone);
2393 2393          list_destroy(&zone->zone_dl_list);
2394 2394  
2395 2395          cpu_uarray_free(zone->zone_ustate);
2396 2396  
2397      -        if (zone->zone_rootvp != NULL)
2398      -                VN_RELE(zone->zone_rootvp);
     2397 +        if (zone->zone_rootvp != NULL) {
     2398 +                vnode_t *vp = zone->zone_rootvp;
     2399 +
     2400 +                mutex_enter(&vp->v_lock);
     2401 +                vp->v_flag &= ~VZONEROOT;
     2402 +                mutex_exit(&vp->v_lock);
     2403 +                VN_RELE(vp);
     2404 +                /* No need to worry about NULL-ing out zone_rootvp. */
     2405 +        }
2399 2406          if (zone->zone_rootpath)
2400 2407                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2401 2408          if (zone->zone_name != NULL)
2402 2409                  kmem_free(zone->zone_name, ZONENAME_MAX);
2403 2410          if (zone->zone_slabel != NULL)
2404 2411                  label_rele(zone->zone_slabel);
2405 2412          if (zone->zone_nodename != NULL)
2406 2413                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2407 2414          if (zone->zone_domain != NULL)
2408 2415                  kmem_free(zone->zone_domain, _SYS_NMLN);
2409 2416          if (zone->zone_privset != NULL)
2410 2417                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2411 2418          if (zone->zone_rctls != NULL)
2412 2419                  rctl_set_free(zone->zone_rctls);
2413 2420          if (zone->zone_bootargs != NULL)
2414 2421                  strfree(zone->zone_bootargs);
2415 2422          if (zone->zone_initname != NULL)
2416 2423                  strfree(zone->zone_initname);
2417 2424          if (zone->zone_fs_allowed != NULL)
2418 2425                  strfree(zone->zone_fs_allowed);
2419 2426          if (zone->zone_pfexecd != NULL)
2420 2427                  klpd_freelist(&zone->zone_pfexecd);
2421 2428          id_free(zoneid_space, zone->zone_id);
2422 2429          mutex_destroy(&zone->zone_lock);
2423 2430          cv_destroy(&zone->zone_cv);
2424 2431          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2425 2432          rw_destroy(&zone->zone_mntfs_db_lock);
2426 2433          kmem_free(zone, sizeof (zone_t));
2427 2434  }
2428 2435  
2429 2436  /*
2430 2437   * See block comment at the top of this file for information about zone
2431 2438   * status values.
2432 2439   */
2433 2440  /*
2434 2441   * Convenience function for setting zone status.
            *
            * The caller must hold zone_status_lock, and the new status must be a
            * valid state that is not earlier than the zone's current status (both
            * are ASSERTed).  A status-change event carrying the zone name, new and
            * old state names, zone ID and a timestamp is published on
            * zone_event_chan.  If building or publishing the event fails, a message
            * is printed on DEBUG kernels only and the status is still updated.
            * Waiters on zone_cv are woken after the status is stored.
2435 2442   */
2436 2443  static void
2437 2444  zone_status_set(zone_t *zone, zone_status_t status)
2438 2445  {
2439 2446  
2440 2447          nvlist_t *nvl = NULL;
2441 2448          ASSERT(MUTEX_HELD(&zone_status_lock));
2442 2449          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2443 2450              status >= zone_status_get(zone));
2444 2451  
                   /* the first failing step short-circuits the rest of the chain */
2445 2452          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2446 2453              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2447 2454              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2448 2455              zone_status_table[status]) ||
2449 2456              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2450 2457              zone_status_table[zone->zone_status]) ||
2451 2458              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2452 2459              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2453 2460              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2454 2461              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2455 2462  #ifdef DEBUG
2456 2463                  (void) printf(
2457 2464                      "Failed to allocate and send zone state change event.\n");
2458 2465  #endif
2459 2466          }
2460 2467          nvlist_free(nvl);
2461 2468  
2462 2469          zone->zone_status = status;
2463 2470  
2464 2471          cv_broadcast(&zone->zone_cv);
2465 2472  }
2466 2473  
2467 2474  /*
2468 2475   * Public function to retrieve the zone status.  The zone status may
2469 2476   * change after it is retrieved.
            *
            * This simply reads zone->zone_status; no lock is taken here.
2470 2477   */
2471 2478  zone_status_t
2472 2479  zone_status_get(zone_t *zone)
2473 2480  {
2474 2481          return (zone->zone_status);
2475 2482  }
2476 2483  
/*
 * Replace a non-global zone's boot arguments with a string copied in via
 * copyinstr() from zone_bootargs (at most BOOTARGS_MAX bytes).
 *
 * Returns 0 on success, or the copyinstr() error, in which case the zone's
 * existing boot arguments are left untouched.
 */
2477 2484  static int
2478 2485  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2479 2486  {
2480 2487          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2481 2488          int err = 0;
2482 2489  
2483 2490          ASSERT(zone != global_zone);
2484 2491          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2485 2492                  goto done;      /* EFAULT or ENAMETOOLONG */
2486 2493  
                   /* drop the previous string before installing the new copy */
2487 2494          if (zone->zone_bootargs != NULL)
2488 2495                  strfree(zone->zone_bootargs);
2489 2496  
2490 2497          zone->zone_bootargs = strdup(buf);
2491 2498  
2492 2499  done:
2493 2500          kmem_free(buf, BOOTARGS_MAX);
2494 2501          return (err);
2495 2502  }
2496 2503  
/*
 * Brand a zone using a struct brand_attr copied in from `brand'.
 *
 * Returns EFAULT if the attribute structure cannot be copied in, EINVAL if
 * brand_register_zone() does not return a brand or if the zone is already
 * branded or has reached ZONE_IS_BOOTING, and 0 on success.  On the
 * late EINVAL path the brand registration taken here is undone.
 */
2497 2504  static int
2498 2505  zone_set_brand(zone_t *zone, const char *brand)
2499 2506  {
2500 2507          struct brand_attr *attrp;
2501 2508          brand_t *bp;
2502 2509  
2503 2510          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2504 2511          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2505 2512                  kmem_free(attrp, sizeof (struct brand_attr));
2506 2513                  return (EFAULT);
2507 2514          }
2508 2515  
2509 2516          bp = brand_register_zone(attrp);
2510 2517          kmem_free(attrp, sizeof (struct brand_attr));
2511 2518          if (bp == NULL)
2512 2519                  return (EINVAL);
2513 2520  
2514 2521          /*
2515 2522           * This is the only place where a zone can change its brand.
2516 2523           * We already need to hold zone_status_lock to check the zone
2517 2524           * status, so we'll just use that lock to serialize zone
2518 2525           * branding requests as well.
2519 2526           */
2520 2527          mutex_enter(&zone_status_lock);
2521 2528  
2522 2529          /* Re-Branding is not allowed and the zone can't be booted yet */
2523 2530          if ((ZONE_IS_BRANDED(zone)) ||
2524 2531              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2525 2532                  mutex_exit(&zone_status_lock);
2526 2533                  brand_unregister_zone(bp);
2527 2534                  return (EINVAL);
2528 2535          }
2529 2536  
2530 2537          /* set up the brand specific data */
2531 2538          zone->zone_brand = bp;
2532 2539          ZBROP(zone)->b_init_brand_data(zone);
2533 2540  
2534 2541          mutex_exit(&zone_status_lock);
2535 2542          return (0);
2536 2543  }
2537 2544  
/*
 * Set a non-global zone's security flags from a psecflags_t copied in from
 * zone_secflags, and mirror them onto the zone's zsched process.
 *
 * Returns EFAULT if the flags cannot be copied in, EINVAL if the zone is
 * past ZONE_IS_READY or the flags fail psecflags_validate(), and 0 on
 * success.
 *
 * copyin() signals failure with a non-zero value that is not an errno, so
 * the failure is mapped to EFAULT here, as zone_set_brand() does.
 */
2538 2545  static int
2539 2546  zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
2540 2547  {
2542 2549          psecflags_t psf;
2543 2550  
2544 2551          ASSERT(zone != global_zone);
2545 2552  
2546 2553          if (copyin(zone_secflags, &psf, sizeof (psf)) != 0)
2547 2554                  return (EFAULT);
2548 2555  
2549 2556          if (zone_status_get(zone) > ZONE_IS_READY)
2550 2557                  return (EINVAL);
2551 2558  
2552 2559          if (!psecflags_validate(&psf))
2553 2560                  return (EINVAL);
2554 2561  
2555 2562          (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
2556 2563  
2557 2564          /* Set security flags on the zone's zsched */
2558 2565          (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
2559 2566              sizeof (zone->zone_zsched->p_secflags));
2560 2567  
2561 2568          return (0);
2562 2569  }
2563 2570  
/*
 * Replace a non-global zone's zone_fs_allowed string with one copied in via
 * copyinstr() from zone_fs_allowed (at most ZONE_FS_ALLOWED_MAX bytes).
 *
 * Returns 0 on success, or the copyinstr() error, in which case the zone's
 * existing string is left untouched.
 */
2564 2571  static int
2565 2572  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2566 2573  {
2567 2574          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2568 2575          int err = 0;
2569 2576  
2570 2577          ASSERT(zone != global_zone);
2571 2578          if ((err = copyinstr(zone_fs_allowed, buf,
2572 2579              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2573 2580                  goto done;
2574 2581  
                   /* drop the previous string before installing the new copy */
2575 2582          if (zone->zone_fs_allowed != NULL)
2576 2583                  strfree(zone->zone_fs_allowed);
2577 2584  
2578 2585          zone->zone_fs_allowed = strdup(buf);
2579 2586  
2580 2587  done:
2581 2588          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2582 2589          return (err);
2583 2590  }
2584 2591  
/*
 * Replace a non-global zone's zone_initname with a string copied in via
 * copyinstr() from zone_initname (at most INITNAME_SZ bytes, staged in an
 * on-stack buffer).
 *
 * Returns 0 on success, or the copyinstr() error, in which case the zone's
 * existing name is left untouched.
 */
2585 2592  static int
2586 2593  zone_set_initname(zone_t *zone, const char *zone_initname)
2587 2594  {
2588 2595          char initname[INITNAME_SZ];
2589 2596          size_t len;
2590 2597          int err = 0;
2591 2598  
2592 2599          ASSERT(zone != global_zone);
2593 2600          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2594 2601                  return (err);   /* EFAULT or ENAMETOOLONG */
2595 2602  
2596 2603          if (zone->zone_initname != NULL)
2597 2604                  strfree(zone->zone_initname);
2598 2605  
                   /* sized as strlen + 1 so that the strfree() above matches it */
2599 2606          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2600 2607          (void) strcpy(zone->zone_initname, initname);
2601 2608          return (0);
2602 2609  }
2603 2610  
/*
 * Store a 64-bit value copied in from zone_mcap into zone->zone_phys_mcap.
 *
 * Returns 0 on success or EFAULT if the value cannot be copied in; the
 * zone is left unchanged on failure.
 *
 * copyin() signals failure with a non-zero value that is not an errno, so
 * the failure is mapped to EFAULT here, as zone_set_brand() does.
 */
2604 2611  static int
2605 2612  zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2606 2613  {
2607 2614          uint64_t mcap;
2609 2616  
2610 2617          if (copyin(zone_mcap, &mcap, sizeof (uint64_t)) != 0)
                           return (EFAULT);
           
2611 2618          zone->zone_phys_mcap = mcap;
2612 2619  
2613 2620          return (0);
2614 2621  }
2615 2622  
/*
 * Set a non-global zone's default scheduling class from a class name copied
 * in via copyinstr() from new_class (at most PC_CLNMSZ bytes).
 *
 * Returns the copyinstr() error if the name cannot be copied in, EINVAL if
 * getcid() does not resolve the name or the class satisfies CLASS_KERNEL(),
 * and 0 on success.
 *
 * Every path returns a plain errno value, like the other zone_set_*()
 * helpers in this file; set_errno() is not called here.
 */
2616 2623  static int
2617 2624  zone_set_sched_class(zone_t *zone, const char *new_class)
2618 2625  {
2619 2626          char sched_class[PC_CLNMSZ];
2620 2627          id_t classid;
2621 2628          int err;
2622 2629  
2623 2630          ASSERT(zone != global_zone);
2624 2631          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2625 2632                  return (err);   /* EFAULT or ENAMETOOLONG */
2626 2633  
2627 2634          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2628 2635                  return (EINVAL);
2629 2636          zone->zone_defaultcid = classid;
2630 2637          ASSERT(zone->zone_defaultcid > 0 &&
2631 2638              zone->zone_defaultcid < loaded_classes);
2632 2639  
2633 2640          return (0);
2634 2641  }
2635 2642  
2636 2643  /*
2637 2644   * Block indefinitely waiting for (zone_status >= status)
            *
            * Sleeps on zone->zone_cv under zone_status_lock and re-checks the status
            * after every wakeup.  Returns immediately if the zone is already at or
            * past the requested status.
2638 2645   */
2639 2646  void
2640 2647  zone_status_wait(zone_t *zone, zone_status_t status)
2641 2648  {
2642 2649          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2643 2650  
2644 2651          mutex_enter(&zone_status_lock);
2645 2652          while (zone->zone_status < status) {
2646 2653                  cv_wait(&zone->zone_cv, &zone_status_lock);
2647 2654          }
2648 2655          mutex_exit(&zone_status_lock);
2649 2656  }
2650 2657  
2651 2658  /*
2652 2659   * Private CPR-safe version of zone_status_wait().
            *
            * str is handed to CALLB_CPR_INIT() along with zone_status_lock and
            * callb_generic_cpr.  Each cv_wait() is bracketed by
            * CALLB_CPR_SAFE_BEGIN()/CALLB_CPR_SAFE_END().  There is no explicit
            * mutex_exit(): CALLB_CPR_EXIT() drops zone_status_lock.
2653 2660   */
2654 2661  static void
2655 2662  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2656 2663  {
2657 2664          callb_cpr_t cprinfo;
2658 2665  
2659 2666          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2660 2667  
2661 2668          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2662 2669              str);
2663 2670          mutex_enter(&zone_status_lock);
2664 2671          while (zone->zone_status < status) {
2665 2672                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
2666 2673                  cv_wait(&zone->zone_cv, &zone_status_lock);
2667 2674                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2668 2675          }
2669 2676          /*
2670 2677           * zone_status_lock is implicitly released by the following.
2671 2678           */
2672 2679          CALLB_CPR_EXIT(&cprinfo);
2673 2680  }
2674 2681  
2675 2682  /*
2676 2683   * Block until zone enters requested state or signal is received.  Return (0)
2677 2684   * if signaled, non-zero otherwise.
            *
            * The non-zero value returned when the state is reached is 1.  If the
            * zone is already at or past the requested status, 1 is returned without
            * sleeping.
2678 2685   */
2679 2686  int
2680 2687  zone_status_wait_sig(zone_t *zone, zone_status_t status)
2681 2688  {
2682 2689          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2683 2690  
2684 2691          mutex_enter(&zone_status_lock);
2685 2692          while (zone->zone_status < status) {
                           /* a zero return from cv_wait_sig() is reported as "signaled" */
2686 2693                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2687 2694                          mutex_exit(&zone_status_lock);
2688 2695                          return (0);
2689 2696                  }
2690 2697          }
2691 2698          mutex_exit(&zone_status_lock);
2692 2699          return (1);
2693 2700  }
2694 2701  
2695 2702  /*
2696 2703   * Block until the zone enters the requested state or the timeout expires,
2697 2704   * whichever happens first.  Return (-1) if operation timed out, time remaining
2698 2705   * otherwise.
            *
            * tim is passed unchanged to cv_timedwait() on every iteration.  If the
            * zone is already at or past the requested status on entry, 0 is returned
            * without waiting.
2699 2706   */
2700 2707  clock_t
2701 2708  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2702 2709  {
2703 2710          clock_t timeleft = 0;
2704 2711  
2705 2712          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2706 2713  
2707 2714          mutex_enter(&zone_status_lock);
                   /* stop once the state is reached or cv_timedwait() returns -1 */
2708 2715          while (zone->zone_status < status && timeleft != -1) {
2709 2716                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2710 2717          }
2711 2718          mutex_exit(&zone_status_lock);
2712 2719          return (timeleft);
2713 2720  }
2714 2721  
2715 2722  /*
2716 2723   * Block until the zone enters the requested state, the current process is
2717 2724   * signaled, or the timeout expires, whichever happens first.  Return (-1) if
2718 2725   * operation timed out, 0 if signaled, time remaining otherwise.
            *
            * NOTE(review): tim looks like an absolute lbolt-based deadline, since the
            * initial "time remaining" is computed as tim - ddi_get_lbolt(); confirm
            * against callers.  That initial value is what is returned when the zone
            * is already at or past the requested status on entry.
2719 2726   */
2720 2727  clock_t
2721 2728  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2722 2729  {
2723 2730          clock_t timeleft = tim - ddi_get_lbolt();
2724 2731  
2725 2732          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2726 2733  
2727 2734          mutex_enter(&zone_status_lock);
2728 2735          while (zone->zone_status < status) {
2729 2736                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2730 2737                      tim);
                           /* 0 (signaled) and -1 (timed out) both end the wait */
2731 2738                  if (timeleft <= 0)
2732 2739                          break;
2733 2740          }
2734 2741          mutex_exit(&zone_status_lock);
2735 2742          return (timeleft);
2736 2743  }
2737 2744  
2738 2745  /*
2739 2746   * Zones have two reference counts: one for references from credential
2740 2747   * structures (zone_cred_ref), and one (zone_ref) for everything else.
2741 2748   * This is so we can allow a zone to be rebooted while there are still
2742 2749   * outstanding cred references, since certain drivers cache dblks (which
2743 2750   * implicitly results in cached creds).  We wait for zone_ref to drop to
2744 2751   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2745 2752   * later freed when the zone_cred_ref drops to 0, though nothing other
2746 2753   * than the zone id and privilege set should be accessed once the zone
2747 2754   * is "dead".
2748 2755   *
2749 2756   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2750 2757   * to force halt/reboot to block waiting for the zone_cred_ref to drop
2751 2758   * to 0.  This can be useful to flush out other sources of cached creds
2752 2759   * that may be less innocuous than the driver case.
2753 2760   *
2754 2761   * Zones also provide a tracked reference counting mechanism in which zone
2755 2762   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
2756 2763   * debuggers determine the sources of leaked zone references.  See
2757 2764   * zone_hold_ref() and zone_rele_ref() below for more information.
2758 2765   */
2759 2766  
/* When non-zero, ZONE_IS_UNREF() also requires zone_cred_ref to be 0. */
2760 2767  int zone_wait_for_cred = 0;
2761 2768  
/*
 * Increment z->zone_ref.  The caller must already hold z->zone_lock (this is
 * ASSERTed).  The trailing ASSERT checks that the counter is not 0 after the
 * increment.
 */
2762 2769  static void
2763 2770  zone_hold_locked(zone_t *z)
2764 2771  {
2765 2772          ASSERT(MUTEX_HELD(&z->zone_lock));
2766 2773          z->zone_ref++;
2767 2774          ASSERT(z->zone_ref != 0);
2768 2775  }
2769 2776  
2770 2777  /*
2771 2778   * Increment the specified zone's reference count.  The zone's zone_t structure
2772 2779   * will not be freed as long as the zone's reference count is nonzero.
2773 2780   * Decrement the zone's reference count via zone_rele().
2774 2781   *
2775 2782   * NOTE: This function should only be used to hold zones for short periods of
2776 2783   * time.  Use zone_hold_ref() if the zone must be held for a long time.
            *
            * z->zone_lock is taken and dropped around the increment.
2777 2784   */
2778 2785  void
2779 2786  zone_hold(zone_t *z)
2780 2787  {
2781 2788          mutex_enter(&z->zone_lock);
2782 2789          zone_hold_locked(z);
2783 2790          mutex_exit(&z->zone_lock);
2784 2791  }
2785 2792  
2786 2793  /*
2787 2794   * If the non-cred ref count drops to 1 and either the cred ref count
2788 2795   * is 0 or we aren't waiting for cred references, the zone is ready to
2789 2796   * be destroyed.
            *
            * The macro argument is evaluated twice, so it must not have side
            * effects.
2790 2797   */
2791 2798  #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
2792 2799              (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2793 2800  
2794 2801  /*
2795 2802   * Common zone reference release function invoked by zone_rele() and
2796 2803   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
2797 2804   * zone's subsystem-specific reference counters are not affected by the
2798 2805   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
2799 2806   * removed from the specified zone's reference list.  ref must be non-NULL iff
2800 2807   * subsys is not ZONE_REF_NUM_SUBSYS.
            *
            * If both zone_ref and zone_cred_ref are 0 after the decrement, the zone
            * is freed via zone_free() and z must not be used afterwards.  Otherwise,
            * if the zone is now ZONE_IS_UNREF() and at least ZONE_IS_DEAD,
            * zone_destroy_cv is broadcast under zonehash_lock.
2801 2808   */
2802 2809  static void
2803 2810  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2804 2811  {
2805 2812          boolean_t wakeup;
2806 2813  
2807 2814          mutex_enter(&z->zone_lock);
2808 2815          ASSERT(z->zone_ref != 0);
2809 2816          z->zone_ref--;
                   /* tracked reference: drop the per-subsystem count and unlink ref */
2810 2817          if (subsys != ZONE_REF_NUM_SUBSYS) {
2811 2818                  ASSERT(z->zone_subsys_ref[subsys] != 0);
2812 2819                  z->zone_subsys_ref[subsys]--;
2813 2820                  list_remove(&z->zone_ref_list, ref);
2814 2821          }
2815 2822          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2816 2823                  /* no more refs, free the structure */
2817 2824                  mutex_exit(&z->zone_lock);
2818 2825                  zone_free(z);
2819 2826                  return;
2820 2827          }
2821 2828          /* signal zone_destroy so the zone can finish halting */
2822 2829          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2823 2830          mutex_exit(&z->zone_lock);
2824 2831  
2825 2832          if (wakeup) {
2826 2833                  /*
2827 2834                   * Grabbing zonehash_lock here effectively synchronizes with
2828 2835                   * zone_destroy() to avoid missed signals.
2829 2836                   */
2830 2837                  mutex_enter(&zonehash_lock);
2831 2838                  cv_broadcast(&zone_destroy_cv);
2832 2839                  mutex_exit(&zonehash_lock);
2833 2840          }
2834 2841  }
2835 2842  
2836 2843  /*
2837 2844   * Decrement the specified zone's reference count.  The specified zone will
2838 2845   * cease to exist after this function returns if the reference count drops to
2839 2846   * zero.  This function should be paired with zone_hold().
            *
            * Implemented as zone_rele_common() with no zone_ref_t and subsys set to
            * ZONE_REF_NUM_SUBSYS, so no per-subsystem counter is touched.
2840 2847   */
2841 2848  void
2842 2849  zone_rele(zone_t *z)
2843 2850  {
2844 2851          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
2845 2852  }
2846 2853  
2847 2854  /*
2848 2855   * Initialize a zone reference structure.  This function must be invoked for
2849 2856   * a reference structure before the structure is passed to zone_hold_ref().
            *
            * Sets zref_zone to NULL and initializes the list linkage.
2850 2857   */
2851 2858  void
2852 2859  zone_init_ref(zone_ref_t *ref)
2853 2860  {
2854 2861          ref->zref_zone = NULL;
2855 2862          list_link_init(&ref->zref_linkage);
2856 2863  }
2857 2864  
2858 2865  /*
2859 2866   * Acquire a reference to zone z.  The caller must specify the
2860 2867   * zone_ref_subsys_t constant associated with its subsystem.  The specified
2861 2868   * zone_ref_t structure will represent a reference to the specified zone.  Use
2862 2869   * zone_rele_ref() to release the reference.
2863 2870   *
2864 2871   * The referenced zone_t structure will not be freed as long as the zone_t's
2865 2872   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
2866 2873   * references.
2867 2874   *
2868 2875   * NOTE: The zone_ref_t structure must be initialized before it is used.
2869 2876   * See zone_init_ref() above.
            *
            * subsys must be in the range [0, ZONE_REF_NUM_SUBSYS); this is ASSERTed.
            * Under z->zone_lock the zone's overall and per-subsystem counts are
            * incremented and ref is inserted at the head of the zone's reference
            * list.
2870 2877   */
2871 2878  void
2872 2879  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
2873 2880  {
2874 2881          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
2875 2882  
2876 2883          /*
2877 2884           * Prevent consumers from reusing a reference structure before
2878 2885           * releasing it.
2879 2886           */
2880 2887          VERIFY(ref->zref_zone == NULL);
2881 2888  
2882 2889          ref->zref_zone = z;
2883 2890          mutex_enter(&z->zone_lock);
2884 2891          zone_hold_locked(z);
2885 2892          z->zone_subsys_ref[subsys]++;
2886 2893          ASSERT(z->zone_subsys_ref[subsys] != 0);
2887 2894          list_insert_head(&z->zone_ref_list, ref);
2888 2895          mutex_exit(&z->zone_lock);
2889 2896  }
2890 2897  
2891 2898  /*
2892 2899   * Release the zone reference represented by the specified zone_ref_t.
2893 2900   * The reference is invalid after it's released; however, the zone_ref_t
2894 2901   * structure can be reused without having to invoke zone_init_ref().
2895 2902   * subsys should be the same value that was passed to zone_hold_ref()
2896 2903   * when the reference was acquired.
2897 2904   */
2898 2905  void
2899 2906  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
2900 2907  {
2901 2908          zone_rele_common(ref->zref_zone, ref, subsys);
2902 2909  
2903 2910          /*
2904 2911           * Set the zone_ref_t's zref_zone field to NULL to generate panics
2905 2912           * when consumers dereference the reference.  This helps us catch
2906 2913           * consumers who use released references.  Furthermore, this lets
2907 2914           * consumers reuse the zone_ref_t structure without having to
2908 2915           * invoke zone_init_ref().
2909 2916           */
2910 2917          ref->zref_zone = NULL;
2911 2918  }
2912 2919  
2913 2920  void
2914 2921  zone_cred_hold(zone_t *z)
2915 2922  {
2916 2923          mutex_enter(&z->zone_lock);
2917 2924          z->zone_cred_ref++;
2918 2925          ASSERT(z->zone_cred_ref != 0);
2919 2926          mutex_exit(&z->zone_lock);
2920 2927  }
2921 2928  
2922 2929  void
2923 2930  zone_cred_rele(zone_t *z)
2924 2931  {
2925 2932          boolean_t wakeup;
2926 2933  
2927 2934          mutex_enter(&z->zone_lock);
2928 2935          ASSERT(z->zone_cred_ref != 0);
2929 2936          z->zone_cred_ref--;
2930 2937          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2931 2938                  /* no more refs, free the structure */
2932 2939                  mutex_exit(&z->zone_lock);
2933 2940                  zone_free(z);
2934 2941                  return;
2935 2942          }
2936 2943          /*
2937 2944           * If zone_destroy is waiting for the cred references to drain
2938 2945           * out, and they have, signal it.
2939 2946           */
2940 2947          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2941 2948              zone_status_get(z) >= ZONE_IS_DEAD);
2942 2949          mutex_exit(&z->zone_lock);
2943 2950  
2944 2951          if (wakeup) {
2945 2952                  /*
2946 2953                   * Grabbing zonehash_lock here effectively synchronizes with
2947 2954                   * zone_destroy() to avoid missed signals.
2948 2955                   */
2949 2956                  mutex_enter(&zonehash_lock);
2950 2957                  cv_broadcast(&zone_destroy_cv);
2951 2958                  mutex_exit(&zonehash_lock);
2952 2959          }
2953 2960  }
2954 2961  
2955 2962  void
2956 2963  zone_task_hold(zone_t *z)
2957 2964  {
2958 2965          mutex_enter(&z->zone_lock);
2959 2966          z->zone_ntasks++;
2960 2967          ASSERT(z->zone_ntasks != 0);
2961 2968          mutex_exit(&z->zone_lock);
2962 2969  }
2963 2970  
2964 2971  void
2965 2972  zone_task_rele(zone_t *zone)
2966 2973  {
2967 2974          uint_t refcnt;
2968 2975  
2969 2976          mutex_enter(&zone->zone_lock);
2970 2977          ASSERT(zone->zone_ntasks != 0);
2971 2978          refcnt = --zone->zone_ntasks;
2972 2979          if (refcnt > 1) {       /* Common case */
2973 2980                  mutex_exit(&zone->zone_lock);
2974 2981                  return;
2975 2982          }
2976 2983          zone_hold_locked(zone); /* so we can use the zone_t later */
2977 2984          mutex_exit(&zone->zone_lock);
2978 2985          if (refcnt == 1) {
2979 2986                  /*
2980 2987                   * See if the zone is shutting down.
2981 2988                   */
2982 2989                  mutex_enter(&zone_status_lock);
2983 2990                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
2984 2991                          goto out;
2985 2992                  }
2986 2993  
2987 2994                  /*
2988 2995                   * Make sure the ntasks didn't change since we
2989 2996                   * dropped zone_lock.
2990 2997                   */
2991 2998                  mutex_enter(&zone->zone_lock);
2992 2999                  if (refcnt != zone->zone_ntasks) {
2993 3000                          mutex_exit(&zone->zone_lock);
2994 3001                          goto out;
2995 3002                  }
2996 3003                  mutex_exit(&zone->zone_lock);
2997 3004  
2998 3005                  /*
2999 3006                   * No more user processes in the zone.  The zone is empty.
3000 3007                   */
3001 3008                  zone_status_set(zone, ZONE_IS_EMPTY);
3002 3009                  goto out;
3003 3010          }
3004 3011  
3005 3012          ASSERT(refcnt == 0);
3006 3013          /*
3007 3014           * zsched has exited; the zone is dead.
3008 3015           */
3009 3016          zone->zone_zsched = NULL;               /* paranoia */
3010 3017          mutex_enter(&zone_status_lock);
3011 3018          zone_status_set(zone, ZONE_IS_DEAD);
3012 3019  out:
3013 3020          mutex_exit(&zone_status_lock);
3014 3021          zone_rele(zone);
3015 3022  }
3016 3023  
3017 3024  zoneid_t
3018 3025  getzoneid(void)
3019 3026  {
3020 3027          return (curproc->p_zone->zone_id);
3021 3028  }
3022 3029  
3023 3030  /*
3024 3031   * Internal versions of zone_find_by_*().  These don't zone_hold() or
3025 3032   * check the validity of a zone's state.
3026 3033   */
3027 3034  static zone_t *
3028 3035  zone_find_all_by_id(zoneid_t zoneid)
3029 3036  {
3030 3037          mod_hash_val_t hv;
3031 3038          zone_t *zone = NULL;
3032 3039  
3033 3040          ASSERT(MUTEX_HELD(&zonehash_lock));
3034 3041  
3035 3042          if (mod_hash_find(zonehashbyid,
3036 3043              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3037 3044                  zone = (zone_t *)hv;
3038 3045          return (zone);
3039 3046  }
3040 3047  
3041 3048  static zone_t *
3042 3049  zone_find_all_by_label(const ts_label_t *label)
3043 3050  {
3044 3051          mod_hash_val_t hv;
3045 3052          zone_t *zone = NULL;
3046 3053  
3047 3054          ASSERT(MUTEX_HELD(&zonehash_lock));
3048 3055  
3049 3056          /*
3050 3057           * zonehashbylabel is not maintained for unlabeled systems
3051 3058           */
3052 3059          if (!is_system_labeled())
3053 3060                  return (NULL);
3054 3061          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3055 3062                  zone = (zone_t *)hv;
3056 3063          return (zone);
3057 3064  }
3058 3065  
3059 3066  static zone_t *
3060 3067  zone_find_all_by_name(char *name)
3061 3068  {
3062 3069          mod_hash_val_t hv;
3063 3070          zone_t *zone = NULL;
3064 3071  
3065 3072          ASSERT(MUTEX_HELD(&zonehash_lock));
3066 3073  
3067 3074          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3068 3075                  zone = (zone_t *)hv;
3069 3076          return (zone);
3070 3077  }
3071 3078  
3072 3079  /*
3073 3080   * Public interface for looking up a zone by zoneid.  Only returns the zone if
3074 3081   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3075 3082   * Caller must call zone_rele() once it is done with the zone.
3076 3083   *
3077 3084   * The zone may begin the zone_destroy() sequence immediately after this
3078 3085   * function returns, but may be safely used until zone_rele() is called.
3079 3086   */
3080 3087  zone_t *
3081 3088  zone_find_by_id(zoneid_t zoneid)
3082 3089  {
3083 3090          zone_t *zone;
3084 3091          zone_status_t status;
3085 3092  
3086 3093          mutex_enter(&zonehash_lock);
3087 3094          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3088 3095                  mutex_exit(&zonehash_lock);
3089 3096                  return (NULL);
3090 3097          }
3091 3098          status = zone_status_get(zone);
3092 3099          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3093 3100                  /*
3094 3101                   * For all practical purposes the zone doesn't exist.
3095 3102                   */
3096 3103                  mutex_exit(&zonehash_lock);
3097 3104                  return (NULL);
3098 3105          }
3099 3106          zone_hold(zone);
3100 3107          mutex_exit(&zonehash_lock);
3101 3108          return (zone);
3102 3109  }
3103 3110  
3104 3111  /*
3105 3112   * Similar to zone_find_by_id, but using zone label as the key.
3106 3113   */
3107 3114  zone_t *
3108 3115  zone_find_by_label(const ts_label_t *label)
3109 3116  {
3110 3117          zone_t *zone;
3111 3118          zone_status_t status;
3112 3119  
3113 3120          mutex_enter(&zonehash_lock);
3114 3121          if ((zone = zone_find_all_by_label(label)) == NULL) {
3115 3122                  mutex_exit(&zonehash_lock);
3116 3123                  return (NULL);
3117 3124          }
3118 3125  
3119 3126          status = zone_status_get(zone);
3120 3127          if (status > ZONE_IS_DOWN) {
3121 3128                  /*
3122 3129                   * For all practical purposes the zone doesn't exist.
3123 3130                   */
3124 3131                  mutex_exit(&zonehash_lock);
3125 3132                  return (NULL);
3126 3133          }
3127 3134          zone_hold(zone);
3128 3135          mutex_exit(&zonehash_lock);
3129 3136          return (zone);
3130 3137  }
3131 3138  
3132 3139  /*
3133 3140   * Similar to zone_find_by_id, but using zone name as the key.
3134 3141   */
3135 3142  zone_t *
3136 3143  zone_find_by_name(char *name)
3137 3144  {
3138 3145          zone_t *zone;
3139 3146          zone_status_t status;
3140 3147  
3141 3148          mutex_enter(&zonehash_lock);
3142 3149          if ((zone = zone_find_all_by_name(name)) == NULL) {
3143 3150                  mutex_exit(&zonehash_lock);
3144 3151                  return (NULL);
3145 3152          }
3146 3153          status = zone_status_get(zone);
3147 3154          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3148 3155                  /*
3149 3156                   * For all practical purposes the zone doesn't exist.
3150 3157                   */
3151 3158                  mutex_exit(&zonehash_lock);
3152 3159                  return (NULL);
3153 3160          }
3154 3161          zone_hold(zone);
3155 3162          mutex_exit(&zonehash_lock);
3156 3163          return (zone);
3157 3164  }
3158 3165  
3159 3166  /*
3160 3167   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3161 3168   * if there is a zone "foo" rooted at /foo/root, and the path argument
3162 3169   * is "/foo/root/proc", it will return the held zone_t corresponding to
3163 3170   * zone "foo".
3164 3171   *
3165 3172   * zone_find_by_path() always returns a non-NULL value, since at the
3166 3173   * very least every path will be contained in the global zone.
3167 3174   *
3168 3175   * As with the other zone_find_by_*() functions, the caller is
3169 3176   * responsible for zone_rele()ing the return value of this function.
3170 3177   */
3171 3178  zone_t *
3172 3179  zone_find_by_path(const char *path)
3173 3180  {
3174 3181          zone_t *zone;
3175 3182          zone_t *zret = NULL;
3176 3183          zone_status_t status;
3177 3184  
3178 3185          if (path == NULL) {
3179 3186                  /*
3180 3187                   * Call from rootconf().
3181 3188                   */
3182 3189                  zone_hold(global_zone);
3183 3190                  return (global_zone);
3184 3191          }
3185 3192          ASSERT(*path == '/');
3186 3193          mutex_enter(&zonehash_lock);
3187 3194          for (zone = list_head(&zone_active); zone != NULL;
3188 3195              zone = list_next(&zone_active, zone)) {
3189 3196                  if (ZONE_PATH_VISIBLE(path, zone))
3190 3197                          zret = zone;
3191 3198          }
3192 3199          ASSERT(zret != NULL);
3193 3200          status = zone_status_get(zret);
3194 3201          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3195 3202                  /*
3196 3203                   * Zone practically doesn't exist.
3197 3204                   */
3198 3205                  zret = global_zone;
3199 3206          }
3200 3207          zone_hold(zret);
3201 3208          mutex_exit(&zonehash_lock);
3202 3209          return (zret);
3203 3210  }
3204 3211  
3205 3212  /*
3206 3213   * Public interface for updating per-zone load averages.  Called once per
3207 3214   * second.
3208 3215   *
3209 3216   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3210 3217   */
3211 3218  void
3212 3219  zone_loadavg_update(void)
3213 3220  {
3214 3221          zone_t *zp;
3215 3222          zone_status_t status;
3216 3223          struct loadavg_s *lavg;
3217 3224          hrtime_t zone_total;
3218 3225          uint64_t tmp;
3219 3226          int i;
3220 3227          hrtime_t hr_avg;
3221 3228          int nrun;
3222 3229          static int64_t f[3] = { 135, 27, 9 };
3223 3230          int64_t q, r;
3224 3231  
3225 3232          mutex_enter(&zonehash_lock);
3226 3233          for (zp = list_head(&zone_active); zp != NULL;
3227 3234              zp = list_next(&zone_active, zp)) {
3228 3235                  mutex_enter(&zp->zone_lock);
3229 3236  
3230 3237                  /* Skip zones that are on the way down or not yet up */
3231 3238                  status = zone_status_get(zp);
3232 3239                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3233 3240                          /* For all practical purposes the zone doesn't exist. */
3234 3241                          mutex_exit(&zp->zone_lock);
3235 3242                          continue;
3236 3243                  }
3237 3244  
3238 3245                  /*
3239 3246                   * Update the 10 second moving average data in zone_loadavg.
3240 3247                   */
3241 3248                  lavg = &zp->zone_loadavg;
3242 3249  
3243 3250                  tmp = cpu_uarray_sum_all(zp->zone_ustate);
3244 3251                  zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3245 3252  
3246 3253                  scalehrtime(&zone_total);
3247 3254  
3248 3255                  /* The zone_total should always be increasing. */
3249 3256                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3250 3257                      zone_total - lavg->lg_total : 0;
3251 3258                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3252 3259                  /* lg_total holds the prev. 1 sec. total */
3253 3260                  lavg->lg_total = zone_total;
3254 3261  
3255 3262                  /*
3256 3263                   * To simplify the calculation, we don't calculate the load avg.
3257 3264                   * until the zone has been up for at least 10 seconds and our
3258 3265                   * moving average is thus full.
3259 3266                   */
3260 3267                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3261 3268                          lavg->lg_len++;
3262 3269                          mutex_exit(&zp->zone_lock);
3263 3270                          continue;
3264 3271                  }
3265 3272  
3266 3273                  /* Now calculate the 1min, 5min, 15 min load avg. */
3267 3274                  hr_avg = 0;
3268 3275                  for (i = 0; i < S_LOADAVG_SZ; i++)
3269 3276                          hr_avg += lavg->lg_loads[i];
3270 3277                  hr_avg = hr_avg / S_LOADAVG_SZ;
3271 3278                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3272 3279  
3273 3280                  /* Compute load avg. See comment in calcloadavg() */
3274 3281                  for (i = 0; i < 3; i++) {
3275 3282                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3276 3283                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3277 3284                          zp->zone_hp_avenrun[i] +=
3278 3285                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3279 3286  
3280 3287                          /* avenrun[] can only hold 31 bits of load avg. */
3281 3288                          if (zp->zone_hp_avenrun[i] <
3282 3289                              ((uint64_t)1<<(31+16-FSHIFT)))
3283 3290                                  zp->zone_avenrun[i] = (int32_t)
3284 3291                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3285 3292                          else
3286 3293                                  zp->zone_avenrun[i] = 0x7fffffff;
3287 3294                  }
3288 3295  
3289 3296                  mutex_exit(&zp->zone_lock);
3290 3297          }
3291 3298          mutex_exit(&zonehash_lock);
3292 3299  }
3293 3300  
3294 3301  /*
3295 3302   * Get the number of cpus visible to this zone.  The system-wide global
3296 3303   * 'ncpus' is returned if pools are disabled, the caller is in the
3297 3304   * global zone, or a NULL zone argument is passed in.
3298 3305   */
3299 3306  int
3300 3307  zone_ncpus_get(zone_t *zone)
3301 3308  {
3302 3309          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3303 3310  
3304 3311          return (myncpus != 0 ? myncpus : ncpus);
3305 3312  }
3306 3313  
3307 3314  /*
3308 3315   * Get the number of online cpus visible to this zone.  The system-wide
3309 3316   * global 'ncpus_online' is returned if pools are disabled, the caller
3310 3317   * is in the global zone, or a NULL zone argument is passed in.
3311 3318   */
3312 3319  int
3313 3320  zone_ncpus_online_get(zone_t *zone)
3314 3321  {
3315 3322          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3316 3323  
3317 3324          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3318 3325  }
3319 3326  
3320 3327  /*
3321 3328   * Return the pool to which the zone is currently bound.
3322 3329   */
3323 3330  pool_t *
3324 3331  zone_pool_get(zone_t *zone)
3325 3332  {
3326 3333          ASSERT(pool_lock_held());
3327 3334  
3328 3335          return (zone->zone_pool);
3329 3336  }
3330 3337  
3331 3338  /*
3332 3339   * Set the zone's pool pointer and update the zone's visibility to match
3333 3340   * the resources in the new pool.
3334 3341   */
3335 3342  void
3336 3343  zone_pool_set(zone_t *zone, pool_t *pool)
3337 3344  {
3338 3345          ASSERT(pool_lock_held());
3339 3346          ASSERT(MUTEX_HELD(&cpu_lock));
3340 3347  
3341 3348          zone->zone_pool = pool;
3342 3349          zone_pset_set(zone, pool->pool_pset->pset_id);
3343 3350  }
3344 3351  
3345 3352  /*
3346 3353   * Return the cached value of the id of the processor set to which the
3347 3354   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3348 3355   * facility is disabled.
3349 3356   */
3350 3357  psetid_t
3351 3358  zone_pset_get(zone_t *zone)
3352 3359  {
3353 3360          ASSERT(MUTEX_HELD(&cpu_lock));
3354 3361  
3355 3362          return (zone->zone_psetid);
3356 3363  }
3357 3364  
3358 3365  /*
3359 3366   * Set the cached value of the id of the processor set to which the zone
3360 3367   * is currently bound.  Also update the zone's visibility to match the
3361 3368   * resources in the new processor set.
3362 3369   */
3363 3370  void
3364 3371  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3365 3372  {
3366 3373          psetid_t oldpsetid;
3367 3374  
3368 3375          ASSERT(MUTEX_HELD(&cpu_lock));
3369 3376          oldpsetid = zone_pset_get(zone);
3370 3377  
3371 3378          if (oldpsetid == newpsetid)
3372 3379                  return;
3373 3380          /*
3374 3381           * Global zone sees all.
3375 3382           */
3376 3383          if (zone != global_zone) {
3377 3384                  zone->zone_psetid = newpsetid;
3378 3385                  if (newpsetid != ZONE_PS_INVAL)
3379 3386                          pool_pset_visibility_add(newpsetid, zone);
3380 3387                  if (oldpsetid != ZONE_PS_INVAL)
3381 3388                          pool_pset_visibility_remove(oldpsetid, zone);
3382 3389          }
3383 3390          /*
3384 3391           * Disabling pools, so we should start using the global values
3385 3392           * for ncpus and ncpus_online.
3386 3393           */
3387 3394          if (newpsetid == ZONE_PS_INVAL) {
3388 3395                  zone->zone_ncpus = 0;
3389 3396                  zone->zone_ncpus_online = 0;
3390 3397          }
3391 3398  }
3392 3399  
3393 3400  /*
3394 3401   * Walk the list of active zones and issue the provided callback for
3395 3402   * each of them.
3396 3403   *
3397 3404   * Caller must not be holding any locks that may be acquired under
3398 3405   * zonehash_lock.  See comment at the beginning of the file for a list of
3399 3406   * common locks and their interactions with zones.
3400 3407   */
3401 3408  int
3402 3409  zone_walk(int (*cb)(zone_t *, void *), void *data)
3403 3410  {
3404 3411          zone_t *zone;
3405 3412          int ret = 0;
3406 3413          zone_status_t status;
3407 3414  
3408 3415          mutex_enter(&zonehash_lock);
3409 3416          for (zone = list_head(&zone_active); zone != NULL;
3410 3417              zone = list_next(&zone_active, zone)) {
3411 3418                  /*
3412 3419                   * Skip zones that shouldn't be externally visible.
3413 3420                   */
3414 3421                  status = zone_status_get(zone);
3415 3422                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
3416 3423                          continue;
3417 3424                  /*
3418 3425                   * Bail immediately if any callback invocation returns a
3419 3426                   * non-zero value.
3420 3427                   */
3421 3428                  ret = (*cb)(zone, data);
3422 3429                  if (ret != 0)
3423 3430                          break;
3424 3431          }
3425 3432          mutex_exit(&zonehash_lock);
3426 3433          return (ret);
3427 3434  }
3428 3435  
3429 3436  static int
3430 3437  zone_set_root(zone_t *zone, const char *upath)
3431 3438  {
3432 3439          vnode_t *vp;
3433 3440          int trycount;
3434 3441          int error = 0;
3435 3442          char *path;
3436 3443          struct pathname upn, pn;
3437 3444          size_t pathlen;
3438 3445  
3439 3446          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
3440 3447                  return (error);
3441 3448  
3442 3449          pn_alloc(&pn);
3443 3450  
3444 3451          /* prevent infinite loop */
3445 3452          trycount = 10;
3446 3453          for (;;) {
3447 3454                  if (--trycount <= 0) {
3448 3455                          error = ESTALE;
3449 3456                          goto out;
3450 3457                  }
3451 3458  
3452 3459                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
3453 3460                          /*
3454 3461                           * VOP_ACCESS() may cover 'vp' with a new
3455 3462                           * filesystem, if 'vp' is an autoFS vnode.
3456 3463                           * Get the new 'vp' if so.
3457 3464                           */
3458 3465                          if ((error =
3459 3466                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
3460 3467                              (!vn_ismntpt(vp) ||
3461 3468                              (error = traverse(&vp)) == 0)) {
3462 3469                                  pathlen = pn.pn_pathlen + 2;
3463 3470                                  path = kmem_alloc(pathlen, KM_SLEEP);
3464 3471                                  (void) strncpy(path, pn.pn_path,
3465 3472                                      pn.pn_pathlen + 1);
3466 3473                                  path[pathlen - 2] = '/';
3467 3474                                  path[pathlen - 1] = '\0';
3468 3475                                  pn_free(&pn);
3469 3476                                  pn_free(&upn);
3470 3477  
  
    | 
      ↓ open down ↓ | 
    1062 lines elided | 
    
      ↑ open up ↑ | 
  
3471 3478                                  /* Success! */
3472 3479                                  break;
3473 3480                          }
3474 3481                          VN_RELE(vp);
3475 3482                  }
3476 3483                  if (error != ESTALE)
3477 3484                          goto out;
3478 3485          }
3479 3486  
3480 3487          ASSERT(error == 0);
     3488 +        mutex_enter(&vp->v_lock);
     3489 +        if (vp->v_flag & VZONEROOT) {
     3490 +                /* Wow, someone's already using this zone root! */
     3491 +                error = EEXIST; /* XXX KEBE ASKS, better errno? */
     3492 +                mutex_exit(&vp->v_lock);
     3493 +                VN_RELE(vp);
     3494 +                goto out;
     3495 +        }
     3496 +        vp->v_flag |= VZONEROOT;
     3497 +        mutex_exit(&vp->v_lock);
3481 3498          zone->zone_rootvp = vp;         /* we hold a reference to vp */
3482 3499          zone->zone_rootpath = path;
3483 3500          zone->zone_rootpathlen = pathlen;
3484 3501          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
3485 3502                  zone->zone_flags |= ZF_IS_SCRATCH;
3486 3503          return (0);
3487 3504  
3488 3505  out:
3489 3506          pn_free(&pn);
3490 3507          pn_free(&upn);
3491 3508          return (error);
3492 3509  }
3493 3510  
/*
 * ASCII-only test for a letter or a decimal digit.  The argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define	isalnum(c)	\
	(((c) >= '0' && (c) <= '9') || \
	((c) >= 'a' && (c) <= 'z') || \
	((c) >= 'A' && (c) <= 'Z'))
3497 3514  
3498 3515  static int
3499 3516  zone_set_name(zone_t *zone, const char *uname)
3500 3517  {
3501 3518          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3502 3519          size_t len;
3503 3520          int i, err;
3504 3521  
3505 3522          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3506 3523                  kmem_free(kname, ZONENAME_MAX);
3507 3524                  return (err);   /* EFAULT or ENAMETOOLONG */
3508 3525          }
3509 3526  
3510 3527          /* must be less than ZONENAME_MAX */
3511 3528          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3512 3529                  kmem_free(kname, ZONENAME_MAX);
3513 3530                  return (EINVAL);
3514 3531          }
3515 3532  
3516 3533          /*
3517 3534           * Name must start with an alphanumeric and must contain only
3518 3535           * alphanumerics, '-', '_' and '.'.
3519 3536           */
3520 3537          if (!isalnum(kname[0])) {
3521 3538                  kmem_free(kname, ZONENAME_MAX);
3522 3539                  return (EINVAL);
3523 3540          }
3524 3541          for (i = 1; i < len - 1; i++) {
3525 3542                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3526 3543                      kname[i] != '.') {
3527 3544                          kmem_free(kname, ZONENAME_MAX);
3528 3545                          return (EINVAL);
3529 3546                  }
3530 3547          }
3531 3548  
3532 3549          zone->zone_name = kname;
3533 3550          return (0);
3534 3551  }
3535 3552  
3536 3553  /*
3537 3554   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3538 3555   * is NULL or it points to a zone with no hostid emulation, then the machine's
3539 3556   * hostid (i.e., the global zone's hostid) is returned.  This function returns
3540 3557   * zero if neither the zone nor the host machine (global zone) have hostids.  It
3541 3558   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3542 3559   * hostid and the machine's hostid is invalid.
3543 3560   */
3544 3561  uint32_t
3545 3562  zone_get_hostid(zone_t *zonep)
3546 3563  {
3547 3564          unsigned long machine_hostid;
3548 3565  
3549 3566          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3550 3567                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3551 3568                          return (HW_INVALID_HOSTID);
3552 3569                  return ((uint32_t)machine_hostid);
3553 3570          }
3554 3571          return (zonep->zone_hostid);
3555 3572  }
3556 3573  
3557 3574  /*
3558 3575   * Similar to thread_create(), but makes sure the thread is in the appropriate
3559 3576   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
3560 3577   */
3561 3578  /*ARGSUSED*/
3562 3579  kthread_t *
3563 3580  zthread_create(
3564 3581      caddr_t stk,
3565 3582      size_t stksize,
3566 3583      void (*proc)(),
3567 3584      void *arg,
3568 3585      size_t len,
3569 3586      pri_t pri)
3570 3587  {
3571 3588          kthread_t *t;
3572 3589          zone_t *zone = curproc->p_zone;
3573 3590          proc_t *pp = zone->zone_zsched;
3574 3591  
3575 3592          zone_hold(zone);        /* Reference to be dropped when thread exits */
3576 3593  
3577 3594          /*
3578 3595           * No-one should be trying to create threads if the zone is shutting
3579 3596           * down and there aren't any kernel threads around.  See comment
3580 3597           * in zthread_exit().
3581 3598           */
3582 3599          ASSERT(!(zone->zone_kthreads == NULL &&
3583 3600              zone_status_get(zone) >= ZONE_IS_EMPTY));
3584 3601          /*
3585 3602           * Create a thread, but don't let it run until we've finished setting
3586 3603           * things up.
3587 3604           */
3588 3605          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
3589 3606          ASSERT(t->t_forw == NULL);
3590 3607          mutex_enter(&zone_status_lock);
3591 3608          if (zone->zone_kthreads == NULL) {
3592 3609                  t->t_forw = t->t_back = t;
3593 3610          } else {
3594 3611                  kthread_t *tx = zone->zone_kthreads;
3595 3612  
3596 3613                  t->t_forw = tx;
3597 3614                  t->t_back = tx->t_back;
3598 3615                  tx->t_back->t_forw = t;
3599 3616                  tx->t_back = t;
3600 3617          }
3601 3618          zone->zone_kthreads = t;
3602 3619          mutex_exit(&zone_status_lock);
3603 3620  
3604 3621          mutex_enter(&pp->p_lock);
3605 3622          t->t_proc_flag |= TP_ZTHREAD;
3606 3623          project_rele(t->t_proj);
3607 3624          t->t_proj = project_hold(pp->p_task->tk_proj);
3608 3625  
3609 3626          /*
3610 3627           * Setup complete, let it run.
3611 3628           */
3612 3629          thread_lock(t);
3613 3630          t->t_schedflag |= TS_ALLSTART;
3614 3631          setrun_locked(t);
3615 3632          thread_unlock(t);
3616 3633  
3617 3634          mutex_exit(&pp->p_lock);
3618 3635  
3619 3636          return (t);
3620 3637  }
3621 3638  
3622 3639  /*
3623 3640   * Similar to thread_exit().  Must be called by threads created via
3624 3641   * zthread_exit().
3625 3642   */
3626 3643  void
3627 3644  zthread_exit(void)
3628 3645  {
3629 3646          kthread_t *t = curthread;
3630 3647          proc_t *pp = curproc;
3631 3648          zone_t *zone = pp->p_zone;
3632 3649  
3633 3650          mutex_enter(&zone_status_lock);
3634 3651  
3635 3652          /*
3636 3653           * Reparent to p0
3637 3654           */
3638 3655          kpreempt_disable();
3639 3656          mutex_enter(&pp->p_lock);
3640 3657          t->t_proc_flag &= ~TP_ZTHREAD;
3641 3658          t->t_procp = &p0;
3642 3659          hat_thread_exit(t);
3643 3660          mutex_exit(&pp->p_lock);
3644 3661          kpreempt_enable();
3645 3662  
3646 3663          if (t->t_back == t) {
3647 3664                  ASSERT(t->t_forw == t);
3648 3665                  /*
3649 3666                   * If the zone is empty, once the thread count
3650 3667                   * goes to zero no further kernel threads can be
3651 3668                   * created.  This is because if the creator is a process
3652 3669                   * in the zone, then it must have exited before the zone
3653 3670                   * state could be set to ZONE_IS_EMPTY.
3654 3671                   * Otherwise, if the creator is a kernel thread in the
3655 3672                   * zone, the thread count is non-zero.
3656 3673                   *
3657 3674                   * This really means that non-zone kernel threads should
3658 3675                   * not create zone kernel threads.
3659 3676                   */
3660 3677                  zone->zone_kthreads = NULL;
3661 3678                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
3662 3679                          zone_status_set(zone, ZONE_IS_DOWN);
3663 3680                          /*
3664 3681                           * Remove any CPU caps on this zone.
3665 3682                           */
3666 3683                          cpucaps_zone_remove(zone);
3667 3684                  }
3668 3685          } else {
3669 3686                  t->t_forw->t_back = t->t_back;
3670 3687                  t->t_back->t_forw = t->t_forw;
3671 3688                  if (zone->zone_kthreads == t)
3672 3689                          zone->zone_kthreads = t->t_forw;
3673 3690          }
3674 3691          mutex_exit(&zone_status_lock);
3675 3692          zone_rele(zone);
3676 3693          thread_exit();
3677 3694          /* NOTREACHED */
3678 3695  }
3679 3696  
3680 3697  static void
3681 3698  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3682 3699  {
3683 3700          vnode_t *oldvp;
3684 3701  
3685 3702          /* we're going to hold a reference here to the directory */
3686 3703          VN_HOLD(vp);
3687 3704  
3688 3705          /* update abs cwd/root path see c2/audit.c */
3689 3706          if (AU_AUDITING())
3690 3707                  audit_chdirec(vp, vpp);
3691 3708  
3692 3709          mutex_enter(&pp->p_lock);
3693 3710          oldvp = *vpp;
3694 3711          *vpp = vp;
3695 3712          mutex_exit(&pp->p_lock);
3696 3713          if (oldvp != NULL)
3697 3714                  VN_RELE(oldvp);
3698 3715  }
3699 3716  
3700 3717  /*
3701 3718   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3702 3719   */
3703 3720  static int
3704 3721  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3705 3722  {
3706 3723          nvpair_t *nvp = NULL;
3707 3724          boolean_t priv_set = B_FALSE;
3708 3725          boolean_t limit_set = B_FALSE;
3709 3726          boolean_t action_set = B_FALSE;
3710 3727  
3711 3728          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3712 3729                  const char *name;
3713 3730                  uint64_t ui64;
3714 3731  
3715 3732                  name = nvpair_name(nvp);
3716 3733                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3717 3734                          return (EINVAL);
3718 3735                  (void) nvpair_value_uint64(nvp, &ui64);
3719 3736                  if (strcmp(name, "privilege") == 0) {
3720 3737                          /*
3721 3738                           * Currently only privileged values are allowed, but
3722 3739                           * this may change in the future.
3723 3740                           */
3724 3741                          if (ui64 != RCPRIV_PRIVILEGED)
3725 3742                                  return (EINVAL);
3726 3743                          rv->rcv_privilege = ui64;
3727 3744                          priv_set = B_TRUE;
3728 3745                  } else if (strcmp(name, "limit") == 0) {
3729 3746                          rv->rcv_value = ui64;
3730 3747                          limit_set = B_TRUE;
3731 3748                  } else if (strcmp(name, "action") == 0) {
3732 3749                          if (ui64 != RCTL_LOCAL_NOACTION &&
3733 3750                              ui64 != RCTL_LOCAL_DENY)
3734 3751                                  return (EINVAL);
3735 3752                          rv->rcv_flagaction = ui64;
3736 3753                          action_set = B_TRUE;
3737 3754                  } else {
3738 3755                          return (EINVAL);
3739 3756                  }
3740 3757          }
3741 3758  
3742 3759          if (!(priv_set && limit_set && action_set))
3743 3760                  return (EINVAL);
3744 3761          rv->rcv_action_signal = 0;
3745 3762          rv->rcv_action_recipient = NULL;
3746 3763          rv->rcv_action_recip_pid = -1;
3747 3764          rv->rcv_firing_time = 0;
3748 3765  
3749 3766          return (0);
3750 3767  }
3751 3768  
3752 3769  /*
3753 3770   * Non-global zone version of start_init.
3754 3771   */
3755 3772  void
3756 3773  zone_start_init(void)
3757 3774  {
3758 3775          proc_t *p = ttoproc(curthread);
3759 3776          zone_t *z = p->p_zone;
3760 3777  
3761 3778          ASSERT(!INGLOBALZONE(curproc));
3762 3779  
3763 3780          /*
3764 3781           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3765 3782           * storing just the pid of init is sufficient.
3766 3783           */
3767 3784          z->zone_proc_initpid = p->p_pid;
3768 3785  
3769 3786          /*
3770 3787           * We maintain zone_boot_err so that we can return the cause of the
3771 3788           * failure back to the caller of the zone_boot syscall.
3772 3789           */
3773 3790          p->p_zone->zone_boot_err = start_init_common();
3774 3791  
3775 3792          /*
3776 3793           * We will prevent booting zones from becoming running zones if the
3777 3794           * global zone is shutting down.
3778 3795           */
3779 3796          mutex_enter(&zone_status_lock);
3780 3797          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3781 3798              ZONE_IS_SHUTTING_DOWN) {
3782 3799                  /*
3783 3800                   * Make sure we are still in the booting state-- we could have
3784 3801                   * raced and already be shutting down, or even further along.
3785 3802                   */
3786 3803                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
3787 3804                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3788 3805                  }
3789 3806                  mutex_exit(&zone_status_lock);
3790 3807                  /* It's gone bad, dispose of the process */
3791 3808                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3792 3809                          mutex_enter(&p->p_lock);
3793 3810                          ASSERT(p->p_flag & SEXITLWPS);
3794 3811                          lwp_exit();
3795 3812                  }
3796 3813          } else {
3797 3814                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3798 3815                          zone_status_set(z, ZONE_IS_RUNNING);
3799 3816                  mutex_exit(&zone_status_lock);
3800 3817                  /* cause the process to return to userland. */
3801 3818                  lwp_rtt();
3802 3819          }
3803 3820  }
3804 3821  
3805 3822  struct zsched_arg {
3806 3823          zone_t *zone;
3807 3824          nvlist_t *nvlist;
3808 3825  };
3809 3826  
3810 3827  /*
3811 3828   * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3812 3829   * anything to do with scheduling, but rather with the fact that
3813 3830   * per-zone kernel threads are parented to zsched, just like regular
3814 3831   * kernel threads are parented to sched (p0).
3815 3832   *
3816 3833   * zsched is also responsible for launching init for the zone.
3817 3834   */
3818 3835  static void
3819 3836  zsched(void *arg)
3820 3837  {
3821 3838          struct zsched_arg *za = arg;
3822 3839          proc_t *pp = curproc;
3823 3840          proc_t *initp = proc_init;
3824 3841          zone_t *zone = za->zone;
3825 3842          cred_t *cr, *oldcred;
3826 3843          rctl_set_t *set;
3827 3844          rctl_alloc_gp_t *gp;
3828 3845          contract_t *ct = NULL;
3829 3846          task_t *tk, *oldtk;
3830 3847          rctl_entity_p_t e;
3831 3848          kproject_t *pj;
3832 3849  
3833 3850          nvlist_t *nvl = za->nvlist;
3834 3851          nvpair_t *nvp = NULL;
3835 3852  
3836 3853          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3837 3854          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3838 3855          PTOU(pp)->u_argc = 0;
3839 3856          PTOU(pp)->u_argv = 0;
3840 3857          PTOU(pp)->u_envp = 0;
3841 3858          PTOU(pp)->u_commpagep = 0;
3842 3859          closeall(P_FINFO(pp));
3843 3860  
3844 3861          /*
3845 3862           * We are this zone's "zsched" process.  As the zone isn't generally
3846 3863           * visible yet we don't need to grab any locks before initializing its
3847 3864           * zone_proc pointer.
3848 3865           */
3849 3866          zone_hold(zone);  /* this hold is released by zone_destroy() */
3850 3867          zone->zone_zsched = pp;
3851 3868          mutex_enter(&pp->p_lock);
3852 3869          pp->p_zone = zone;
3853 3870          mutex_exit(&pp->p_lock);
3854 3871  
3855 3872          /*
3856 3873           * Disassociate process from its 'parent'; parent ourselves to init
3857 3874           * (pid 1) and change other values as needed.
3858 3875           */
3859 3876          sess_create();
3860 3877  
3861 3878          mutex_enter(&pidlock);
3862 3879          proc_detach(pp);
3863 3880          pp->p_ppid = 1;
3864 3881          pp->p_flag |= SZONETOP;
3865 3882          pp->p_ancpid = 1;
3866 3883          pp->p_parent = initp;
3867 3884          pp->p_psibling = NULL;
3868 3885          if (initp->p_child)
3869 3886                  initp->p_child->p_psibling = pp;
3870 3887          pp->p_sibling = initp->p_child;
3871 3888          initp->p_child = pp;
3872 3889  
3873 3890          /* Decrement what newproc() incremented. */
3874 3891          upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
3875 3892          /*
3876 3893           * Our credentials are about to become kcred-like, so we don't care
3877 3894           * about the caller's ruid.
3878 3895           */
3879 3896          upcount_inc(crgetruid(kcred), zone->zone_id);
3880 3897          mutex_exit(&pidlock);
3881 3898  
3882 3899          /*
3883 3900           * getting out of global zone, so decrement lwp and process counts
3884 3901           */
3885 3902          pj = pp->p_task->tk_proj;
3886 3903          mutex_enter(&global_zone->zone_nlwps_lock);
3887 3904          pj->kpj_nlwps -= pp->p_lwpcnt;
3888 3905          global_zone->zone_nlwps -= pp->p_lwpcnt;
3889 3906          pj->kpj_nprocs--;
3890 3907          global_zone->zone_nprocs--;
3891 3908          mutex_exit(&global_zone->zone_nlwps_lock);
3892 3909  
3893 3910          /*
3894 3911           * Decrement locked memory counts on old zone and project.
3895 3912           */
3896 3913          mutex_enter(&global_zone->zone_mem_lock);
3897 3914          global_zone->zone_locked_mem -= pp->p_locked_mem;
3898 3915          pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
3899 3916          mutex_exit(&global_zone->zone_mem_lock);
3900 3917  
3901 3918          /*
3902 3919           * Create and join a new task in project '0' of this zone.
3903 3920           *
3904 3921           * We don't need to call holdlwps() since we know we're the only lwp in
3905 3922           * this process.
3906 3923           *
3907 3924           * task_join() returns with p_lock held.
3908 3925           */
3909 3926          tk = task_create(0, zone);
3910 3927          mutex_enter(&cpu_lock);
3911 3928          oldtk = task_join(tk, 0);
3912 3929  
3913 3930          pj = pp->p_task->tk_proj;
3914 3931  
3915 3932          mutex_enter(&zone->zone_mem_lock);
3916 3933          zone->zone_locked_mem += pp->p_locked_mem;
3917 3934          pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
3918 3935          mutex_exit(&zone->zone_mem_lock);
3919 3936  
3920 3937          /*
3921 3938           * add lwp and process counts to zsched's zone, and increment
3922 3939           * project's task and process count due to the task created in
3923 3940           * the above task_create.
3924 3941           */
3925 3942          mutex_enter(&zone->zone_nlwps_lock);
3926 3943          pj->kpj_nlwps += pp->p_lwpcnt;
3927 3944          pj->kpj_ntasks += 1;
3928 3945          zone->zone_nlwps += pp->p_lwpcnt;
3929 3946          pj->kpj_nprocs++;
3930 3947          zone->zone_nprocs++;
3931 3948          mutex_exit(&zone->zone_nlwps_lock);
3932 3949  
3933 3950          mutex_exit(&curproc->p_lock);
3934 3951          mutex_exit(&cpu_lock);
3935 3952          task_rele(oldtk);
3936 3953  
3937 3954          /*
3938 3955           * The process was created by a process in the global zone, hence the
3939 3956           * credentials are wrong.  We might as well have kcred-ish credentials.
3940 3957           */
3941 3958          cr = zone->zone_kcred;
3942 3959          crhold(cr);
3943 3960          mutex_enter(&pp->p_crlock);
3944 3961          oldcred = pp->p_cred;
3945 3962          pp->p_cred = cr;
3946 3963          mutex_exit(&pp->p_crlock);
3947 3964          crfree(oldcred);
3948 3965  
3949 3966          /*
3950 3967           * Hold credentials again (for thread)
3951 3968           */
3952 3969          crhold(cr);
3953 3970  
3954 3971          /*
3955 3972           * p_lwpcnt can't change since this is a kernel process.
3956 3973           */
3957 3974          crset(pp, cr);
3958 3975  
3959 3976          /*
3960 3977           * Chroot
3961 3978           */
3962 3979          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
3963 3980          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
3964 3981  
3965 3982          /*
3966 3983           * Initialize zone's rctl set.
3967 3984           */
3968 3985          set = rctl_set_create();
3969 3986          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
3970 3987          mutex_enter(&pp->p_lock);
3971 3988          e.rcep_p.zone = zone;
3972 3989          e.rcep_t = RCENTITY_ZONE;
3973 3990          zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
3974 3991          mutex_exit(&pp->p_lock);
3975 3992          rctl_prealloc_destroy(gp);
3976 3993  
3977 3994          /*
3978 3995           * Apply the rctls passed in to zone_create().  This is basically a list
3979 3996           * assignment: all of the old values are removed and the new ones
3980 3997           * inserted.  That is, if an empty list is passed in, all values are
3981 3998           * removed.
3982 3999           */
3983 4000          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3984 4001                  rctl_dict_entry_t *rde;
3985 4002                  rctl_hndl_t hndl;
3986 4003                  char *name;
3987 4004                  nvlist_t **nvlarray;
3988 4005                  uint_t i, nelem;
3989 4006                  int error;      /* For ASSERT()s */
3990 4007  
3991 4008                  name = nvpair_name(nvp);
3992 4009                  hndl = rctl_hndl_lookup(name);
3993 4010                  ASSERT(hndl != -1);
3994 4011                  rde = rctl_dict_lookup_hndl(hndl);
3995 4012                  ASSERT(rde != NULL);
3996 4013  
3997 4014                  for (; /* ever */; ) {
3998 4015                          rctl_val_t oval;
3999 4016  
4000 4017                          mutex_enter(&pp->p_lock);
4001 4018                          error = rctl_local_get(hndl, NULL, &oval, pp);
4002 4019                          mutex_exit(&pp->p_lock);
4003 4020                          ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4004 4021                          ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4005 4022                          if (oval.rcv_privilege == RCPRIV_SYSTEM)
4006 4023                                  break;
4007 4024                          mutex_enter(&pp->p_lock);
4008 4025                          error = rctl_local_delete(hndl, &oval, pp);
4009 4026                          mutex_exit(&pp->p_lock);
4010 4027                          ASSERT(error == 0);
4011 4028                  }
4012 4029                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4013 4030                  ASSERT(error == 0);
4014 4031                  for (i = 0; i < nelem; i++) {
4015 4032                          rctl_val_t *nvalp;
4016 4033  
4017 4034                          nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4018 4035                          error = nvlist2rctlval(nvlarray[i], nvalp);
4019 4036                          ASSERT(error == 0);
4020 4037                          /*
4021 4038                           * rctl_local_insert can fail if the value being
4022 4039                           * inserted is a duplicate; this is OK.
4023 4040                           */
4024 4041                          mutex_enter(&pp->p_lock);
4025 4042                          if (rctl_local_insert(hndl, nvalp, pp) != 0)
4026 4043                                  kmem_cache_free(rctl_val_cache, nvalp);
4027 4044                          mutex_exit(&pp->p_lock);
4028 4045                  }
4029 4046          }
4030 4047  
4031 4048          /*
4032 4049           * Tell the world that we're done setting up.
4033 4050           *
4034 4051           * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4035 4052           * and atomically set the zone's processor set visibility.  Once
4036 4053           * we drop pool_lock() this zone will automatically get updated
4037 4054           * to reflect any future changes to the pools configuration.
4038 4055           *
4039 4056           * Note that after we drop the locks below (zonehash_lock in
4040 4057           * particular) other operations such as a zone_getattr call can
4041 4058           * now proceed and observe the zone. That is the reason for doing a
4042 4059           * state transition to the INITIALIZED state.
4043 4060           */
4044 4061          pool_lock();
4045 4062          mutex_enter(&cpu_lock);
4046 4063          mutex_enter(&zonehash_lock);
4047 4064          zone_uniqid(zone);
4048 4065          zone_zsd_configure(zone);
4049 4066          if (pool_state == POOL_ENABLED)
4050 4067                  zone_pset_set(zone, pool_default->pool_pset->pset_id);
4051 4068          mutex_enter(&zone_status_lock);
4052 4069          ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4053 4070          zone_status_set(zone, ZONE_IS_INITIALIZED);
4054 4071          mutex_exit(&zone_status_lock);
4055 4072          mutex_exit(&zonehash_lock);
4056 4073          mutex_exit(&cpu_lock);
4057 4074          pool_unlock();
4058 4075  
4059 4076          /* Now call the create callback for this key */
4060 4077          zsd_apply_all_keys(zsd_apply_create, zone);
4061 4078  
4062 4079          /* The callbacks are complete. Mark ZONE_IS_READY */
4063 4080          mutex_enter(&zone_status_lock);
4064 4081          ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4065 4082          zone_status_set(zone, ZONE_IS_READY);
4066 4083          mutex_exit(&zone_status_lock);
4067 4084  
4068 4085          /*
4069 4086           * Once we see the zone transition to the ZONE_IS_BOOTING state,
4070 4087           * we launch init, and set the state to running.
4071 4088           */
4072 4089          zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4073 4090  
4074 4091          if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4075 4092                  id_t cid;
4076 4093  
4077 4094                  /*
4078 4095                   * Ok, this is a little complicated.  We need to grab the
4079 4096                   * zone's pool's scheduling class ID; note that by now, we
4080 4097                   * are already bound to a pool if we need to be (zoneadmd
4081 4098                   * will have done that to us while we're in the READY
4082 4099                   * state).  *But* the scheduling class for the zone's 'init'
4083 4100                   * must be explicitly passed to newproc, which doesn't
4084 4101                   * respect pool bindings.
4085 4102                   *
4086 4103                   * We hold the pool_lock across the call to newproc() to
4087 4104                   * close the obvious race: the pool's scheduling class
4088 4105                   * could change before we manage to create the LWP with
4089 4106                   * classid 'cid'.
4090 4107                   */
4091 4108                  pool_lock();
4092 4109                  if (zone->zone_defaultcid > 0)
4093 4110                          cid = zone->zone_defaultcid;
4094 4111                  else
4095 4112                          cid = pool_get_class(zone->zone_pool);
4096 4113                  if (cid == -1)
4097 4114                          cid = defaultcid;
4098 4115  
4099 4116                  /*
4100 4117                   * If this fails, zone_boot will ultimately fail.  The
4101 4118                   * state of the zone will be set to SHUTTING_DOWN-- userland
4102 4119                   * will have to tear down the zone, and fail, or try again.
4103 4120                   */
4104 4121                  if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4105 4122                      minclsyspri - 1, &ct, 0)) != 0) {
4106 4123                          mutex_enter(&zone_status_lock);
4107 4124                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4108 4125                          mutex_exit(&zone_status_lock);
4109 4126                  } else {
4110 4127                          zone->zone_boot_time = gethrestime_sec();
4111 4128                  }
4112 4129  
4113 4130                  pool_unlock();
4114 4131          }
4115 4132  
4116 4133          /*
4117 4134           * Wait for zone_destroy() to be called.  This is what we spend
4118 4135           * most of our life doing.
4119 4136           */
4120 4137          zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4121 4138  
4122 4139          if (ct)
4123 4140                  /*
4124 4141                   * At this point the process contract should be empty.
4125 4142                   * (Though if it isn't, it's not the end of the world.)
4126 4143                   */
4127 4144                  VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4128 4145  
4129 4146          /*
4130 4147           * Allow kcred to be freed when all referring processes
4131 4148           * (including this one) go away.  We can't just do this in
4132 4149           * zone_free because we need to wait for the zone_cred_ref to
4133 4150           * drop to 0 before calling zone_free, and the existence of
4134 4151           * zone_kcred will prevent that.  Thus, we call crfree here to
4135 4152           * balance the crdup in zone_create.  The crhold calls earlier
4136 4153           * in zsched will be dropped when the thread and process exit.
4137 4154           */
4138 4155          crfree(zone->zone_kcred);
4139 4156          zone->zone_kcred = NULL;
4140 4157  
4141 4158          exit(CLD_EXITED, 0);
4142 4159  }
4143 4160  
4144 4161  /*
4145 4162   * Helper function to determine if there are any submounts of the
4146 4163   * provided path.  Used to make sure the zone doesn't "inherit" any
4147 4164   * mounts from before it is created.
4148 4165   */
4149 4166  static uint_t
4150 4167  zone_mount_count(const char *rootpath)
4151 4168  {
4152 4169          vfs_t *vfsp;
4153 4170          uint_t count = 0;
4154 4171          size_t rootpathlen = strlen(rootpath);
4155 4172  
4156 4173          /*
4157 4174           * Holding zonehash_lock prevents race conditions with
4158 4175           * vfs_list_add()/vfs_list_remove() since we serialize with
4159 4176           * zone_find_by_path().
4160 4177           */
4161 4178          ASSERT(MUTEX_HELD(&zonehash_lock));
4162 4179          /*
4163 4180           * The rootpath must end with a '/'
4164 4181           */
4165 4182          ASSERT(rootpath[rootpathlen - 1] == '/');
4166 4183  
4167 4184          /*
4168 4185           * This intentionally does not count the rootpath itself if that
4169 4186           * happens to be a mount point.
4170 4187           */
4171 4188          vfs_list_read_lock();
4172 4189          vfsp = rootvfs;
4173 4190          do {
4174 4191                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4175 4192                      rootpathlen) == 0)
4176 4193                          count++;
4177 4194                  vfsp = vfsp->vfs_next;
4178 4195          } while (vfsp != rootvfs);
4179 4196          vfs_list_unlock();
4180 4197          return (count);
4181 4198  }
4182 4199  
4183 4200  /*
4184 4201   * Helper function to make sure that a zone created on 'rootpath'
4185 4202   * wouldn't end up containing other zones' rootpaths.
4186 4203   */
4187 4204  static boolean_t
4188 4205  zone_is_nested(const char *rootpath)
4189 4206  {
4190 4207          zone_t *zone;
4191 4208          size_t rootpathlen = strlen(rootpath);
4192 4209          size_t len;
4193 4210  
4194 4211          ASSERT(MUTEX_HELD(&zonehash_lock));
4195 4212  
4196 4213          /*
4197 4214           * zone_set_root() appended '/' and '\0' at the end of rootpath
4198 4215           */
4199 4216          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4200 4217              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4201 4218                  return (B_TRUE);
4202 4219  
4203 4220          for (zone = list_head(&zone_active); zone != NULL;
4204 4221              zone = list_next(&zone_active, zone)) {
4205 4222                  if (zone == global_zone)
4206 4223                          continue;
4207 4224                  len = strlen(zone->zone_rootpath);
4208 4225                  if (strncmp(rootpath, zone->zone_rootpath,
4209 4226                      MIN(rootpathlen, len)) == 0)
4210 4227                          return (B_TRUE);
4211 4228          }
4212 4229          return (B_FALSE);
4213 4230  }
4214 4231  
4215 4232  static int
4216 4233  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4217 4234      size_t zone_privssz)
4218 4235  {
4219 4236          priv_set_t *privs;
4220 4237  
4221 4238          if (zone_privssz < sizeof (priv_set_t))
4222 4239                  return (ENOMEM);
4223 4240  
4224 4241          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4225 4242  
4226 4243          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4227 4244                  kmem_free(privs, sizeof (priv_set_t));
4228 4245                  return (EFAULT);
4229 4246          }
4230 4247  
4231 4248          zone->zone_privset = privs;
4232 4249          return (0);
4233 4250  }
4234 4251  
4235 4252  /*
4236 4253   * We make creative use of nvlists to pass in rctls from userland.  The list is
4237 4254   * a list of the following structures:
4238 4255   *
4239 4256   * (name = rctl_name, value = nvpair_list_array)
4240 4257   *
4241 4258   * Where each element of the nvpair_list_array is of the form:
4242 4259   *
4243 4260   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4244 4261   *      (name = "limit", value = uint64_t),
4245 4262   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4246 4263   */
4247 4264  static int
4248 4265  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4249 4266  {
4250 4267          nvpair_t *nvp = NULL;
4251 4268          nvlist_t *nvl = NULL;
4252 4269          char *kbuf;
4253 4270          int error;
4254 4271          rctl_val_t rv;
4255 4272  
4256 4273          *nvlp = NULL;
4257 4274  
4258 4275          if (buflen == 0)
4259 4276                  return (0);
4260 4277  
4261 4278          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4262 4279                  return (ENOMEM);
4263 4280          if (copyin(ubuf, kbuf, buflen)) {
4264 4281                  error = EFAULT;
4265 4282                  goto out;
4266 4283          }
4267 4284          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4268 4285                  /*
4269 4286                   * nvl may have been allocated/free'd, but the value set to
4270 4287                   * non-NULL, so we reset it here.
4271 4288                   */
4272 4289                  nvl = NULL;
4273 4290                  error = EINVAL;
4274 4291                  goto out;
4275 4292          }
4276 4293          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4277 4294                  rctl_dict_entry_t *rde;
4278 4295                  rctl_hndl_t hndl;
4279 4296                  nvlist_t **nvlarray;
4280 4297                  uint_t i, nelem;
4281 4298                  char *name;
4282 4299  
4283 4300                  error = EINVAL;
4284 4301                  name = nvpair_name(nvp);
4285 4302                  if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4286 4303                      != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4287 4304                          goto out;
4288 4305                  }
4289 4306                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4290 4307                          goto out;
4291 4308                  }
4292 4309                  rde = rctl_dict_lookup_hndl(hndl);
4293 4310                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4294 4311                  ASSERT(error == 0);
4295 4312                  for (i = 0; i < nelem; i++) {
4296 4313                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4297 4314                                  goto out;
4298 4315                  }
4299 4316                  if (rctl_invalid_value(rde, &rv)) {
4300 4317                          error = EINVAL;
4301 4318                          goto out;
4302 4319                  }
4303 4320          }
4304 4321          error = 0;
4305 4322          *nvlp = nvl;
4306 4323  out:
4307 4324          kmem_free(kbuf, buflen);
4308 4325          if (error && nvl != NULL)
4309 4326                  nvlist_free(nvl);
4310 4327          return (error);
4311 4328  }
4312 4329  
4313 4330  int
4314 4331  zone_create_error(int er_error, int er_ext, int *er_out)
4315 4332  {
4316 4333          if (er_out != NULL) {
4317 4334                  if (copyout(&er_ext, er_out, sizeof (int))) {
4318 4335                          return (set_errno(EFAULT));
4319 4336                  }
4320 4337          }
4321 4338          return (set_errno(er_error));
4322 4339  }
4323 4340  
4324 4341  static int
4325 4342  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4326 4343  {
4327 4344          ts_label_t *tsl;
4328 4345          bslabel_t blab;
4329 4346  
4330 4347          /* Get label from user */
4331 4348          if (copyin(lab, &blab, sizeof (blab)) != 0)
4332 4349                  return (EFAULT);
4333 4350          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4334 4351          if (tsl == NULL)
4335 4352                  return (ENOMEM);
4336 4353  
4337 4354          zone->zone_slabel = tsl;
4338 4355          return (0);
4339 4356  }
4340 4357  
4341 4358  /*
4342 4359   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4343 4360   */
4344 4361  static int
4345 4362  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4346 4363  {
4347 4364          char *kbuf;
4348 4365          char *dataset, *next;
4349 4366          zone_dataset_t *zd;
4350 4367          size_t len;
4351 4368  
4352 4369          if (ubuf == NULL || buflen == 0)
4353 4370                  return (0);
4354 4371  
4355 4372          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4356 4373                  return (ENOMEM);
4357 4374  
4358 4375          if (copyin(ubuf, kbuf, buflen) != 0) {
4359 4376                  kmem_free(kbuf, buflen);
4360 4377                  return (EFAULT);
4361 4378          }
4362 4379  
4363 4380          dataset = next = kbuf;
4364 4381          for (;;) {
4365 4382                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
4366 4383  
4367 4384                  next = strchr(dataset, ',');
4368 4385  
4369 4386                  if (next == NULL)
4370 4387                          len = strlen(dataset);
4371 4388                  else
4372 4389                          len = next - dataset;
4373 4390  
4374 4391                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
4375 4392                  bcopy(dataset, zd->zd_dataset, len);
4376 4393                  zd->zd_dataset[len] = '\0';
4377 4394  
4378 4395                  list_insert_head(&zone->zone_datasets, zd);
4379 4396  
4380 4397                  if (next == NULL)
4381 4398                          break;
4382 4399  
4383 4400                  dataset = next + 1;
4384 4401          }
4385 4402  
4386 4403          kmem_free(kbuf, buflen);
4387 4404          return (0);
4388 4405  }
4389 4406  
4390 4407  /*
4391 4408   * System call to create/initialize a new zone named 'zone_name', rooted
4392 4409   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4393 4410   * and initialized with the zone-wide rctls described in 'rctlbuf', and
4394 4411   * with labeling set by 'match', 'doi', and 'label'.
4395 4412   *
4396 4413   * If extended error is non-null, we may use it to return more detailed
4397 4414   * error information.
4398 4415   */
4399 4416  static zoneid_t
4400 4417  zone_create(const char *zone_name, const char *zone_root,
4401 4418      const priv_set_t *zone_privs, size_t zone_privssz,
4402 4419      caddr_t rctlbuf, size_t rctlbufsz,
4403 4420      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4404 4421      int match, uint32_t doi, const bslabel_t *label,
4405 4422      int flags)
4406 4423  {
4407 4424          struct zsched_arg zarg;
4408 4425          nvlist_t *rctls = NULL;
4409 4426          proc_t *pp = curproc;
4410 4427          zone_t *zone, *ztmp;
4411 4428          zoneid_t zoneid, start = GLOBAL_ZONEID;
4412 4429          int error;
4413 4430          int error2 = 0;
4414 4431          char *str;
4415 4432          cred_t *zkcr;
4416 4433          boolean_t insert_label_hash;
4417 4434  
4418 4435          if (secpolicy_zone_config(CRED()) != 0)
4419 4436                  return (set_errno(EPERM));
4420 4437  
4421 4438          /* can't boot zone from within chroot environment */
4422 4439          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4423 4440                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4424 4441                      extended_error));
4425 4442          /*
4426 4443           * As the first step of zone creation, we want to allocate a zoneid.
4427 4444           * This allocation is complicated by the fact that netstacks use the
4428 4445           * zoneid to determine their stackid, but netstacks themselves are
4429 4446           * freed asynchronously with respect to zone destruction.  This means
4430 4447           * that a netstack reference leak (or in principle, an extraordinarily
4431 4448           * long netstack reference hold) could result in a zoneid being
4432 4449           * allocated that in fact corresponds to a stackid from an active
4433 4450           * (referenced) netstack -- unleashing all sorts of havoc when that
4434 4451           * netstack is actually (re)used.  (In the abstract, we might wish a
4435 4452           * zoneid to not be deallocated until its last referencing netstack
4436 4453           * has been released, but netstacks lack a backpointer into their
4437 4454           * referencing zone -- and changing them to have such a pointer would
4438 4455           * be substantial, to put it euphemistically.)  To avoid this, we
4439 4456           * detect this condition on allocation: if we have allocated a zoneid
4440 4457           * that corresponds to a netstack that's still in use, we warn about
4441 4458           * it (as it is much more likely to be a reference leak than an actual
4442 4459           * netstack reference), free it, and allocate another.  That these
4443 4460           * identifers are allocated out of an ID space assures that we won't
4444 4461           * see the identifier we just allocated.
4445 4462           */
4446 4463          for (;;) {
4447 4464                  zoneid = id_alloc(zoneid_space);
4448 4465  
4449 4466                  if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4450 4467                          break;
4451 4468  
4452 4469                  id_free(zoneid_space, zoneid);
4453 4470  
4454 4471                  if (start == GLOBAL_ZONEID) {
4455 4472                          start = zoneid;
4456 4473                  } else if (zoneid == start) {
4457 4474                          /*
4458 4475                           * We have managed to iterate over the entire available
4459 4476                           * zoneid space -- there are no identifiers available,
4460 4477                           * presumably due to some number of leaked netstack
4461 4478                           * references.  While it's in principle possible for us
4462 4479                           * to continue to try, it seems wiser to give up at
4463 4480                           * this point to warn and fail explicitly with a
4464 4481                           * distinctive error.
4465 4482                           */
4466 4483                          cmn_err(CE_WARN, "zone_create() failed: all available "
4467 4484                              "zone IDs have netstacks still in use");
4468 4485                          return (set_errno(ENFILE));
4469 4486                  }
4470 4487  
4471 4488                  cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4472 4489                      "netstack still in use", zoneid);
4473 4490          }
4474 4491  
4475 4492          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4476 4493          zone->zone_id = zoneid;
4477 4494          zone->zone_status = ZONE_IS_UNINITIALIZED;
4478 4495          zone->zone_pool = pool_default;
4479 4496          zone->zone_pool_mod = gethrtime();
4480 4497          zone->zone_psetid = ZONE_PS_INVAL;
4481 4498          zone->zone_ncpus = 0;
4482 4499          zone->zone_ncpus_online = 0;
4483 4500          zone->zone_restart_init = B_TRUE;
4484 4501          zone->zone_brand = &native_brand;
4485 4502          zone->zone_initname = NULL;
4486 4503          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4487 4504          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4488 4505          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4489 4506          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4490 4507          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4491 4508              offsetof(zone_ref_t, zref_linkage));
4492 4509          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4493 4510              offsetof(struct zsd_entry, zsd_linkage));
4494 4511          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4495 4512              offsetof(zone_dataset_t, zd_linkage));
4496 4513          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4497 4514              offsetof(zone_dl_t, zdl_linkage));
4498 4515          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4499 4516          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4500 4517  
4501 4518          if (flags & ZCF_NET_EXCL) {
4502 4519                  zone->zone_flags |= ZF_NET_EXCL;
4503 4520          }
4504 4521  
4505 4522          if ((error = zone_set_name(zone, zone_name)) != 0) {
4506 4523                  zone_free(zone);
4507 4524                  return (zone_create_error(error, 0, extended_error));
4508 4525          }
4509 4526  
4510 4527          if ((error = zone_set_root(zone, zone_root)) != 0) {
4511 4528                  zone_free(zone);
4512 4529                  return (zone_create_error(error, 0, extended_error));
4513 4530          }
4514 4531          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4515 4532                  zone_free(zone);
4516 4533                  return (zone_create_error(error, 0, extended_error));
4517 4534          }
4518 4535  
4519 4536          /* initialize node name to be the same as zone name */
4520 4537          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4521 4538          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4522 4539          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4523 4540  
4524 4541          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4525 4542          zone->zone_domain[0] = '\0';
4526 4543          zone->zone_hostid = HW_INVALID_HOSTID;
4527 4544          zone->zone_shares = 1;
4528 4545          zone->zone_shmmax = 0;
4529 4546          zone->zone_ipc.ipcq_shmmni = 0;
4530 4547          zone->zone_ipc.ipcq_semmni = 0;
4531 4548          zone->zone_ipc.ipcq_msgmni = 0;
4532 4549          zone->zone_bootargs = NULL;
4533 4550          zone->zone_fs_allowed = NULL;
4534 4551  
4535 4552          psecflags_default(&zone->zone_secflags);
4536 4553  
4537 4554          zone->zone_initname =
4538 4555              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4539 4556          (void) strcpy(zone->zone_initname, zone_default_initname);
4540 4557          zone->zone_nlwps = 0;
4541 4558          zone->zone_nlwps_ctl = INT_MAX;
4542 4559          zone->zone_nprocs = 0;
4543 4560          zone->zone_nprocs_ctl = INT_MAX;
4544 4561          zone->zone_locked_mem = 0;
4545 4562          zone->zone_locked_mem_ctl = UINT64_MAX;
4546 4563          zone->zone_max_swap = 0;
4547 4564          zone->zone_max_swap_ctl = UINT64_MAX;
4548 4565          zone->zone_max_lofi = 0;
4549 4566          zone->zone_max_lofi_ctl = UINT64_MAX;
4550 4567          zone0.zone_lockedmem_kstat = NULL;
4551 4568          zone0.zone_swapresv_kstat = NULL;
4552 4569  
4553 4570          zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
4554 4571  
4555 4572          /*
4556 4573           * Zsched initializes the rctls.
4557 4574           */
4558 4575          zone->zone_rctls = NULL;
4559 4576  
4560 4577          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4561 4578                  zone_free(zone);
4562 4579                  return (zone_create_error(error, 0, extended_error));
4563 4580          }
4564 4581  
4565 4582          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4566 4583                  zone_free(zone);
4567 4584                  return (set_errno(error));
4568 4585          }
4569 4586  
4570 4587          /*
4571 4588           * Read in the trusted system parameters:
4572 4589           * match flag and sensitivity label.
4573 4590           */
4574 4591          zone->zone_match = match;
4575 4592          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4576 4593                  /* Fail if requested to set doi to anything but system's doi */
4577 4594                  if (doi != 0 && doi != default_doi) {
4578 4595                          zone_free(zone);
4579 4596                          return (set_errno(EINVAL));
4580 4597                  }
4581 4598                  /* Always apply system's doi to the zone */
4582 4599                  error = zone_set_label(zone, label, default_doi);
4583 4600                  if (error != 0) {
4584 4601                          zone_free(zone);
4585 4602                          return (set_errno(error));
4586 4603                  }
4587 4604                  insert_label_hash = B_TRUE;
4588 4605          } else {
4589 4606                  /* all zones get an admin_low label if system is not labeled */
4590 4607                  zone->zone_slabel = l_admin_low;
4591 4608                  label_hold(l_admin_low);
4592 4609                  insert_label_hash = B_FALSE;
4593 4610          }
4594 4611  
4595 4612          /*
4596 4613           * Stop all lwps since that's what normally happens as part of fork().
4597 4614           * This needs to happen before we grab any locks to avoid deadlock
4598 4615           * (another lwp in the process could be waiting for the held lock).
4599 4616           */
4600 4617          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4601 4618                  zone_free(zone);
4602 4619                  nvlist_free(rctls);
4603 4620                  return (zone_create_error(error, 0, extended_error));
4604 4621          }
4605 4622  
4606 4623          if (block_mounts(zone) == 0) {
4607 4624                  mutex_enter(&pp->p_lock);
4608 4625                  if (curthread != pp->p_agenttp)
4609 4626                          continuelwps(pp);
4610 4627                  mutex_exit(&pp->p_lock);
4611 4628                  zone_free(zone);
4612 4629                  nvlist_free(rctls);
4613 4630                  return (zone_create_error(error, 0, extended_error));
4614 4631          }
4615 4632  
4616 4633          /*
4617 4634           * Set up credential for kernel access.  After this, any errors
4618 4635           * should go through the dance in errout rather than calling
4619 4636           * zone_free directly.
4620 4637           */
4621 4638          zone->zone_kcred = crdup(kcred);
4622 4639          crsetzone(zone->zone_kcred, zone);
4623 4640          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4624 4641          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4625 4642          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4626 4643          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4627 4644  
4628 4645          mutex_enter(&zonehash_lock);
4629 4646          /*
4630 4647           * Make sure zone doesn't already exist.
4631 4648           *
4632 4649           * If the system and zone are labeled,
4633 4650           * make sure no other zone exists that has the same label.
4634 4651           */
4635 4652          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4636 4653              (insert_label_hash &&
4637 4654              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4638 4655                  zone_status_t status;
4639 4656  
4640 4657                  status = zone_status_get(ztmp);
4641 4658                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4642 4659                          error = EEXIST;
4643 4660                  else
4644 4661                          error = EBUSY;
4645 4662  
4646 4663                  if (insert_label_hash)
4647 4664                          error2 = ZE_LABELINUSE;
4648 4665  
4649 4666                  goto errout;
4650 4667          }
4651 4668  
4652 4669          /*
4653 4670           * Don't allow zone creations which would cause one zone's rootpath to
4654 4671           * be accessible from that of another (non-global) zone.
4655 4672           */
4656 4673          if (zone_is_nested(zone->zone_rootpath)) {
4657 4674                  error = EBUSY;
4658 4675                  goto errout;
4659 4676          }
4660 4677  
4661 4678          ASSERT(zonecount != 0);         /* check for leaks */
4662 4679          if (zonecount + 1 > maxzones) {
4663 4680                  error = ENOMEM;
4664 4681                  goto errout;
4665 4682          }
4666 4683  
4667 4684          if (zone_mount_count(zone->zone_rootpath) != 0) {
4668 4685                  error = EBUSY;
4669 4686                  error2 = ZE_AREMOUNTS;
4670 4687                  goto errout;
4671 4688          }
4672 4689  
4673 4690          /*
4674 4691           * Zone is still incomplete, but we need to drop all locks while
4675 4692           * zsched() initializes this zone's kernel process.  We
4676 4693           * optimistically add the zone to the hashtable and associated
4677 4694           * lists so a parallel zone_create() doesn't try to create the
4678 4695           * same zone.
4679 4696           */
4680 4697          zonecount++;
4681 4698          (void) mod_hash_insert(zonehashbyid,
4682 4699              (mod_hash_key_t)(uintptr_t)zone->zone_id,
4683 4700              (mod_hash_val_t)(uintptr_t)zone);
4684 4701          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4685 4702          (void) strcpy(str, zone->zone_name);
4686 4703          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4687 4704              (mod_hash_val_t)(uintptr_t)zone);
4688 4705          if (insert_label_hash) {
4689 4706                  (void) mod_hash_insert(zonehashbylabel,
4690 4707                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4691 4708                  zone->zone_flags |= ZF_HASHED_LABEL;
4692 4709          }
4693 4710  
4694 4711          /*
4695 4712           * Insert into active list.  At this point there are no 'hold's
4696 4713           * on the zone, but everyone else knows not to use it, so we can
4697 4714           * continue to use it.  zsched() will do a zone_hold() if the
4698 4715           * newproc() is successful.
4699 4716           */
4700 4717          list_insert_tail(&zone_active, zone);
4701 4718          mutex_exit(&zonehash_lock);
4702 4719  
4703 4720          zarg.zone = zone;
4704 4721          zarg.nvlist = rctls;
4705 4722          /*
4706 4723           * The process, task, and project rctls are probably wrong;
4707 4724           * we need an interface to get the default values of all rctls,
4708 4725           * and initialize zsched appropriately.  I'm not sure that that
4709 4726           * makes much of a difference, though.
4710 4727           */
4711 4728          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4712 4729          if (error != 0) {
4713 4730                  /*
4714 4731                   * We need to undo all globally visible state.
4715 4732                   */
4716 4733                  mutex_enter(&zonehash_lock);
4717 4734                  list_remove(&zone_active, zone);
4718 4735                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4719 4736                          ASSERT(zone->zone_slabel != NULL);
4720 4737                          (void) mod_hash_destroy(zonehashbylabel,
4721 4738                              (mod_hash_key_t)zone->zone_slabel);
4722 4739                  }
4723 4740                  (void) mod_hash_destroy(zonehashbyname,
4724 4741                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
4725 4742                  (void) mod_hash_destroy(zonehashbyid,
4726 4743                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
4727 4744                  ASSERT(zonecount > 1);
4728 4745                  zonecount--;
4729 4746                  goto errout;
4730 4747          }
4731 4748  
4732 4749          /*
4733 4750           * Zone creation can't fail from now on.
4734 4751           */
4735 4752  
4736 4753          /*
4737 4754           * Create zone kstats
4738 4755           */
4739 4756          zone_kstat_create(zone);
4740 4757  
4741 4758          /*
4742 4759           * Let the other lwps continue.
4743 4760           */
4744 4761          mutex_enter(&pp->p_lock);
4745 4762          if (curthread != pp->p_agenttp)
4746 4763                  continuelwps(pp);
4747 4764          mutex_exit(&pp->p_lock);
4748 4765  
4749 4766          /*
4750 4767           * Wait for zsched to finish initializing the zone.
4751 4768           */
4752 4769          zone_status_wait(zone, ZONE_IS_READY);
4753 4770          /*
4754 4771           * The zone is fully visible, so we can let mounts progress.
4755 4772           */
4756 4773          resume_mounts(zone);
4757 4774          nvlist_free(rctls);
4758 4775  
4759 4776          return (zoneid);
4760 4777  
4761 4778  errout:
4762 4779          mutex_exit(&zonehash_lock);
4763 4780          /*
4764 4781           * Let the other lwps continue.
4765 4782           */
4766 4783          mutex_enter(&pp->p_lock);
4767 4784          if (curthread != pp->p_agenttp)
4768 4785                  continuelwps(pp);
4769 4786          mutex_exit(&pp->p_lock);
4770 4787  
4771 4788          resume_mounts(zone);
4772 4789          nvlist_free(rctls);
4773 4790          /*
4774 4791           * There is currently one reference to the zone, a cred_ref from
4775 4792           * zone_kcred.  To free the zone, we call crfree, which will call
4776 4793           * zone_cred_rele, which will call zone_free.
4777 4794           */
4778 4795          ASSERT(zone->zone_cred_ref == 1);
4779 4796          ASSERT(zone->zone_kcred->cr_ref == 1);
4780 4797          ASSERT(zone->zone_ref == 0);
4781 4798          zkcr = zone->zone_kcred;
4782 4799          zone->zone_kcred = NULL;
4783 4800          crfree(zkcr);                           /* triggers call to zone_free */
4784 4801          return (zone_create_error(error, error2, extended_error));
4785 4802  }
4786 4803  
4787 4804  /*
4788 4805   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4789 4806   * the heavy lifting.  initname is the path to the program to launch
4790 4807   * at the "top" of the zone; if this is NULL, we use the system default,
4791 4808   * which is stored at zone_default_initname.
4792 4809   */
4793 4810  static int
4794 4811  zone_boot(zoneid_t zoneid)
4795 4812  {
4796 4813          int err;
4797 4814          zone_t *zone;
4798 4815  
4799 4816          if (secpolicy_zone_config(CRED()) != 0)
4800 4817                  return (set_errno(EPERM));
4801 4818          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4802 4819                  return (set_errno(EINVAL));
4803 4820  
4804 4821          mutex_enter(&zonehash_lock);
4805 4822          /*
4806 4823           * Look for zone under hash lock to prevent races with calls to
4807 4824           * zone_shutdown, zone_destroy, etc.
4808 4825           */
4809 4826          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4810 4827                  mutex_exit(&zonehash_lock);
4811 4828                  return (set_errno(EINVAL));
4812 4829          }
4813 4830  
4814 4831          mutex_enter(&zone_status_lock);
4815 4832          if (zone_status_get(zone) != ZONE_IS_READY) {
4816 4833                  mutex_exit(&zone_status_lock);
4817 4834                  mutex_exit(&zonehash_lock);
4818 4835                  return (set_errno(EINVAL));
4819 4836          }
4820 4837          zone_status_set(zone, ZONE_IS_BOOTING);
4821 4838          mutex_exit(&zone_status_lock);
4822 4839  
4823 4840          zone_hold(zone);        /* so we can use the zone_t later */
4824 4841          mutex_exit(&zonehash_lock);
4825 4842  
4826 4843          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4827 4844                  zone_rele(zone);
4828 4845                  return (set_errno(EINTR));
4829 4846          }
4830 4847  
4831 4848          /*
4832 4849           * Boot (starting init) might have failed, in which case the zone
4833 4850           * will go to the SHUTTING_DOWN state; an appropriate errno will
4834 4851           * be placed in zone->zone_boot_err, and so we return that.
4835 4852           */
4836 4853          err = zone->zone_boot_err;
4837 4854          zone_rele(zone);
4838 4855          return (err ? set_errno(err) : 0);
4839 4856  }
4840 4857  
4841 4858  /*
4842 4859   * Kills all user processes in the zone, waiting for them all to exit
4843 4860   * before returning.
4844 4861   */
4845 4862  static int
4846 4863  zone_empty(zone_t *zone)
4847 4864  {
4848 4865          int waitstatus;
4849 4866  
4850 4867          /*
4851 4868           * We need to drop zonehash_lock before killing all
4852 4869           * processes, otherwise we'll deadlock with zone_find_*
4853 4870           * which can be called from the exit path.
4854 4871           */
4855 4872          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4856 4873          while ((waitstatus = zone_status_timedwait_sig(zone,
4857 4874              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4858 4875                  killall(zone->zone_id);
4859 4876          }
4860 4877          /*
4861 4878           * return EINTR if we were signaled
4862 4879           */
4863 4880          if (waitstatus == 0)
4864 4881                  return (EINTR);
4865 4882          return (0);
4866 4883  }
4867 4884  
4868 4885  /*
4869 4886   * This function implements the policy for zone visibility.
4870 4887   *
4871 4888   * In standard Solaris, a non-global zone can only see itself.
4872 4889   *
4873 4890   * In Trusted Extensions, a labeled zone can lookup any zone whose label
4874 4891   * it dominates. For this test, the label of the global zone is treated as
4875 4892   * admin_high so it is special-cased instead of being checked for dominance.
4876 4893   *
4877 4894   * Returns true if zone attributes are viewable, false otherwise.
4878 4895   */
4879 4896  static boolean_t
4880 4897  zone_list_access(zone_t *zone)
4881 4898  {
4882 4899  
4883 4900          if (curproc->p_zone == global_zone ||
4884 4901              curproc->p_zone == zone) {
4885 4902                  return (B_TRUE);
4886 4903          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4887 4904                  bslabel_t *curproc_label;
4888 4905                  bslabel_t *zone_label;
4889 4906  
4890 4907                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4891 4908                  zone_label = label2bslabel(zone->zone_slabel);
4892 4909  
4893 4910                  if (zone->zone_id != GLOBAL_ZONEID &&
4894 4911                      bldominates(curproc_label, zone_label)) {
4895 4912                          return (B_TRUE);
4896 4913                  } else {
4897 4914                          return (B_FALSE);
4898 4915                  }
4899 4916          } else {
4900 4917                  return (B_FALSE);
4901 4918          }
4902 4919  }
4903 4920  
4904 4921  /*
4905 4922   * Systemcall to start the zone's halt sequence.  By the time this
4906 4923   * function successfully returns, all user processes and kernel threads
4907 4924   * executing in it will have exited, ZSD shutdown callbacks executed,
4908 4925   * and the zone status set to ZONE_IS_DOWN.
4909 4926   *
4910 4927   * It is possible that the call will interrupt itself if the caller is the
4911 4928   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4912 4929   */
4913 4930  static int
4914 4931  zone_shutdown(zoneid_t zoneid)
4915 4932  {
4916 4933          int error;
4917 4934          zone_t *zone;
4918 4935          zone_status_t status;
4919 4936  
4920 4937          if (secpolicy_zone_config(CRED()) != 0)
4921 4938                  return (set_errno(EPERM));
4922 4939          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4923 4940                  return (set_errno(EINVAL));
4924 4941  
4925 4942          mutex_enter(&zonehash_lock);
4926 4943          /*
4927 4944           * Look for zone under hash lock to prevent races with other
4928 4945           * calls to zone_shutdown and zone_destroy.
4929 4946           */
4930 4947          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4931 4948                  mutex_exit(&zonehash_lock);
4932 4949                  return (set_errno(EINVAL));
4933 4950          }
4934 4951  
4935 4952          /*
4936 4953           * We have to drop zonehash_lock before calling block_mounts.
4937 4954           * Hold the zone so we can continue to use the zone_t.
4938 4955           */
4939 4956          zone_hold(zone);
4940 4957          mutex_exit(&zonehash_lock);
4941 4958  
4942 4959          /*
4943 4960           * Block mounts so that VFS_MOUNT() can get an accurate view of
4944 4961           * the zone's status with regards to ZONE_IS_SHUTTING down.
4945 4962           *
4946 4963           * e.g. NFS can fail the mount if it determines that the zone
4947 4964           * has already begun the shutdown sequence.
4948 4965           *
4949 4966           */
4950 4967          if (block_mounts(zone) == 0) {
4951 4968                  zone_rele(zone);
4952 4969                  return (set_errno(EINTR));
4953 4970          }
4954 4971  
4955 4972          mutex_enter(&zonehash_lock);
4956 4973          mutex_enter(&zone_status_lock);
4957 4974          status = zone_status_get(zone);
4958 4975          /*
4959 4976           * Fail if the zone isn't fully initialized yet.
4960 4977           */
4961 4978          if (status < ZONE_IS_READY) {
4962 4979                  mutex_exit(&zone_status_lock);
4963 4980                  mutex_exit(&zonehash_lock);
4964 4981                  resume_mounts(zone);
4965 4982                  zone_rele(zone);
4966 4983                  return (set_errno(EINVAL));
4967 4984          }
4968 4985          /*
4969 4986           * If conditions required for zone_shutdown() to return have been met,
4970 4987           * return success.
4971 4988           */
4972 4989          if (status >= ZONE_IS_DOWN) {
4973 4990                  mutex_exit(&zone_status_lock);
4974 4991                  mutex_exit(&zonehash_lock);
4975 4992                  resume_mounts(zone);
4976 4993                  zone_rele(zone);
4977 4994                  return (0);
4978 4995          }
4979 4996          /*
4980 4997           * If zone_shutdown() hasn't been called before, go through the motions.
4981 4998           * If it has, there's nothing to do but wait for the kernel threads to
4982 4999           * drain.
4983 5000           */
4984 5001          if (status < ZONE_IS_EMPTY) {
4985 5002                  uint_t ntasks;
4986 5003  
4987 5004                  mutex_enter(&zone->zone_lock);
4988 5005                  if ((ntasks = zone->zone_ntasks) != 1) {
4989 5006                          /*
4990 5007                           * There's still stuff running.
4991 5008                           */
4992 5009                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4993 5010                  }
4994 5011                  mutex_exit(&zone->zone_lock);
4995 5012                  if (ntasks == 1) {
4996 5013                          /*
4997 5014                           * The only way to create another task is through
4998 5015                           * zone_enter(), which will block until we drop
4999 5016                           * zonehash_lock.  The zone is empty.
5000 5017                           */
5001 5018                          if (zone->zone_kthreads == NULL) {
5002 5019                                  /*
5003 5020                                   * Skip ahead to ZONE_IS_DOWN
5004 5021                                   */
5005 5022                                  zone_status_set(zone, ZONE_IS_DOWN);
5006 5023                          } else {
5007 5024                                  zone_status_set(zone, ZONE_IS_EMPTY);
5008 5025                          }
5009 5026                  }
5010 5027          }
5011 5028          mutex_exit(&zone_status_lock);
5012 5029          mutex_exit(&zonehash_lock);
5013 5030          resume_mounts(zone);
5014 5031  
5015 5032          if (error = zone_empty(zone)) {
5016 5033                  zone_rele(zone);
5017 5034                  return (set_errno(error));
5018 5035          }
5019 5036          /*
5020 5037           * After the zone status goes to ZONE_IS_DOWN this zone will no
5021 5038           * longer be notified of changes to the pools configuration, so
5022 5039           * in order to not end up with a stale pool pointer, we point
5023 5040           * ourselves at the default pool and remove all resource
5024 5041           * visibility.  This is especially important as the zone_t may
5025 5042           * languish on the deathrow for a very long time waiting for
5026 5043           * cred's to drain out.
5027 5044           *
5028 5045           * This rebinding of the zone can happen multiple times
5029 5046           * (presumably due to interrupted or parallel systemcalls)
5030 5047           * without any adverse effects.
5031 5048           */
5032 5049          if (pool_lock_intr() != 0) {
5033 5050                  zone_rele(zone);
5034 5051                  return (set_errno(EINTR));
5035 5052          }
5036 5053          if (pool_state == POOL_ENABLED) {
5037 5054                  mutex_enter(&cpu_lock);
5038 5055                  zone_pool_set(zone, pool_default);
5039 5056                  /*
5040 5057                   * The zone no longer needs to be able to see any cpus.
5041 5058                   */
5042 5059                  zone_pset_set(zone, ZONE_PS_INVAL);
5043 5060                  mutex_exit(&cpu_lock);
5044 5061          }
5045 5062          pool_unlock();
5046 5063  
5047 5064          /*
5048 5065           * ZSD shutdown callbacks can be executed multiple times, hence
5049 5066           * it is safe to not be holding any locks across this call.
5050 5067           */
5051 5068          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5052 5069  
5053 5070          mutex_enter(&zone_status_lock);
5054 5071          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5055 5072                  zone_status_set(zone, ZONE_IS_DOWN);
5056 5073          mutex_exit(&zone_status_lock);
5057 5074  
5058 5075          /*
5059 5076           * Wait for kernel threads to drain.
5060 5077           */
5061 5078          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5062 5079                  zone_rele(zone);
5063 5080                  return (set_errno(EINTR));
5064 5081          }
5065 5082  
5066 5083          /*
5067 5084           * Zone can become down/destroyable even if the above wait
5068 5085           * returns EINTR, so any code added here may never execute.
5069 5086           * (i.e. don't add code here)
5070 5087           */
5071 5088  
5072 5089          zone_rele(zone);
5073 5090          return (0);
5074 5091  }
5075 5092  
5076 5093  /*
5077 5094   * Log the specified zone's reference counts.  The caller should not be
5078 5095   * holding the zone's zone_lock.
            *
            * zone_lock is taken here to snapshot the counters, and
            * ZF_REFCOUNTS_LOGGED is set in zone_flags while it is held.  The
            * message is emitted through strlog(); when at least one entry of
            * zone_subsys_ref is nonzero, a bracketed "name: count" list of the
            * nonzero entries is appended to it.
5079 5096   */
5080 5097  static void
5081 5098  zone_log_refcounts(zone_t *zone)
5082 5099  {
5083 5100          char *buffer;
5084 5101          char *buffer_position;
5085 5102          uint32_t buffer_size;
5086 5103          uint32_t index;
5087 5104          uint_t ref;
5088 5105          uint_t cred_ref;
5089 5106  
5090 5107          /*
5091 5108           * Construct a string representing the subsystem-specific reference
5092 5109           * counts.  The counts are printed in ascending order by index into the
5093 5110           * zone_t::zone_subsys_ref array.  The list will be surrounded by
5094 5111           * square brackets [] and will only contain nonzero reference counts.
5095 5112           *
5096 5113           * The buffer will hold two square bracket characters plus ten digits,
5097 5114           * one colon, one space, one comma, and some characters for a
5098 5115           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5099 5116           * bit integers have at most ten decimal digits.)  The last
5100 5117           * reference count's comma is replaced by the closing square
5101 5118           * bracket and a NUL character to terminate the string.
5102 5119           *
5103 5120           * NOTE: We have to grab the zone's zone_lock to create a consistent
5104 5121           * snapshot of the zone's reference counters.
5105 5122           *
5106 5123           * First, figure out how much space the string buffer will need.
5107 5124           * The buffer's size is stored in buffer_size.
5108 5125           */
5109 5126          buffer_size = 2;                        /* for the square brackets */
5110 5127          mutex_enter(&zone->zone_lock);
5111 5128          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5112 5129          ref = zone->zone_ref;
5113 5130          cred_ref = zone->zone_cred_ref;
                   /*
                    * 13 == ten digits + colon + space + comma, matching the "%s: %u,"
                    * format used to fill the buffer below.
                    */
5114 5131          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5115 5132                  if (zone->zone_subsys_ref[index] != 0)
5116 5133                          buffer_size += strlen(zone_ref_subsys_names[index]) +
5117 5134                              13;
5118 5135          if (buffer_size == 2) {
5119 5136                  /*
5120 5137                   * No subsystems had nonzero reference counts.  Don't bother
5121 5138                   * with allocating a buffer; just log the general-purpose and
5122 5139                   * credential reference counts.
5123 5140                   */
5124 5141                  mutex_exit(&zone->zone_lock);
5125 5142                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5126 5143                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
5127 5144                      "references and %u credential references are still extant",
5128 5145                      zone->zone_name, zone->zone_id, ref, cred_ref);
5129 5146                  return;
5130 5147          }
5131 5148  
5132 5149          /*
5133 5150           * buffer_size is an upper bound on the number of characters that the
5134 5151           * buffer will need (each count is budgeted ten digits).  Allocate the
5135 5152           * buffer and fill it with nonzero subsystem-specific reference
5136 5153           * counts.  Surround the results with square brackets afterwards.
                    *
                    * zone_lock is still held here, so the counters formatted below are
                    * the same values that were used to size the buffer.
5137 5154           */
5138 5155          buffer = kmem_alloc(buffer_size, KM_SLEEP);
5139 5156          buffer_position = &buffer[1];
5140 5157          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5141 5158                  /*
5142 5159                   * NOTE: The DDI's version of sprintf() returns a pointer to
5143 5160                   * the modified buffer rather than the number of bytes written
5144 5161                   * (as in snprintf(3C)).  This is unfortunate and annoying.
5145 5162                   * Therefore, we'll use snprintf() with INT_MAX to get the
5146 5163                   * number of bytes written.  Using INT_MAX is safe because
5147 5164                   * the buffer is sized for the worst case: we'll never
5148 5165                   * overrun the buffer.
5149 5166                   */
5150 5167                  if (zone->zone_subsys_ref[index] != 0)
5151 5168                          buffer_position += snprintf(buffer_position, INT_MAX,
5152 5169                              "%s: %u,", zone_ref_subsys_names[index],
5153 5170                              zone->zone_subsys_ref[index]);
5154 5171          }
5155 5172          mutex_exit(&zone->zone_lock);
                   /*
                    * buffer_position now points at the terminating NUL written by the
                    * last snprintf(); the byte before it is the trailing comma, which
                    * is overwritten with the closing bracket.
                    */
5156 5173          buffer[0] = '[';
5157 5174          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5158 5175          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5159 5176          buffer_position[-1] = ']';
5160 5177  
5161 5178          /*
5162 5179           * Log the reference counts and free the message buffer.
5163 5180           */
5164 5181          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5165 5182              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5166 5183              "%u credential references are still extant %s", zone->zone_name,
5167 5184              zone->zone_id, ref, cred_ref, buffer);
5168 5185          kmem_free(buffer, buffer_size);
5169 5186  }
5170 5187  
5171 5188  /*
5172 5189   * Systemcall entry point to finalize the zone halt process.  The caller
5173 5190   * must have already successfully called zone_shutdown().
5174 5191   *
5175 5192   * Upon successful completion, the zone will have been fully destroyed:
5176 5193   * zsched will have exited, destructor callbacks executed, and the zone
5177 5194   * removed from the list of active zones.
            *
            * Returns 0 on success.  Failures are reported through set_errno():
            *   EPERM   secpolicy_zone_config() rejected the caller's credentials
            *   EINVAL  zoneid is outside [MIN_USERZONEID, MAX_ZONEID], or no zone
            *           with that id was found
            *   EBUSY   zone_mount_count() is nonzero for the zone's root path, or
            *           the zone's status is below ZONE_IS_DOWN
            *   EINTR   a wait on zone_destroy_cv was interrupted by a signal
5178 5195   */
5179 5196  static int
5180 5197  zone_destroy(zoneid_t zoneid)
5181 5198  {
5182 5199          uint64_t uniqid;
5183 5200          zone_t *zone;
5184 5201          zone_status_t status;
5185 5202          clock_t wait_time;
5186 5203          boolean_t log_refcounts;
5187 5204  
5188 5205          if (secpolicy_zone_config(CRED()) != 0)
5189 5206                  return (set_errno(EPERM));
5190 5207          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5191 5208                  return (set_errno(EINVAL));
5192 5209  
5193 5210          mutex_enter(&zonehash_lock);
5194 5211          /*
5195 5212           * Look for zone under hash lock to prevent races with other
5196 5213           * calls to zone_destroy.
5197 5214           */
5198 5215          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5199 5216                  mutex_exit(&zonehash_lock);
5200 5217                  return (set_errno(EINVAL));
5201 5218          }
5202 5219  
5203 5220          if (zone_mount_count(zone->zone_rootpath) != 0) {
5204 5221                  mutex_exit(&zonehash_lock);
5205 5222                  return (set_errno(EBUSY));
5206 5223          }
5207 5224          mutex_enter(&zone_status_lock);
5208 5225          status = zone_status_get(zone);
5209 5226          if (status < ZONE_IS_DOWN) {
5210 5227                  mutex_exit(&zone_status_lock);
5211 5228                  mutex_exit(&zonehash_lock);
5212 5229                  return (set_errno(EBUSY));
5213 5230          } else if (status == ZONE_IS_DOWN) {
5214 5231                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5215 5232          }
5216 5233          mutex_exit(&zone_status_lock);
5217 5234          zone_hold(zone);
5218 5235          mutex_exit(&zonehash_lock);
5219 5236  
5220 5237          /*
5221 5238           * wait for zsched to exit
5222 5239           */
5223 5240          zone_status_wait(zone, ZONE_IS_DEAD);
5224 5241          zone_zsd_callbacks(zone, ZSD_DESTROY);
5225 5242          zone->zone_netstack = NULL;
                   /*
                    * Remember the zone's uniqid before dropping our hold, so the loop
                    * below can tell whether a zone found under this zoneid is still
                    * the same zone.
                    */
5226 5243          uniqid = zone->zone_uniqid;
5227 5244          zone_rele(zone);
5228 5245          zone = NULL;    /* potentially free'd */
5229 5246  
5230 5247          log_refcounts = B_FALSE;
5231 5248          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5232 5249          mutex_enter(&zonehash_lock);
5233 5250          for (; /* ever */; ) {
5234 5251                  boolean_t unref;
5235 5252                  boolean_t refs_have_been_logged;
5236 5253  
5237 5254                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5238 5255                      zone->zone_uniqid != uniqid) {
5239 5256                          /*
5240 5257                           * The zone has gone away.  Necessary conditions
5241 5258                           * are met, so we return success.
5242 5259                           */
5243 5260                          mutex_exit(&zonehash_lock);
5244 5261                          return (0);
5245 5262                  }
                           /*
                            * ZF_REFCOUNTS_LOGGED is set by zone_log_refcounts(); once it
                            * is set, this loop skips the timed wait and the logging and
                            * goes straight to the untimed cv_wait_sig() below.
                            */
5246 5263                  mutex_enter(&zone->zone_lock);
5247 5264                  unref = ZONE_IS_UNREF(zone);
5248 5265                  refs_have_been_logged = (zone->zone_flags &
5249 5266                      ZF_REFCOUNTS_LOGGED);
5250 5267                  mutex_exit(&zone->zone_lock);
5251 5268                  if (unref) {
5252 5269                          /*
5253 5270                           * There is only one reference to the zone -- that
5254 5271                           * added when the zone was added to the hashtables --
5255 5272                           * and things will remain this way until we drop
5256 5273                           * zonehash_lock... we can go ahead and cleanup the
5257 5274                           * zone.
5258 5275                           */
5259 5276                          break;
5260 5277                  }
5261 5278  
5262 5279                  /*
5263 5280                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5264 5281                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5265 5282                   * some zone's general-purpose reference count reaches one.
5266 5283                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5267 5284                   * on zone_destroy_cv, then log the zone's reference counts and
5268 5285                   * continue to wait for zone_rele() and zone_cred_rele().
5269 5286                   */
5270 5287                  if (!refs_have_been_logged) {
5271 5288                          if (!log_refcounts) {
5272 5289                                  /*
5273 5290                                   * This thread hasn't timed out waiting on
5274 5291                                   * zone_destroy_cv yet.  Wait wait_time clock
5275 5292                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5276 5293                                   * seconds) for the zone's references to clear.
5277 5294                                   */
5278 5295                                  ASSERT(wait_time > 0);
5279 5296                                  wait_time = cv_reltimedwait_sig(
5280 5297                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5281 5298                                      TR_SEC);
5282 5299                                  if (wait_time > 0) {
5283 5300                                          /*
5284 5301                                           * A thread in zone_rele() or
5285 5302                                           * zone_cred_rele() signaled
5286 5303                                           * zone_destroy_cv before this thread's
5287 5304                                           * wait timed out.  The zone might have
5288 5305                                           * only one reference left; find out!
5289 5306                                           */
5290 5307                                          continue;
5291 5308                                  } else if (wait_time == 0) {
5292 5309                                          /* The thread's process was signaled. */
5293 5310                                          mutex_exit(&zonehash_lock);
5294 5311                                          return (set_errno(EINTR));
5295 5312                                  }
5296 5313  
5297 5314                                  /*
5298 5315                                   * The thread timed out while waiting on
5299 5316                                   * zone_destroy_cv.  Even though the thread
5300 5317                                   * timed out, it has to check whether another
5301 5318                                   * thread woke up from zone_destroy_cv and
5302 5319                                   * destroyed the zone.
5303 5320                                   *
5304 5321                                   * If the zone still exists and has more than
5305 5322                                   * one unreleased general-purpose reference,
5306 5323                                   * then log the zone's reference counts.
5307 5324                                   */
5308 5325                                  log_refcounts = B_TRUE;
5309 5326                                  continue;
5310 5327                          }
5311 5328  
5312 5329                          /*
5313 5330                           * The thread already timed out on zone_destroy_cv while
5314 5331                           * waiting for subsystems to release the zone's last
5315 5332                           * general-purpose references.  Log the zone's reference
5316 5333                           * counts and wait indefinitely on zone_destroy_cv.
5317 5334                           */
5318 5335                          zone_log_refcounts(zone);
5319 5336                  }
5320 5337                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5321 5338                          /* The thread's process was signaled. */
5322 5339                          mutex_exit(&zonehash_lock);
5323 5340                          return (set_errno(EINTR));
5324 5341                  }
5325 5342          }
5326 5343  
5327 5344          /*
5328 5345           * Remove CPU cap for this zone now since we're not going to
5329 5346           * fail below this point.
5330 5347           */
5331 5348          cpucaps_zone_remove(zone);
5332 5349  
5333 5350          /* Get rid of the zone's kstats */
5334 5351          zone_kstat_delete(zone);
5335 5352  
5336 5353          /* remove the pfexecd doors */
5337 5354          if (zone->zone_pfexecd != NULL) {
5338 5355                  klpd_freelist(&zone->zone_pfexecd);
5339 5356                  zone->zone_pfexecd = NULL;
5340 5357          }
5341 5358  
5342 5359          /* free brand specific data */
5343 5360          if (ZONE_IS_BRANDED(zone))
5344 5361                  ZBROP(zone)->b_free_brand_data(zone);
5345 5362  
5346 5363          /* Say goodbye to brand framework. */
5347 5364          brand_unregister_zone(zone->zone_brand);
5348 5365  
5349 5366          /*
5350 5367           * It is now safe to let the zone be recreated; remove it from the
5351 5368           * lists.  The memory will not be freed until the last cred
5352 5369           * reference goes away.
5353 5370           */
5354 5371          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
5355 5372          zonecount--;
5356 5373          /* remove from active list and hash tables */
5357 5374          list_remove(&zone_active, zone);
5358 5375          (void) mod_hash_destroy(zonehashbyname,
5359 5376              (mod_hash_key_t)zone->zone_name);
5360 5377          (void) mod_hash_destroy(zonehashbyid,
5361 5378              (mod_hash_key_t)(uintptr_t)zone->zone_id);
  
    | 
      ↓ open down ↓ | 
    1871 lines elided | 
    
      ↑ open up ↑ | 
  
5362 5379          if (zone->zone_flags & ZF_HASHED_LABEL)
5363 5380                  (void) mod_hash_destroy(zonehashbylabel,
5364 5381                      (mod_hash_key_t)zone->zone_slabel);
5365 5382          mutex_exit(&zonehash_lock);
5366 5383  
5367 5384          /*
5368 5385           * Release the root vnode; we're not using it anymore.  Nor should any
5369 5386           * other thread that might access it exist.
                    *
                    * The VZONEROOT flag is cleared from v_flag, under the vnode's
                    * v_lock, before the hold on the vnode is dropped.
5370 5387           */
5371 5388          if (zone->zone_rootvp != NULL) {
5372      -                VN_RELE(zone->zone_rootvp);
     5389 +                vnode_t *vp = zone->zone_rootvp;
     5390 +
     5391 +                mutex_enter(&vp->v_lock);
     5392 +                vp->v_flag &= ~VZONEROOT;
     5393 +                mutex_exit(&vp->v_lock);
     5394 +                VN_RELE(vp);
5373 5395                  zone->zone_rootvp = NULL;
5374 5396          }
5375 5397  
5376 5398          /* add to deathrow list */
5377 5399          mutex_enter(&zone_deathrow_lock);
5378 5400          list_insert_tail(&zone_deathrow, zone);
5379 5401          mutex_exit(&zone_deathrow_lock);
5380 5402  
5381 5403          /*
5382 5404           * Drop last reference (which was added by zsched()), this will
5383 5405           * free the zone unless there are outstanding cred references.
5384 5406           */
5385 5407          zone_rele(zone);
5386 5408          return (0);
5387 5409  }
5388 5410  
5389 5411  /*
5390 5412   * Systemcall entry point for zone_getattr(2).
5391 5413   */
5392 5414  static ssize_t
5393 5415  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5394 5416  {
5395 5417          size_t size;
5396 5418          int error = 0, err;
5397 5419          zone_t *zone;
5398 5420          char *zonepath;
5399 5421          char *outstr;
5400 5422          zone_status_t zone_status;
5401 5423          pid_t initpid;
5402 5424          boolean_t global = (curzone == global_zone);
5403 5425          boolean_t inzone = (curzone->zone_id == zoneid);
5404 5426          ushort_t flags;
5405 5427          zone_net_data_t *zbuf;
5406 5428  
5407 5429          mutex_enter(&zonehash_lock);
5408 5430          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5409 5431                  mutex_exit(&zonehash_lock);
5410 5432                  return (set_errno(EINVAL));
5411 5433          }
5412 5434          zone_status = zone_status_get(zone);
5413 5435          if (zone_status < ZONE_IS_INITIALIZED) {
5414 5436                  mutex_exit(&zonehash_lock);
5415 5437                  return (set_errno(EINVAL));
5416 5438          }
5417 5439          zone_hold(zone);
5418 5440          mutex_exit(&zonehash_lock);
5419 5441  
5420 5442          /*
5421 5443           * If not in the global zone, don't show information about other zones,
5422 5444           * unless the system is labeled and the local zone's label dominates
5423 5445           * the other zone.
5424 5446           */
5425 5447          if (!zone_list_access(zone)) {
5426 5448                  zone_rele(zone);
5427 5449                  return (set_errno(EINVAL));
5428 5450          }
5429 5451  
5430 5452          switch (attr) {
5431 5453          case ZONE_ATTR_ROOT:
5432 5454                  if (global) {
5433 5455                          /*
5434 5456                           * Copy the path to trim the trailing "/" (except for
5435 5457                           * the global zone).
5436 5458                           */
5437 5459                          if (zone != global_zone)
5438 5460                                  size = zone->zone_rootpathlen - 1;
5439 5461                          else
5440 5462                                  size = zone->zone_rootpathlen;
5441 5463                          zonepath = kmem_alloc(size, KM_SLEEP);
5442 5464                          bcopy(zone->zone_rootpath, zonepath, size);
5443 5465                          zonepath[size - 1] = '\0';
5444 5466                  } else {
5445 5467                          if (inzone || !is_system_labeled()) {
5446 5468                                  /*
5447 5469                                   * Caller is not in the global zone.
5448 5470                                   * if the query is on the current zone
5449 5471                                   * or the system is not labeled,
5450 5472                                   * just return faked-up path for current zone.
5451 5473                                   */
5452 5474                                  zonepath = "/";
5453 5475                                  size = 2;
5454 5476                          } else {
5455 5477                                  /*
5456 5478                                   * Return related path for current zone.
5457 5479                                   */
5458 5480                                  int prefix_len = strlen(zone_prefix);
5459 5481                                  int zname_len = strlen(zone->zone_name);
5460 5482  
5461 5483                                  size = prefix_len + zname_len + 1;
5462 5484                                  zonepath = kmem_alloc(size, KM_SLEEP);
5463 5485                                  bcopy(zone_prefix, zonepath, prefix_len);
5464 5486                                  bcopy(zone->zone_name, zonepath +
5465 5487                                      prefix_len, zname_len);
5466 5488                                  zonepath[size - 1] = '\0';
5467 5489                          }
5468 5490                  }
5469 5491                  if (bufsize > size)
5470 5492                          bufsize = size;
5471 5493                  if (buf != NULL) {
5472 5494                          err = copyoutstr(zonepath, buf, bufsize, NULL);
5473 5495                          if (err != 0 && err != ENAMETOOLONG)
5474 5496                                  error = EFAULT;
5475 5497                  }
5476 5498                  if (global || (is_system_labeled() && !inzone))
5477 5499                          kmem_free(zonepath, size);
5478 5500                  break;
5479 5501  
5480 5502          case ZONE_ATTR_NAME:
5481 5503                  size = strlen(zone->zone_name) + 1;
5482 5504                  if (bufsize > size)
5483 5505                          bufsize = size;
5484 5506                  if (buf != NULL) {
5485 5507                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
5486 5508                          if (err != 0 && err != ENAMETOOLONG)
5487 5509                                  error = EFAULT;
5488 5510                  }
5489 5511                  break;
5490 5512  
5491 5513          case ZONE_ATTR_STATUS:
5492 5514                  /*
5493 5515                   * Since we're not holding zonehash_lock, the zone status
5494 5516                   * may be anything; leave it up to userland to sort it out.
5495 5517                   */
5496 5518                  size = sizeof (zone_status);
5497 5519                  if (bufsize > size)
5498 5520                          bufsize = size;
5499 5521                  zone_status = zone_status_get(zone);
5500 5522                  if (buf != NULL &&
5501 5523                      copyout(&zone_status, buf, bufsize) != 0)
5502 5524                          error = EFAULT;
5503 5525                  break;
5504 5526          case ZONE_ATTR_FLAGS:
5505 5527                  size = sizeof (zone->zone_flags);
5506 5528                  if (bufsize > size)
5507 5529                          bufsize = size;
5508 5530                  flags = zone->zone_flags;
5509 5531                  if (buf != NULL &&
5510 5532                      copyout(&flags, buf, bufsize) != 0)
5511 5533                          error = EFAULT;
5512 5534                  break;
5513 5535          case ZONE_ATTR_PRIVSET:
5514 5536                  size = sizeof (priv_set_t);
5515 5537                  if (bufsize > size)
5516 5538                          bufsize = size;
5517 5539                  if (buf != NULL &&
5518 5540                      copyout(zone->zone_privset, buf, bufsize) != 0)
5519 5541                          error = EFAULT;
5520 5542                  break;
5521 5543          case ZONE_ATTR_UNIQID:
5522 5544                  size = sizeof (zone->zone_uniqid);
5523 5545                  if (bufsize > size)
5524 5546                          bufsize = size;
5525 5547                  if (buf != NULL &&
5526 5548                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
5527 5549                          error = EFAULT;
5528 5550                  break;
5529 5551          case ZONE_ATTR_POOLID:
5530 5552                  {
5531 5553                          pool_t *pool;
5532 5554                          poolid_t poolid;
5533 5555  
5534 5556                          if (pool_lock_intr() != 0) {
5535 5557                                  error = EINTR;
5536 5558                                  break;
5537 5559                          }
5538 5560                          pool = zone_pool_get(zone);
5539 5561                          poolid = pool->pool_id;
5540 5562                          pool_unlock();
5541 5563                          size = sizeof (poolid);
5542 5564                          if (bufsize > size)
5543 5565                                  bufsize = size;
5544 5566                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
5545 5567                                  error = EFAULT;
5546 5568                  }
5547 5569                  break;
5548 5570          case ZONE_ATTR_SLBL:
5549 5571                  size = sizeof (bslabel_t);
5550 5572                  if (bufsize > size)
5551 5573                          bufsize = size;
5552 5574                  if (zone->zone_slabel == NULL)
5553 5575                          error = EINVAL;
5554 5576                  else if (buf != NULL &&
5555 5577                      copyout(label2bslabel(zone->zone_slabel), buf,
5556 5578                      bufsize) != 0)
5557 5579                          error = EFAULT;
5558 5580                  break;
5559 5581          case ZONE_ATTR_INITPID:
5560 5582                  size = sizeof (initpid);
5561 5583                  if (bufsize > size)
5562 5584                          bufsize = size;
5563 5585                  initpid = zone->zone_proc_initpid;
5564 5586                  if (initpid == -1) {
5565 5587                          error = ESRCH;
5566 5588                          break;
5567 5589                  }
5568 5590                  if (buf != NULL &&
5569 5591                      copyout(&initpid, buf, bufsize) != 0)
5570 5592                          error = EFAULT;
5571 5593                  break;
5572 5594          case ZONE_ATTR_BRAND:
5573 5595                  size = strlen(zone->zone_brand->b_name) + 1;
5574 5596  
5575 5597                  if (bufsize > size)
5576 5598                          bufsize = size;
5577 5599                  if (buf != NULL) {
5578 5600                          err = copyoutstr(zone->zone_brand->b_name, buf,
5579 5601                              bufsize, NULL);
5580 5602                          if (err != 0 && err != ENAMETOOLONG)
5581 5603                                  error = EFAULT;
5582 5604                  }
5583 5605                  break;
5584 5606          case ZONE_ATTR_INITNAME:
5585 5607                  size = strlen(zone->zone_initname) + 1;
5586 5608                  if (bufsize > size)
5587 5609                          bufsize = size;
5588 5610                  if (buf != NULL) {
5589 5611                          err = copyoutstr(zone->zone_initname, buf, bufsize,
5590 5612                              NULL);
5591 5613                          if (err != 0 && err != ENAMETOOLONG)
5592 5614                                  error = EFAULT;
5593 5615                  }
5594 5616                  break;
5595 5617          case ZONE_ATTR_BOOTARGS:
5596 5618                  if (zone->zone_bootargs == NULL)
5597 5619                          outstr = "";
5598 5620                  else
5599 5621                          outstr = zone->zone_bootargs;
5600 5622                  size = strlen(outstr) + 1;
5601 5623                  if (bufsize > size)
5602 5624                          bufsize = size;
5603 5625                  if (buf != NULL) {
5604 5626                          err = copyoutstr(outstr, buf, bufsize, NULL);
5605 5627                          if (err != 0 && err != ENAMETOOLONG)
5606 5628                                  error = EFAULT;
5607 5629                  }
5608 5630                  break;
5609 5631          case ZONE_ATTR_PHYS_MCAP:
5610 5632                  size = sizeof (zone->zone_phys_mcap);
5611 5633                  if (bufsize > size)
5612 5634                          bufsize = size;
5613 5635                  if (buf != NULL &&
5614 5636                      copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5615 5637                          error = EFAULT;
5616 5638                  break;
5617 5639          case ZONE_ATTR_SCHED_CLASS:
5618 5640                  mutex_enter(&class_lock);
5619 5641  
5620 5642                  if (zone->zone_defaultcid >= loaded_classes)
5621 5643                          outstr = "";
5622 5644                  else
5623 5645                          outstr = sclass[zone->zone_defaultcid].cl_name;
5624 5646                  size = strlen(outstr) + 1;
5625 5647                  if (bufsize > size)
5626 5648                          bufsize = size;
5627 5649                  if (buf != NULL) {
5628 5650                          err = copyoutstr(outstr, buf, bufsize, NULL);
5629 5651                          if (err != 0 && err != ENAMETOOLONG)
5630 5652                                  error = EFAULT;
5631 5653                  }
5632 5654  
5633 5655                  mutex_exit(&class_lock);
5634 5656                  break;
5635 5657          case ZONE_ATTR_HOSTID:
5636 5658                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
5637 5659                      bufsize == sizeof (zone->zone_hostid)) {
5638 5660                          size = sizeof (zone->zone_hostid);
5639 5661                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
5640 5662                              bufsize) != 0)
5641 5663                                  error = EFAULT;
5642 5664                  } else {
5643 5665                          error = EINVAL;
5644 5666                  }
5645 5667                  break;
5646 5668          case ZONE_ATTR_FS_ALLOWED:
5647 5669                  if (zone->zone_fs_allowed == NULL)
5648 5670                          outstr = "";
5649 5671                  else
5650 5672                          outstr = zone->zone_fs_allowed;
5651 5673                  size = strlen(outstr) + 1;
5652 5674                  if (bufsize > size)
5653 5675                          bufsize = size;
5654 5676                  if (buf != NULL) {
5655 5677                          err = copyoutstr(outstr, buf, bufsize, NULL);
5656 5678                          if (err != 0 && err != ENAMETOOLONG)
5657 5679                                  error = EFAULT;
5658 5680                  }
5659 5681                  break;
5660 5682          case ZONE_ATTR_SECFLAGS:
5661 5683                  size = sizeof (zone->zone_secflags);
5662 5684                  if (bufsize > size)
5663 5685                          bufsize = size;
5664 5686                  if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
5665 5687                          error = EFAULT;
5666 5688                  break;
5667 5689          case ZONE_ATTR_NETWORK:
5668 5690                  bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
5669 5691                  size = bufsize;
5670 5692                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5671 5693                  if (copyin(buf, zbuf, bufsize) != 0) {
5672 5694                          error = EFAULT;
5673 5695                  } else {
5674 5696                          error = zone_get_network(zoneid, zbuf);
5675 5697                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5676 5698                                  error = EFAULT;
5677 5699                  }
5678 5700                  kmem_free(zbuf, bufsize);
5679 5701                  break;
5680 5702          default:
5681 5703                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5682 5704                          size = bufsize;
5683 5705                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5684 5706                  } else {
5685 5707                          error = EINVAL;
5686 5708                  }
5687 5709          }
5688 5710          zone_rele(zone);
5689 5711  
5690 5712          if (error)
5691 5713                  return (set_errno(error));
5692 5714          return ((ssize_t)size);
5693 5715  }
5694 5716  
5695 5717  /*
5696 5718   * Systemcall entry point for zone_setattr(2).
5697 5719   */
5698 5720  /*ARGSUSED*/
5699 5721  static int
5700 5722  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5701 5723  {
5702 5724          zone_t *zone;
5703 5725          zone_status_t zone_status;
5704 5726          int err = -1;
5705 5727          zone_net_data_t *zbuf;
5706 5728  
5707 5729          if (secpolicy_zone_config(CRED()) != 0)
5708 5730                  return (set_errno(EPERM));
5709 5731  
5710 5732          /*
5711 5733           * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5712 5734           * global zone.
5713 5735           */
5714 5736          if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5715 5737                  return (set_errno(EINVAL));
5716 5738          }
5717 5739  
5718 5740          mutex_enter(&zonehash_lock);
5719 5741          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5720 5742                  mutex_exit(&zonehash_lock);
5721 5743                  return (set_errno(EINVAL));
5722 5744          }
5723 5745          zone_hold(zone);
5724 5746          mutex_exit(&zonehash_lock);
5725 5747  
5726 5748          /*
5727 5749           * At present most attributes can only be set on non-running,
5728 5750           * non-global zones.
5729 5751           */
5730 5752          zone_status = zone_status_get(zone);
5731 5753          if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5732 5754                  err = EINVAL;
5733 5755                  goto done;
5734 5756          }
5735 5757  
5736 5758          switch (attr) {
5737 5759          case ZONE_ATTR_INITNAME:
5738 5760                  err = zone_set_initname(zone, (const char *)buf);
5739 5761                  break;
5740 5762          case ZONE_ATTR_INITNORESTART:
5741 5763                  zone->zone_restart_init = B_FALSE;
5742 5764                  err = 0;
5743 5765                  break;
5744 5766          case ZONE_ATTR_BOOTARGS:
5745 5767                  err = zone_set_bootargs(zone, (const char *)buf);
5746 5768                  break;
5747 5769          case ZONE_ATTR_BRAND:
5748 5770                  err = zone_set_brand(zone, (const char *)buf);
5749 5771                  break;
5750 5772          case ZONE_ATTR_FS_ALLOWED:
5751 5773                  err = zone_set_fs_allowed(zone, (const char *)buf);
5752 5774                  break;
5753 5775          case ZONE_ATTR_SECFLAGS:
5754 5776                  err = zone_set_secflags(zone, (psecflags_t *)buf);
5755 5777                  break;
5756 5778          case ZONE_ATTR_PHYS_MCAP:
5757 5779                  err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5758 5780                  break;
5759 5781          case ZONE_ATTR_SCHED_CLASS:
5760 5782                  err = zone_set_sched_class(zone, (const char *)buf);
5761 5783                  break;
5762 5784          case ZONE_ATTR_HOSTID:
5763 5785                  if (bufsize == sizeof (zone->zone_hostid)) {
5764 5786                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5765 5787                                  err = 0;
5766 5788                          else
5767 5789                                  err = EFAULT;
5768 5790                  } else {
5769 5791                          err = EINVAL;
5770 5792                  }
5771 5793                  break;
5772 5794          case ZONE_ATTR_NETWORK:
5773 5795                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5774 5796                          err = EINVAL;
5775 5797                          break;
5776 5798                  }
5777 5799                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5778 5800                  if (copyin(buf, zbuf, bufsize) != 0) {
5779 5801                          kmem_free(zbuf, bufsize);
5780 5802                          err = EFAULT;
5781 5803                          break;
5782 5804                  }
5783 5805                  err = zone_set_network(zoneid, zbuf);
5784 5806                  kmem_free(zbuf, bufsize);
5785 5807                  break;
5786 5808          default:
5787 5809                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5788 5810                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5789 5811                  else
5790 5812                          err = EINVAL;
5791 5813          }
5792 5814  
5793 5815  done:
5794 5816          zone_rele(zone);
5795 5817          ASSERT(err != -1);
5796 5818          return (err != 0 ? set_errno(err) : 0);
5797 5819  }
5798 5820  
5799 5821  /*
5800 5822   * Return zero if the process has at least one vnode mapped in to its
5801 5823   * address space which shouldn't be allowed to change zones.
5802 5824   *
5803 5825   * Also return zero if the process has any shared mappings which reserve
5804 5826   * swap.  This is because the counting for zone.max-swap does not allow swap
5805 5827   * reservation to be shared between zones.  zone swap reservation is counted
5806 5828   * on zone->zone_max_swap.
5807 5829   */
5808 5830  static int
5809 5831  as_can_change_zones(void)
5810 5832  {
5811 5833          proc_t *pp = curproc;
5812 5834          struct seg *seg;
5813 5835          struct as *as = pp->p_as;
5814 5836          vnode_t *vp;
5815 5837          int allow = 1;
5816 5838  
5817 5839          ASSERT(pp->p_as != &kas);
5818 5840          AS_LOCK_ENTER(as, RW_READER);
5819 5841          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5820 5842  
5821 5843                  /*
5822 5844                   * Cannot enter zone with shared anon memory which
5823 5845                   * reserves swap.  See comment above.
5824 5846                   */
5825 5847                  if (seg_can_change_zones(seg) == B_FALSE) {
5826 5848                          allow = 0;
5827 5849                          break;
5828 5850                  }
5829 5851                  /*
5830 5852                   * if we can't get a backing vnode for this segment then skip
5831 5853                   * it.
5832 5854                   */
5833 5855                  vp = NULL;
5834 5856                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5835 5857                          continue;
5836 5858                  if (!vn_can_change_zones(vp)) { /* bail on first match */
5837 5859                          allow = 0;
5838 5860                          break;
5839 5861                  }
5840 5862          }
5841 5863          AS_LOCK_EXIT(as);
5842 5864          return (allow);
5843 5865  }
5844 5866  
5845 5867  /*
5846 5868   * Count swap reserved by curproc's address space
5847 5869   */
5848 5870  static size_t
5849 5871  as_swresv(void)
5850 5872  {
5851 5873          proc_t *pp = curproc;
5852 5874          struct seg *seg;
5853 5875          struct as *as = pp->p_as;
5854 5876          size_t swap = 0;
5855 5877  
5856 5878          ASSERT(pp->p_as != &kas);
5857 5879          ASSERT(AS_WRITE_HELD(as));
5858 5880          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5859 5881                  swap += seg_swresv(seg);
5860 5882  
5861 5883          return (swap);
5862 5884  }
5863 5885  
5864 5886  /*
5865 5887   * Systemcall entry point for zone_enter().
5866 5888   *
5867 5889   * The current process is injected into said zone.  In the process
5868 5890   * it will change its project membership, privileges, rootdir/cwd,
5869 5891   * zone-wide rctls, and pool association to match those of the zone.
5870 5892   *
5871 5893   * The first zone_enter() called while the zone is in the ZONE_IS_READY
5872 5894   * state will transition it to ZONE_IS_RUNNING.  Processes may only
5873 5895   * enter a zone that is "ready" or "running".
            *
            * Returns 0 on success.  On failure the error is reported through
            * set_errno(); the codes assigned in this function are EPERM, EINVAL,
            * EINTR, EBADF and EFAULT, plus whatever pool_do_bind() returns when
            * binding to the zone's pool fails.
5874 5896   */
5875 5897  static int
5876 5898  zone_enter(zoneid_t zoneid)
5877 5899  {
5878 5900          zone_t *zone;
5879 5901          vnode_t *vp;
5880 5902          proc_t *pp = curproc;
5881 5903          contract_t *ct;
5882 5904          cont_process_t *ctp;
5883 5905          task_t *tk, *oldtk;
5884 5906          kproject_t *zone_proj0;
5885 5907          cred_t *cr, *newcr;
5886 5908          pool_t *oldpool, *newpool;
5887 5909          sess_t *sp;
5888 5910          uid_t uid;
5889 5911          zone_status_t status;
5890 5912          int err = 0;
5891 5913          rctl_entity_p_t e;
5892 5914          size_t swap;
5893 5915          kthread_id_t t;
5894 5916  
5895 5917          if (secpolicy_zone_config(CRED()) != 0)
5896 5918                  return (set_errno(EPERM));
5897 5919          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5898 5920                  return (set_errno(EINVAL));
5899 5921  
5900 5922          /*
5901 5923           * Stop all lwps so we don't need to hold a lock to look at
5902 5924           * curproc->p_zone.  This needs to happen before we grab any
5903 5925           * locks to avoid deadlock (another lwp in the process could
5904 5926           * be waiting for the held lock).
5905 5927           */
5906 5928          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5907 5929                  return (set_errno(EINTR));
5908 5930  
5909 5931          /*
5910 5932           * Make sure we're not changing zones with files open or mapped in
5911 5933           * to our address space which shouldn't be changing zones.
5912 5934           */
5913 5935          if (!files_can_change_zones()) {
5914 5936                  err = EBADF;
5915 5937                  goto out;
5916 5938          }
5917 5939          if (!as_can_change_zones()) {
5918 5940                  err = EFAULT;
5919 5941                  goto out;
5920 5942          }
5921 5943  
           /* Only a process currently in the global zone may enter a zone. */
5922 5944          mutex_enter(&zonehash_lock);
5923 5945          if (pp->p_zone != global_zone) {
5924 5946                  mutex_exit(&zonehash_lock);
5925 5947                  err = EINVAL;
5926 5948                  goto out;
5927 5949          }
5928 5950  
5929 5951          zone = zone_find_all_by_id(zoneid);
5930 5952          if (zone == NULL) {
5931 5953                  mutex_exit(&zonehash_lock);
5932 5954                  err = EINVAL;
5933 5955                  goto out;
5934 5956          }
5935 5957  
5936 5958          /*
5937 5959           * To prevent processes in a zone from holding contracts on
5938 5960           * extrazonal resources, and to avoid process contract
5939 5961           * memberships which span zones, contract holders and processes
5940 5962           * which aren't the sole members of their encapsulating process
5941 5963           * contracts are not allowed to zone_enter.
5942 5964           */
5943 5965          ctp = pp->p_ct_process;
5944 5966          ct = &ctp->conp_contract;
5945 5967          mutex_enter(&ct->ct_lock);
5946 5968          mutex_enter(&pp->p_lock);
5947 5969          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5948 5970                  mutex_exit(&pp->p_lock);
5949 5971                  mutex_exit(&ct->ct_lock);
5950 5972                  mutex_exit(&zonehash_lock);
5951 5973                  err = EINVAL;
5952 5974                  goto out;
5953 5975          }
5954 5976  
5955 5977          /*
5956 5978           * Moreover, we don't allow processes whose encapsulating
5957 5979           * process contracts have inherited extrazonal contracts.
5958 5980           * While it would be easier to eliminate all process contracts
5959 5981           * with inherited contracts, we need to be able to give a
5960 5982           * restarted init (or other zone-penetrating process) its
5961 5983           * predecessor's contracts.
5962 5984           */
5963 5985          if (ctp->conp_ninherited != 0) {
5964 5986                  contract_t *next;
5965 5987                  for (next = list_head(&ctp->conp_inherited); next;
5966 5988                      next = list_next(&ctp->conp_inherited, next)) {
5967 5989                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
5968 5990                                  mutex_exit(&pp->p_lock);
5969 5991                                  mutex_exit(&ct->ct_lock);
5970 5992                                  mutex_exit(&zonehash_lock);
5971 5993                                  err = EINVAL;
5972 5994                                  goto out;
5973 5995                          }
5974 5996                  }
5975 5997          }
5976 5998  
5977 5999          mutex_exit(&pp->p_lock);
5978 6000          mutex_exit(&ct->ct_lock);
5979 6001  
5980 6002          status = zone_status_get(zone);
5981 6003          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5982 6004                  /*
5983 6005                   * Can't join
5984 6006                   */
5985 6007                  mutex_exit(&zonehash_lock);
5986 6008                  err = EINVAL;
5987 6009                  goto out;
5988 6010          }
5989 6011  
5990 6012          /*
5991 6013           * Make sure new priv set is within the permitted set for caller
5992 6014           */
5993 6015          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5994 6016                  mutex_exit(&zonehash_lock);
5995 6017                  err = EPERM;
5996 6018                  goto out;
5997 6019          }
5998 6020          /*
5999 6021           * We want to momentarily drop zonehash_lock while we optimistically
6000 6022           * bind curproc to the pool it should be running in.  This is safe
6001 6023           * since the zone can't disappear (we have a hold on it).
6002 6024           */
6003 6025          zone_hold(zone);
6004 6026          mutex_exit(&zonehash_lock);
6005 6027  
6006 6028          /*
6007 6029           * Grab pool_lock to keep the pools configuration from changing
6008 6030           * and to stop ourselves from getting rebound to another pool
6009 6031           * until we join the zone.
6010 6032           */
6011 6033          if (pool_lock_intr() != 0) {
6012 6034                  zone_rele(zone);
6013 6035                  err = EINTR;
6014 6036                  goto out;
6015 6037          }
6016 6038          ASSERT(secpolicy_pool(CRED()) == 0);
6017 6039          /*
6018 6040           * Bind ourselves to the pool currently associated with the zone.
6019 6041           */
6020 6042          oldpool = curproc->p_pool;
6021 6043          newpool = zone_pool_get(zone);
6022 6044          if (pool_state == POOL_ENABLED && newpool != oldpool &&
6023 6045              (err = pool_do_bind(newpool, P_PID, P_MYID,
6024 6046              POOL_BIND_ALL)) != 0) {
6025 6047                  pool_unlock();
6026 6048                  zone_rele(zone);
6027 6049                  goto out;
6028 6050          }
6029 6051  
6030 6052          /*
6031 6053           * Grab cpu_lock now; we'll need it later when we call
6032 6054           * task_join().
6033 6055           */
6034 6056          mutex_enter(&cpu_lock);
6035 6057          mutex_enter(&zonehash_lock);
6036 6058          /*
6037 6059           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6038 6060           */
6039 6061          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6040 6062                  /*
6041 6063                   * Can't join anymore.
                    * Undo the optimistic pool binding made above before
                    * bailing out.
6042 6064                   */
6043 6065                  mutex_exit(&zonehash_lock);
6044 6066                  mutex_exit(&cpu_lock);
6045 6067                  if (pool_state == POOL_ENABLED &&
6046 6068                      newpool != oldpool)
6047 6069                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
6048 6070                              POOL_BIND_ALL);
6049 6071                  pool_unlock();
6050 6072                  zone_rele(zone);
6051 6073                  err = EINVAL;
6052 6074                  goto out;
6053 6075          }
6054 6076  
6055 6077          /*
6056 6078           * a_lock must be held while transferring locked memory and swap
6057 6079           * reservation from the global zone to the non global zone because
6058 6080           * asynchronous faults on the processes' address space can lock
6059 6081           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6060 6082           * segments respectively.
6061 6083           */
6062 6084          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6063 6085          swap = as_swresv();
6064 6086          mutex_enter(&pp->p_lock);
6065 6087          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6066 6088          /* charge our lwps, task and process to the zone; no limit is checked here */
6067 6089          mutex_enter(&zone->zone_nlwps_lock);
6068 6090          /* add new lwps to zone and zone's proj0 */
6069 6091          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6070 6092          zone->zone_nlwps += pp->p_lwpcnt;
6071 6093          /* add 1 task to zone's proj0 */
6072 6094          zone_proj0->kpj_ntasks += 1;
6073 6095  
6074 6096          zone_proj0->kpj_nprocs++;
6075 6097          zone->zone_nprocs++;
6076 6098          mutex_exit(&zone->zone_nlwps_lock);
6077 6099  
6078 6100          mutex_enter(&zone->zone_mem_lock);
6079 6101          zone->zone_locked_mem += pp->p_locked_mem;
6080 6102          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6081 6103          zone->zone_max_swap += swap;
6082 6104          mutex_exit(&zone->zone_mem_lock);
6083 6105  
6084 6106          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6085 6107          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6086 6108          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6087 6109  
6088 6110          /* remove lwps and process from proc's old zone and old project */
6089 6111          mutex_enter(&pp->p_zone->zone_nlwps_lock);
6090 6112          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6091 6113          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6092 6114          pp->p_task->tk_proj->kpj_nprocs--;
6093 6115          pp->p_zone->zone_nprocs--;
6094 6116          mutex_exit(&pp->p_zone->zone_nlwps_lock);
6095 6117  
6096 6118          mutex_enter(&pp->p_zone->zone_mem_lock);
6097 6119          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6098 6120          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6099 6121          pp->p_zone->zone_max_swap -= swap;
6100 6122          mutex_exit(&pp->p_zone->zone_mem_lock);
6101 6123  
6102 6124          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6103 6125          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6104 6126          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6105 6127  
6106 6128          pp->p_flag |= SZONETOP;
6107 6129          pp->p_zone = zone;
6108 6130          mutex_exit(&pp->p_lock);
6109 6131          AS_LOCK_EXIT(pp->p_as);
6110 6132  
6111 6133          /*
6112 6134           * Joining the zone cannot fail from now on.
6113 6135           *
6114 6136           * This means that a lot of the following code can be commonized and
6115 6137           * shared with zsched().
6116 6138           */
6117 6139  
6118 6140          /*
6119 6141           * If the process contract fmri was inherited, we need to
6120 6142           * flag this so that any contract status will not leak
6121 6143           * extra zone information, svc_fmri in this case
6122 6144           */
6123 6145          if (ctp->conp_svc_ctid != ct->ct_id) {
6124 6146                  mutex_enter(&ct->ct_lock);
6125 6147                  ctp->conp_svc_zone_enter = ct->ct_id;
6126 6148                  mutex_exit(&ct->ct_lock);
6127 6149          }
6128 6150  
6129 6151          /*
6130 6152           * Reset the encapsulating process contract's zone.
6131 6153           */
6132 6154          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6133 6155          contract_setzuniqid(ct, zone->zone_uniqid);
6134 6156  
6135 6157          /*
6136 6158           * Create a new task and associate the process with the project keyed
6137 6159           * by (projid,zoneid).
6138 6160           *
6139 6161           * We might as well be in project 0; the global zone's projid doesn't
6140 6162           * make much sense in a zone anyhow.
6141 6163           *
6142 6164           * This also increments zone_ntasks, and returns with p_lock held.
6143 6165           */
6144 6166          tk = task_create(0, zone);
6145 6167          oldtk = task_join(tk, 0);
6146 6168          mutex_exit(&cpu_lock);
6147 6169  
6148 6170          /*
6149 6171           * call RCTLOP_SET functions on this proc
6150 6172           */
6151 6173          e.rcep_p.zone = zone;
6152 6174          e.rcep_t = RCENTITY_ZONE;
6153 6175          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6154 6176              RCD_CALLBACK);
6155 6177          mutex_exit(&pp->p_lock);
6156 6178  
6157 6179          /*
6158 6180           * We don't need to hold any of zsched's locks here; not only do we know
6159 6181           * the process and zone aren't going away, we know its session isn't
6160 6182           * changing either.
6161 6183           *
6162 6184           * By joining zsched's session here, we mimic the behavior in the
6163 6185           * global zone of init's sid being the pid of sched.  We extend this
6164 6186           * to all zlogin-like zone_enter()'ing processes as well.
6165 6187           */
6166 6188          mutex_enter(&pidlock);
6167 6189          sp = zone->zone_zsched->p_sessp;
6168 6190          sess_hold(zone->zone_zsched);
6169 6191          mutex_enter(&pp->p_lock);
6170 6192          pgexit(pp);
6171 6193          sess_rele(pp->p_sessp, B_TRUE);
6172 6194          pp->p_sessp = sp;
6173 6195          pgjoin(pp, zone->zone_zsched->p_pidp);
6174 6196  
6175 6197          /*
6176 6198           * If any threads are scheduled to be placed on zone wait queue they
6177 6199           * should abandon the idea since the wait queue is changing.
6178 6200           * We need to be holding pidlock & p_lock to do this.
6179 6201           */
6180 6202          if ((t = pp->p_tlist) != NULL) {
6181 6203                  do {
6182 6204                          thread_lock(t);
6183 6205                          /*
6184 6206                           * Kick this thread so that it doesn't sit
6185 6207                           * on a wrong wait queue.
6186 6208                           */
6187 6209                          if (ISWAITING(t))
6188 6210                                  setrun_locked(t);
6189 6211  
6190 6212                          if (t->t_schedflag & TS_ANYWAITQ)
6191 6213                                  t->t_schedflag &= ~ TS_ANYWAITQ;
6192 6214  
6193 6215                          thread_unlock(t);
6194 6216                  } while ((t = t->t_forw) != pp->p_tlist);
6195 6217          }
6196 6218  
6197 6219          /*
6198 6220           * If there is a default scheduling class for the zone and it is not
6199 6221           * the class we are currently in, change all of the threads in the
6200 6222           * process to the new class.  We need to be holding pidlock & p_lock
6201 6223           * when we call parmsset so this is a good place to do it.
6202 6224           */
6203 6225          if (zone->zone_defaultcid > 0 &&
6204 6226              zone->zone_defaultcid != curthread->t_cid) {
6205 6227                  pcparms_t pcparms;
6206 6228  
6207 6229                  pcparms.pc_cid = zone->zone_defaultcid;
6208 6230                  pcparms.pc_clparms[0] = 0;
6209 6231  
6210 6232                  /*
6211 6233                   * If setting the class fails, we still want to enter the zone.
6212 6234                   */
6213 6235                  if ((t = pp->p_tlist) != NULL) {
6214 6236                          do {
6215 6237                                  (void) parmsset(&pcparms, t);
6216 6238                          } while ((t = t->t_forw) != pp->p_tlist);
6217 6239                  }
6218 6240          }
6219 6241  
6220 6242          mutex_exit(&pp->p_lock);
6221 6243          mutex_exit(&pidlock);
6222 6244  
6223 6245          mutex_exit(&zonehash_lock);
6224 6246          /*
6225 6247           * We're firmly in the zone; let pools progress.
6226 6248           */
6227 6249          pool_unlock();
6228 6250          task_rele(oldtk);
6229 6251          /*
6230 6252           * We don't need to retain a hold on the zone since we already
6231 6253           * incremented zone_ntasks, so the zone isn't going anywhere.
6232 6254           */
6233 6255          zone_rele(zone);
6234 6256  
6235 6257          /*
6236 6258           * Chroot
                    * Both the current directory and the root directory are
                    * pointed at the zone's root vnode.
6237 6259           */
6238 6260          vp = zone->zone_rootvp;
6239 6261          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6240 6262          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6241 6263  
6242 6264          /*
6243 6265           * Change process security flags.  Note that the _effective_ flags
6244 6266           * cannot change
6245 6267           */
6246 6268          secflags_copy(&pp->p_secflags.psf_lower,
6247 6269              &zone->zone_secflags.psf_lower);
6248 6270          secflags_copy(&pp->p_secflags.psf_upper,
6249 6271              &zone->zone_secflags.psf_upper);
6250 6272          secflags_copy(&pp->p_secflags.psf_inherit,
6251 6273              &zone->zone_secflags.psf_inherit);
6252 6274  
6253 6275          /*
6254 6276           * Change process credentials
6255 6277           */
6256 6278          newcr = cralloc();
6257 6279          mutex_enter(&pp->p_crlock);
6258 6280          cr = pp->p_cred;
6259 6281          crcopy_to(cr, newcr);
6260 6282          crsetzone(newcr, zone);
6261 6283          pp->p_cred = newcr;
6262 6284  
6263 6285          /*
6264 6286           * Restrict all process privilege sets to zone limit
6265 6287           */
6266 6288          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6267 6289          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6268 6290          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6269 6291          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6270 6292          mutex_exit(&pp->p_crlock);
6271 6293          crset(pp, newcr);
6272 6294  
6273 6295          /*
6274 6296           * Adjust upcount to reflect zone entry.
6275 6297           */
6276 6298          uid = crgetruid(newcr);
6277 6299          mutex_enter(&pidlock);
6278 6300          upcount_dec(uid, GLOBAL_ZONEID);
6279 6301          upcount_inc(uid, zoneid);
6280 6302          mutex_exit(&pidlock);
6281 6303  
6282 6304          /*
6283 6305           * Set up core file path and content.
6284 6306           */
6285 6307          set_core_defaults();
6286 6308  
6287 6309  out:
6288 6310          /*
6289 6311           * Let the other lwps continue.
                    * They were stopped by holdlwps() at the top of this function
                    * unless we are the agent lwp, so both the success path and
                    * every error path funnel through here.
6290 6312           */
6291 6313          mutex_enter(&pp->p_lock);
6292 6314          if (curthread != pp->p_agenttp)
6293 6315                  continuelwps(pp);
6294 6316          mutex_exit(&pp->p_lock);
6295 6317  
6296 6318          return (err != 0 ? set_errno(err) : 0);
6297 6319  }
6298 6320  
6299 6321  /*
6300 6322   * Systemcall entry point for zone_list(2).
6301 6323   *
6302 6324   * Processes running in a (non-global) zone only see themselves.
6303 6325   * On labeled systems, they see all zones whose label they dominate.
6304 6326   */
6305 6327  static int
6306 6328  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
6307 6329  {
6308 6330          zoneid_t *zoneids;
6309 6331          zone_t *zone, *myzone;
6310 6332          uint_t user_nzones, real_nzones;
6311 6333          uint_t domi_nzones;
6312 6334          int error;
6313 6335  
6314 6336          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
6315 6337                  return (set_errno(EFAULT));
6316 6338  
6317 6339          myzone = curproc->p_zone;
6318 6340          if (myzone != global_zone) {
6319 6341                  bslabel_t *mybslab;
6320 6342  
6321 6343                  if (!is_system_labeled()) {
6322 6344                          /* just return current zone */
6323 6345                          real_nzones = domi_nzones = 1;
6324 6346                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
6325 6347                          zoneids[0] = myzone->zone_id;
6326 6348                  } else {
6327 6349                          /* return all zones that are dominated */
6328 6350                          mutex_enter(&zonehash_lock);
6329 6351                          real_nzones = zonecount;
6330 6352                          domi_nzones = 0;
6331 6353                          if (real_nzones > 0) {
6332 6354                                  zoneids = kmem_alloc(real_nzones *
6333 6355                                      sizeof (zoneid_t), KM_SLEEP);
6334 6356                                  mybslab = label2bslabel(myzone->zone_slabel);
6335 6357                                  for (zone = list_head(&zone_active);
6336 6358                                      zone != NULL;
6337 6359                                      zone = list_next(&zone_active, zone)) {
6338 6360                                          if (zone->zone_id == GLOBAL_ZONEID)
6339 6361                                                  continue;
6340 6362                                          if (zone != myzone &&
6341 6363                                              (zone->zone_flags & ZF_IS_SCRATCH))
6342 6364                                                  continue;
6343 6365                                          /*
6344 6366                                           * Note that a label always dominates
6345 6367                                           * itself, so myzone is always included
6346 6368                                           * in the list.
6347 6369                                           */
6348 6370                                          if (bldominates(mybslab,
6349 6371                                              label2bslabel(zone->zone_slabel))) {
6350 6372                                                  zoneids[domi_nzones++] =
6351 6373                                                      zone->zone_id;
6352 6374                                          }
6353 6375                                  }
6354 6376                          }
6355 6377                          mutex_exit(&zonehash_lock);
6356 6378                  }
6357 6379          } else {
6358 6380                  mutex_enter(&zonehash_lock);
6359 6381                  real_nzones = zonecount;
6360 6382                  domi_nzones = 0;
6361 6383                  if (real_nzones > 0) {
6362 6384                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
6363 6385                              KM_SLEEP);
6364 6386                          for (zone = list_head(&zone_active); zone != NULL;
6365 6387                              zone = list_next(&zone_active, zone))
6366 6388                                  zoneids[domi_nzones++] = zone->zone_id;
6367 6389                          ASSERT(domi_nzones == real_nzones);
6368 6390                  }
6369 6391                  mutex_exit(&zonehash_lock);
6370 6392          }
6371 6393  
6372 6394          /*
6373 6395           * If user has allocated space for fewer entries than we found, then
6374 6396           * return only up to their limit.  Either way, tell them exactly how
6375 6397           * many we found.
6376 6398           */
6377 6399          if (domi_nzones < user_nzones)
6378 6400                  user_nzones = domi_nzones;
6379 6401          error = 0;
6380 6402          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
6381 6403                  error = EFAULT;
6382 6404          } else if (zoneidlist != NULL && user_nzones != 0) {
6383 6405                  if (copyout(zoneids, zoneidlist,
6384 6406                      user_nzones * sizeof (zoneid_t)) != 0)
6385 6407                          error = EFAULT;
6386 6408          }
6387 6409  
6388 6410          if (real_nzones > 0)
6389 6411                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
6390 6412  
6391 6413          if (error != 0)
6392 6414                  return (set_errno(error));
6393 6415          else
6394 6416                  return (0);
6395 6417  }
6396 6418  
6397 6419  /*
6398 6420   * Systemcall entry point for zone_lookup(2).
6399 6421   *
6400 6422   * Non-global zones are only able to see themselves and (on labeled systems)
6401 6423   * the zones they dominate.
6402 6424   */
6403 6425  static zoneid_t
6404 6426  zone_lookup(const char *zone_name)
6405 6427  {
6406 6428          char *kname;
6407 6429          zone_t *zone;
6408 6430          zoneid_t zoneid;
6409 6431          int err;
6410 6432  
6411 6433          if (zone_name == NULL) {
6412 6434                  /* return caller's zone id */
6413 6435                  return (getzoneid());
6414 6436          }
6415 6437  
6416 6438          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
6417 6439          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
6418 6440                  kmem_free(kname, ZONENAME_MAX);
6419 6441                  return (set_errno(err));
6420 6442          }
6421 6443  
6422 6444          mutex_enter(&zonehash_lock);
6423 6445          zone = zone_find_all_by_name(kname);
6424 6446          kmem_free(kname, ZONENAME_MAX);
6425 6447          /*
6426 6448           * In a non-global zone, can only lookup global and own name.
6427 6449           * In Trusted Extensions zone label dominance rules apply.
6428 6450           */
6429 6451          if (zone == NULL ||
6430 6452              zone_status_get(zone) < ZONE_IS_READY ||
6431 6453              !zone_list_access(zone)) {
6432 6454                  mutex_exit(&zonehash_lock);
6433 6455                  return (set_errno(EINVAL));
6434 6456          } else {
6435 6457                  zoneid = zone->zone_id;
6436 6458                  mutex_exit(&zonehash_lock);
6437 6459                  return (zoneid);
6438 6460          }
6439 6461  }
6440 6462  
6441 6463  static int
6442 6464  zone_version(int *version_arg)
6443 6465  {
6444 6466          int version = ZONE_SYSCALL_API_VERSION;
6445 6467  
6446 6468          if (copyout(&version, version_arg, sizeof (int)) != 0)
6447 6469                  return (set_errno(EFAULT));
6448 6470          return (0);
6449 6471  }
6450 6472  
6451 6473  /* ARGSUSED */
6452 6474  long
6453 6475  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
6454 6476  {
6455 6477          zone_def zs;
6456 6478          int err;
6457 6479  
6458 6480          switch (cmd) {
6459 6481          case ZONE_CREATE:
6460 6482                  if (get_udatamodel() == DATAMODEL_NATIVE) {
6461 6483                          if (copyin(arg1, &zs, sizeof (zone_def))) {
6462 6484                                  return (set_errno(EFAULT));
6463 6485                          }
6464 6486                  } else {
6465 6487  #ifdef _SYSCALL32_IMPL
6466 6488                          zone_def32 zs32;
6467 6489  
6468 6490                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
6469 6491                                  return (set_errno(EFAULT));
6470 6492                          }
6471 6493                          zs.zone_name =
6472 6494                              (const char *)(unsigned long)zs32.zone_name;
6473 6495                          zs.zone_root =
6474 6496                              (const char *)(unsigned long)zs32.zone_root;
6475 6497                          zs.zone_privs =
6476 6498                              (const struct priv_set *)
6477 6499                              (unsigned long)zs32.zone_privs;
6478 6500                          zs.zone_privssz = zs32.zone_privssz;
6479 6501                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6480 6502                          zs.rctlbufsz = zs32.rctlbufsz;
6481 6503                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6482 6504                          zs.zfsbufsz = zs32.zfsbufsz;
6483 6505                          zs.extended_error =
6484 6506                              (int *)(unsigned long)zs32.extended_error;
6485 6507                          zs.match = zs32.match;
6486 6508                          zs.doi = zs32.doi;
6487 6509                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6488 6510                          zs.flags = zs32.flags;
6489 6511  #else
6490 6512                          panic("get_udatamodel() returned bogus result\n");
6491 6513  #endif
6492 6514                  }
6493 6515  
6494 6516                  return (zone_create(zs.zone_name, zs.zone_root,
6495 6517                      zs.zone_privs, zs.zone_privssz,
6496 6518                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6497 6519                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6498 6520                      zs.extended_error, zs.match, zs.doi,
6499 6521                      zs.label, zs.flags));
6500 6522          case ZONE_BOOT:
6501 6523                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
6502 6524          case ZONE_DESTROY:
6503 6525                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6504 6526          case ZONE_GETATTR:
6505 6527                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6506 6528                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6507 6529          case ZONE_SETATTR:
6508 6530                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6509 6531                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6510 6532          case ZONE_ENTER:
6511 6533                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
6512 6534          case ZONE_LIST:
6513 6535                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6514 6536          case ZONE_SHUTDOWN:
6515 6537                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6516 6538          case ZONE_LOOKUP:
6517 6539                  return (zone_lookup((const char *)arg1));
6518 6540          case ZONE_VERSION:
6519 6541                  return (zone_version((int *)arg1));
6520 6542          case ZONE_ADD_DATALINK:
6521 6543                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
6522 6544                      (datalink_id_t)(uintptr_t)arg2));
6523 6545          case ZONE_DEL_DATALINK:
6524 6546                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
6525 6547                      (datalink_id_t)(uintptr_t)arg2));
6526 6548          case ZONE_CHECK_DATALINK: {
6527 6549                  zoneid_t        zoneid;
6528 6550                  boolean_t       need_copyout;
6529 6551  
6530 6552                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
6531 6553                          return (EFAULT);
6532 6554                  need_copyout = (zoneid == ALL_ZONES);
6533 6555                  err = zone_check_datalink(&zoneid,
6534 6556                      (datalink_id_t)(uintptr_t)arg2);
6535 6557                  if (err == 0 && need_copyout) {
6536 6558                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
6537 6559                                  err = EFAULT;
6538 6560                  }
6539 6561                  return (err == 0 ? 0 : set_errno(err));
6540 6562          }
6541 6563          case ZONE_LIST_DATALINK:
6542 6564                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
6543 6565                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
6544 6566          default:
6545 6567                  return (set_errno(EINVAL));
6546 6568          }
6547 6569  }
6548 6570  
6549 6571  struct zarg {	/* hand-off from zone_kadmin() to zone_ki_call_zoneadmd() */
6550 6572          zone_t *zone;	/* zone to act on; held by zone_kadmin(), released by the thread */
6551 6573          zone_cmd_arg_t arg;	/* request delivered to zoneadmd through its door */
6552 6574  };
6553 6575  
6554 6576  static int
6555 6577  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
6556 6578  {
6557 6579          char *buf;
6558 6580          size_t buflen;
6559 6581          int error;
6560 6582  
6561 6583          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
6562 6584          buf = kmem_alloc(buflen, KM_SLEEP);
6563 6585          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
6564 6586          error = door_ki_open(buf, doorp);
6565 6587          kmem_free(buf, buflen);
6566 6588          return (error);
6567 6589  }
6568 6590  
6569 6591  static void
6570 6592  zone_release_door(door_handle_t *doorp)
6571 6593  {
6572 6594          door_ki_rele(*doorp);
6573 6595          *doorp = NULL;
6574 6596  }
6575 6597  
6576 6598  static void
6577 6599  zone_ki_call_zoneadmd(struct zarg *zargp)
6578 6600  {
6579 6601          door_handle_t door = NULL;
6580 6602          door_arg_t darg, save_arg;
6581 6603          char *zone_name;
6582 6604          size_t zone_namelen;
6583 6605          zoneid_t zoneid;
6584 6606          zone_t *zone;
6585 6607          zone_cmd_arg_t arg;
6586 6608          uint64_t uniqid;
6587 6609          size_t size;
6588 6610          int error;
6589 6611          int retry;
6590 6612  
6591 6613          zone = zargp->zone;
6592 6614          arg = zargp->arg;
6593 6615          kmem_free(zargp, sizeof (*zargp));
6594 6616  
6595 6617          zone_namelen = strlen(zone->zone_name) + 1;
6596 6618          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6597 6619          bcopy(zone->zone_name, zone_name, zone_namelen);
6598 6620          zoneid = zone->zone_id;
6599 6621          uniqid = zone->zone_uniqid;
6600 6622          /*
6601 6623           * zoneadmd may be down, but at least we can empty out the zone.
6602 6624           * We can ignore the return value of zone_empty() since we're called
6603 6625           * from a kernel thread and know we won't be delivered any signals.
6604 6626           */
6605 6627          ASSERT(curproc == &p0);
6606 6628          (void) zone_empty(zone);
6607 6629          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6608 6630          zone_rele(zone);
6609 6631  
6610 6632          size = sizeof (arg);
6611 6633          darg.rbuf = (char *)&arg;
6612 6634          darg.data_ptr = (char *)&arg;
6613 6635          darg.rsize = size;
6614 6636          darg.data_size = size;
6615 6637          darg.desc_ptr = NULL;
6616 6638          darg.desc_num = 0;
6617 6639  
6618 6640          save_arg = darg;
6619 6641          /*
6620 6642           * Since we're not holding a reference to the zone, any number of
6621 6643           * things can go wrong, including the zone disappearing before we get a
6622 6644           * chance to talk to zoneadmd.
6623 6645           */
6624 6646          for (retry = 0; /* forever */; retry++) {
6625 6647                  if (door == NULL &&
6626 6648                      (error = zone_lookup_door(zone_name, &door)) != 0) {
6627 6649                          goto next;
6628 6650                  }
6629 6651                  ASSERT(door != NULL);
6630 6652  
6631 6653                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
6632 6654                      SIZE_MAX, 0)) == 0) {
6633 6655                          break;
6634 6656                  }
6635 6657                  switch (error) {
6636 6658                  case EINTR:
6637 6659                          /* FALLTHROUGH */
6638 6660                  case EAGAIN:    /* process may be forking */
6639 6661                          /*
6640 6662                           * Back off for a bit
6641 6663                           */
6642 6664                          break;
6643 6665                  case EBADF:
6644 6666                          zone_release_door(&door);
6645 6667                          if (zone_lookup_door(zone_name, &door) != 0) {
6646 6668                                  /*
6647 6669                                   * zoneadmd may be dead, but it may come back to
6648 6670                                   * life later.
6649 6671                                   */
6650 6672                                  break;
6651 6673                          }
6652 6674                          break;
6653 6675                  default:
6654 6676                          cmn_err(CE_WARN,
6655 6677                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
6656 6678                              error);
6657 6679                          goto out;
6658 6680                  }
6659 6681  next:
6660 6682                  /*
6661 6683                   * If this isn't the same zone_t that we originally had in mind,
6662 6684                   * then this is the same as if two kadmin requests come in at
6663 6685                   * the same time: the first one wins.  This means we lose, so we
6664 6686                   * bail.
6665 6687                   */
6666 6688                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
6667 6689                          /*
6668 6690                           * Problem is solved.
6669 6691                           */
6670 6692                          break;
6671 6693                  }
6672 6694                  if (zone->zone_uniqid != uniqid) {
6673 6695                          /*
6674 6696                           * zoneid recycled
6675 6697                           */
6676 6698                          zone_rele(zone);
6677 6699                          break;
6678 6700                  }
6679 6701                  /*
6680 6702                   * We could zone_status_timedwait(), but there doesn't seem to
6681 6703                   * be much point in doing that (plus, it would mean that
6682 6704                   * zone_free() isn't called until this thread exits).
6683 6705                   */
6684 6706                  zone_rele(zone);
6685 6707                  delay(hz);
6686 6708                  darg = save_arg;
6687 6709          }
6688 6710  out:
6689 6711          if (door != NULL) {
6690 6712                  zone_release_door(&door);
6691 6713          }
6692 6714          kmem_free(zone_name, zone_namelen);
6693 6715          thread_exit();
6694 6716  }
6695 6717  
6696 6718  /*
6697 6719   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
6698 6720   * kadmin().  The caller is a process in the zone.
6699 6721   *
6700 6722   * In order to shutdown the zone, we will hand off control to zoneadmd
6701 6723   * (running in the global zone) via a door.  We do a half-hearted job at
6702 6724   * killing all processes in the zone, create a kernel thread to contact
6703 6725   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
6704 6726   * a form of generation number used to let zoneadmd (as well as
6705 6727   * zone_destroy()) know exactly which zone they're re talking about.
6706 6728   */
6707 6729  int
6708 6730  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
6709 6731  {
6710 6732          struct zarg *zargp;
6711 6733          zone_cmd_t zcmd;
6712 6734          zone_t *zone;
6713 6735  
6714 6736          zone = curproc->p_zone;
6715 6737          ASSERT(getzoneid() != GLOBAL_ZONEID);
6716 6738  
6717 6739          switch (cmd) {
6718 6740          case A_SHUTDOWN:
6719 6741                  switch (fcn) {
6720 6742                  case AD_HALT:
6721 6743                  case AD_POWEROFF:
6722 6744                          zcmd = Z_HALT;
6723 6745                          break;
6724 6746                  case AD_BOOT:
6725 6747                          zcmd = Z_REBOOT;
6726 6748                          break;
6727 6749                  case AD_IBOOT:
6728 6750                  case AD_SBOOT:
6729 6751                  case AD_SIBOOT:
6730 6752                  case AD_NOSYNC:
6731 6753                          return (ENOTSUP);
6732 6754                  default:
6733 6755                          return (EINVAL);
6734 6756                  }
6735 6757                  break;
6736 6758          case A_REBOOT:
6737 6759                  zcmd = Z_REBOOT;
6738 6760                  break;
6739 6761          case A_FTRACE:
6740 6762          case A_REMOUNT:
6741 6763          case A_FREEZE:
6742 6764          case A_DUMP:
6743 6765          case A_CONFIG:
6744 6766                  return (ENOTSUP);
6745 6767          default:
6746 6768                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
6747 6769                  return (EINVAL);
6748 6770          }
6749 6771  
6750 6772          if (secpolicy_zone_admin(credp, B_FALSE))
6751 6773                  return (EPERM);
6752 6774          mutex_enter(&zone_status_lock);
6753 6775  
6754 6776          /*
6755 6777           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6756 6778           * is in the zone.
6757 6779           */
6758 6780          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6759 6781          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6760 6782                  /*
6761 6783                   * This zone is already on its way down.
6762 6784                   */
6763 6785                  mutex_exit(&zone_status_lock);
6764 6786                  return (0);
6765 6787          }
6766 6788          /*
6767 6789           * Prevent future zone_enter()s
6768 6790           */
6769 6791          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6770 6792          mutex_exit(&zone_status_lock);
6771 6793  
6772 6794          /*
6773 6795           * Kill everyone now and call zoneadmd later.
6774 6796           * zone_ki_call_zoneadmd() will do a more thorough job of this
6775 6797           * later.
6776 6798           */
6777 6799          killall(zone->zone_id);
6778 6800          /*
6779 6801           * Now, create the thread to contact zoneadmd and do the rest of the
6780 6802           * work.  This thread can't be created in our zone otherwise
6781 6803           * zone_destroy() would deadlock.
6782 6804           */
6783 6805          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6784 6806          zargp->arg.cmd = zcmd;
6785 6807          zargp->arg.uniqid = zone->zone_uniqid;
6786 6808          zargp->zone = zone;
6787 6809          (void) strcpy(zargp->arg.locale, "C");
6788 6810          /* mdep was already copied in for us by uadmin */
6789 6811          if (mdep != NULL)
6790 6812                  (void) strlcpy(zargp->arg.bootbuf, mdep,
6791 6813                      sizeof (zargp->arg.bootbuf));
6792 6814          zone_hold(zone);
6793 6815  
6794 6816          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6795 6817              TS_RUN, minclsyspri);
6796 6818          exit(CLD_EXITED, 0);
6797 6819  
6798 6820          return (EINVAL);
6799 6821  }
6800 6822  
6801 6823  /*
6802 6824   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6803 6825   * status to ZONE_IS_SHUTTING_DOWN.
6804 6826   *
6805 6827   * This function also shuts down all running zones to ensure that they won't
6806 6828   * fork new processes.
6807 6829   */
6808 6830  void
6809 6831  zone_shutdown_global(void)
6810 6832  {
6811 6833          zone_t *current_zonep;
6812 6834  
6813 6835          ASSERT(INGLOBALZONE(curproc));
6814 6836          mutex_enter(&zonehash_lock);
6815 6837          mutex_enter(&zone_status_lock);
6816 6838  
6817 6839          /* Modify the global zone's status first. */
6818 6840          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6819 6841          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6820 6842  
6821 6843          /*
6822 6844           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6823 6845           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6824 6846           * could cause assertions to fail (e.g., assertions about a zone's
6825 6847           * state during initialization, readying, or booting) or produce races.
6826 6848           * We'll let threads continue to initialize and ready new zones: they'll
6827 6849           * fail to boot the new zones when they see that the global zone is
6828 6850           * shutting down.
6829 6851           */
6830 6852          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6831 6853              current_zonep = list_next(&zone_active, current_zonep)) {
6832 6854                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6833 6855                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6834 6856          }
6835 6857          mutex_exit(&zone_status_lock);
6836 6858          mutex_exit(&zonehash_lock);
6837 6859  }
6838 6860  
6839 6861  /*
6840 6862   * Returns true if the named dataset is visible in the current zone.
6841 6863   * The 'write' parameter is set to 1 if the dataset is also writable.
6842 6864   */
6843 6865  int
6844 6866  zone_dataset_visible(const char *dataset, int *write)
6845 6867  {
6846 6868          static int zfstype = -1;
6847 6869          zone_dataset_t *zd;
6848 6870          size_t len;
6849 6871          zone_t *zone = curproc->p_zone;
6850 6872          const char *name = NULL;
6851 6873          vfs_t *vfsp = NULL;
6852 6874  
6853 6875          if (dataset[0] == '\0')
6854 6876                  return (0);
6855 6877  
6856 6878          /*
6857 6879           * Walk the list once, looking for datasets which match exactly, or
6858 6880           * specify a dataset underneath an exported dataset.  If found, return
6859 6881           * true and note that it is writable.
6860 6882           */
6861 6883          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6862 6884              zd = list_next(&zone->zone_datasets, zd)) {
6863 6885  
6864 6886                  len = strlen(zd->zd_dataset);
6865 6887                  if (strlen(dataset) >= len &&
6866 6888                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6867 6889                      (dataset[len] == '\0' || dataset[len] == '/' ||
6868 6890                      dataset[len] == '@')) {
6869 6891                          if (write)
6870 6892                                  *write = 1;
6871 6893                          return (1);
6872 6894                  }
6873 6895          }
6874 6896  
6875 6897          /*
6876 6898           * Walk the list a second time, searching for datasets which are parents
6877 6899           * of exported datasets.  These should be visible, but read-only.
6878 6900           *
6879 6901           * Note that we also have to support forms such as 'pool/dataset/', with
6880 6902           * a trailing slash.
6881 6903           */
6882 6904          for (zd = list_head(&zone->zone_datasets); zd != NULL;
6883 6905              zd = list_next(&zone->zone_datasets, zd)) {
6884 6906  
6885 6907                  len = strlen(dataset);
6886 6908                  if (dataset[len - 1] == '/')
6887 6909                          len--;  /* Ignore trailing slash */
6888 6910                  if (len < strlen(zd->zd_dataset) &&
6889 6911                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
6890 6912                      zd->zd_dataset[len] == '/') {
6891 6913                          if (write)
6892 6914                                  *write = 0;
6893 6915                          return (1);
6894 6916                  }
6895 6917          }
6896 6918  
6897 6919          /*
6898 6920           * We reach here if the given dataset is not found in the zone_dataset
6899 6921           * list. Check if this dataset was added as a filesystem (ie. "add fs")
6900 6922           * instead of delegation. For this we search for the dataset in the
6901 6923           * zone_vfslist of this zone. If found, return true and note that it is
6902 6924           * not writable.
6903 6925           */
6904 6926  
6905 6927          /*
6906 6928           * Initialize zfstype if it is not initialized yet.
6907 6929           */
6908 6930          if (zfstype == -1) {
6909 6931                  struct vfssw *vswp = vfs_getvfssw("zfs");
6910 6932                  zfstype = vswp - vfssw;
6911 6933                  vfs_unrefvfssw(vswp);
6912 6934          }
6913 6935  
6914 6936          vfs_list_read_lock();
6915 6937          vfsp = zone->zone_vfslist;
6916 6938          do {
6917 6939                  ASSERT(vfsp);
6918 6940                  if (vfsp->vfs_fstype == zfstype) {
6919 6941                          name = refstr_value(vfsp->vfs_resource);
6920 6942  
6921 6943                          /*
6922 6944                           * Check if we have an exact match.
6923 6945                           */
6924 6946                          if (strcmp(dataset, name) == 0) {
6925 6947                                  vfs_list_unlock();
6926 6948                                  if (write)
6927 6949                                          *write = 0;
6928 6950                                  return (1);
6929 6951                          }
6930 6952                          /*
6931 6953                           * We need to check if we are looking for parents of
6932 6954                           * a dataset. These should be visible, but read-only.
6933 6955                           */
6934 6956                          len = strlen(dataset);
6935 6957                          if (dataset[len - 1] == '/')
6936 6958                                  len--;
6937 6959  
6938 6960                          if (len < strlen(name) &&
6939 6961                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
6940 6962                                  vfs_list_unlock();
6941 6963                                  if (write)
6942 6964                                          *write = 0;
6943 6965                                  return (1);
6944 6966                          }
6945 6967                  }
6946 6968                  vfsp = vfsp->vfs_zone_next;
6947 6969          } while (vfsp != zone->zone_vfslist);
6948 6970  
6949 6971          vfs_list_unlock();
6950 6972          return (0);
6951 6973  }
6952 6974  
6953 6975  /*
6954 6976   * zone_find_by_any_path() -
6955 6977   *
6956 6978   * kernel-private routine similar to zone_find_by_path(), but which
6957 6979   * effectively compares against zone paths rather than zonerootpath
6958 6980   * (i.e., the last component of zonerootpaths, which should be "root/",
6959 6981   * are not compared.)  This is done in order to accurately identify all
6960 6982   * paths, whether zone-visible or not, including those which are parallel
6961 6983   * to /root/, such as /dev/, /home/, etc...
6962 6984   *
6963 6985   * If the specified path does not fall under any zone path then global
6964 6986   * zone is returned.
6965 6987   *
6966 6988   * The treat_abs parameter indicates whether the path should be treated as
6967 6989   * an absolute path although it does not begin with "/".  (This supports
6968 6990   * nfs mount syntax such as host:any/path.)
6969 6991   *
6970 6992   * The caller is responsible for zone_rele of the returned zone.
6971 6993   */
6972 6994  zone_t *
6973 6995  zone_find_by_any_path(const char *path, boolean_t treat_abs)
6974 6996  {
6975 6997          zone_t *zone;
6976 6998          int path_offset = 0;
6977 6999  
6978 7000          if (path == NULL) {
6979 7001                  zone_hold(global_zone);
6980 7002                  return (global_zone);
6981 7003          }
6982 7004  
6983 7005          if (*path != '/') {
6984 7006                  ASSERT(treat_abs);
6985 7007                  path_offset = 1;
6986 7008          }
6987 7009  
6988 7010          mutex_enter(&zonehash_lock);
6989 7011          for (zone = list_head(&zone_active); zone != NULL;
6990 7012              zone = list_next(&zone_active, zone)) {
6991 7013                  char    *c;
6992 7014                  size_t  pathlen;
6993 7015                  char *rootpath_start;
6994 7016  
6995 7017                  if (zone == global_zone)        /* skip global zone */
6996 7018                          continue;
6997 7019  
6998 7020                  /* scan backwards to find start of last component */
6999 7021                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7000 7022                  do {
7001 7023                          c--;
7002 7024                  } while (*c != '/');
7003 7025  
7004 7026                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
7005 7027                  rootpath_start = (zone->zone_rootpath + path_offset);
7006 7028                  if (strncmp(path, rootpath_start, pathlen) == 0)
7007 7029                          break;
7008 7030          }
7009 7031          if (zone == NULL)
7010 7032                  zone = global_zone;
7011 7033          zone_hold(zone);
7012 7034          mutex_exit(&zonehash_lock);
7013 7035          return (zone);
7014 7036  }
7015 7037  
7016 7038  /*
7017 7039   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7018 7040   * zone_dl_t pointer if found, and NULL otherwise.
7019 7041   */
7020 7042  static zone_dl_t *
7021 7043  zone_find_dl(zone_t *zone, datalink_id_t linkid)
7022 7044  {
7023 7045          zone_dl_t *zdl;
7024 7046  
7025 7047          ASSERT(mutex_owned(&zone->zone_lock));
7026 7048          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7027 7049              zdl = list_next(&zone->zone_dl_list, zdl)) {
7028 7050                  if (zdl->zdl_id == linkid)
7029 7051                          break;
7030 7052          }
7031 7053          return (zdl);
7032 7054  }
7033 7055  
7034 7056  static boolean_t
7035 7057  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7036 7058  {
7037 7059          boolean_t exists;
7038 7060  
7039 7061          mutex_enter(&zone->zone_lock);
7040 7062          exists = (zone_find_dl(zone, linkid) != NULL);
7041 7063          mutex_exit(&zone->zone_lock);
7042 7064          return (exists);
7043 7065  }
7044 7066  
7045 7067  /*
7046 7068   * Add an data link name for the zone.
7047 7069   */
7048 7070  static int
7049 7071  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7050 7072  {
7051 7073          zone_dl_t *zdl;
7052 7074          zone_t *zone;
7053 7075          zone_t *thiszone;
7054 7076  
7055 7077          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7056 7078                  return (set_errno(ENXIO));
7057 7079  
7058 7080          /* Verify that the datalink ID doesn't already belong to a zone. */
7059 7081          mutex_enter(&zonehash_lock);
7060 7082          for (zone = list_head(&zone_active); zone != NULL;
7061 7083              zone = list_next(&zone_active, zone)) {
7062 7084                  if (zone_dl_exists(zone, linkid)) {
7063 7085                          mutex_exit(&zonehash_lock);
7064 7086                          zone_rele(thiszone);
7065 7087                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7066 7088                  }
7067 7089          }
7068 7090  
7069 7091          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7070 7092          zdl->zdl_id = linkid;
7071 7093          zdl->zdl_net = NULL;
7072 7094          mutex_enter(&thiszone->zone_lock);
7073 7095          list_insert_head(&thiszone->zone_dl_list, zdl);
7074 7096          mutex_exit(&thiszone->zone_lock);
7075 7097          mutex_exit(&zonehash_lock);
7076 7098          zone_rele(thiszone);
7077 7099          return (0);
7078 7100  }
7079 7101  
7080 7102  static int
7081 7103  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7082 7104  {
7083 7105          zone_dl_t *zdl;
7084 7106          zone_t *zone;
7085 7107          int err = 0;
7086 7108  
7087 7109          if ((zone = zone_find_by_id(zoneid)) == NULL)
7088 7110                  return (set_errno(EINVAL));
7089 7111  
7090 7112          mutex_enter(&zone->zone_lock);
7091 7113          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7092 7114                  err = ENXIO;
7093 7115          } else {
7094 7116                  list_remove(&zone->zone_dl_list, zdl);
7095 7117                  nvlist_free(zdl->zdl_net);
7096 7118                  kmem_free(zdl, sizeof (zone_dl_t));
7097 7119          }
7098 7120          mutex_exit(&zone->zone_lock);
7099 7121          zone_rele(zone);
7100 7122          return (err == 0 ? 0 : set_errno(err));
7101 7123  }
7102 7124  
7103 7125  /*
7104 7126   * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
7105 7127   * the linkid.  Otherwise we just check if the specified zoneidp has been
7106 7128   * assigned the supplied linkid.
7107 7129   */
7108 7130  int
7109 7131  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7110 7132  {
7111 7133          zone_t *zone;
7112 7134          int err = ENXIO;
7113 7135  
7114 7136          if (*zoneidp != ALL_ZONES) {
7115 7137                  if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
7116 7138                          if (zone_dl_exists(zone, linkid))
7117 7139                                  err = 0;
7118 7140                          zone_rele(zone);
7119 7141                  }
7120 7142                  return (err);
7121 7143          }
7122 7144  
7123 7145          mutex_enter(&zonehash_lock);
7124 7146          for (zone = list_head(&zone_active); zone != NULL;
7125 7147              zone = list_next(&zone_active, zone)) {
7126 7148                  if (zone_dl_exists(zone, linkid)) {
7127 7149                          *zoneidp = zone->zone_id;
7128 7150                          err = 0;
7129 7151                          break;
7130 7152                  }
7131 7153          }
7132 7154          mutex_exit(&zonehash_lock);
7133 7155          return (err);
7134 7156  }
7135 7157  
7136 7158  /*
7137 7159   * Get the list of datalink IDs assigned to a zone.
7138 7160   *
7139 7161   * On input, *nump is the number of datalink IDs that can fit in the supplied
7140 7162   * idarray.  Upon return, *nump is either set to the number of datalink IDs
7141 7163   * that were placed in the array if the array was large enough, or to the
7142 7164   * number of datalink IDs that the function needs to place in the array if the
7143 7165   * array is too small.
7144 7166   */
7145 7167  static int
7146 7168  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7147 7169  {
7148 7170          uint_t num, dlcount;
7149 7171          zone_t *zone;
7150 7172          zone_dl_t *zdl;
7151 7173          datalink_id_t *idptr = idarray;
7152 7174  
7153 7175          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7154 7176                  return (set_errno(EFAULT));
7155 7177          if ((zone = zone_find_by_id(zoneid)) == NULL)
7156 7178                  return (set_errno(ENXIO));
7157 7179  
7158 7180          num = 0;
7159 7181          mutex_enter(&zone->zone_lock);
7160 7182          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7161 7183              zdl = list_next(&zone->zone_dl_list, zdl)) {
7162 7184                  /*
7163 7185                   * If the list is bigger than what the caller supplied, just
7164 7186                   * count, don't do copyout.
7165 7187                   */
7166 7188                  if (++num > dlcount)
7167 7189                          continue;
7168 7190                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7169 7191                          mutex_exit(&zone->zone_lock);
7170 7192                          zone_rele(zone);
7171 7193                          return (set_errno(EFAULT));
7172 7194                  }
7173 7195                  idptr++;
7174 7196          }
7175 7197          mutex_exit(&zone->zone_lock);
7176 7198          zone_rele(zone);
7177 7199  
7178 7200          /* Increased or decreased, caller should be notified. */
7179 7201          if (num != dlcount) {
7180 7202                  if (copyout(&num, nump, sizeof (num)) != 0)
7181 7203                          return (set_errno(EFAULT));
7182 7204          }
7183 7205          return (0);
7184 7206  }
7185 7207  
7186 7208  /*
7187 7209   * Public interface for looking up a zone by zoneid. It's a customized version
7188 7210   * for netstack_zone_create(). It can only be called from the zsd create
7189 7211   * callbacks, since it doesn't have reference on the zone structure hence if
7190 7212   * it is called elsewhere the zone could disappear after the zonehash_lock
7191 7213   * is dropped.
7192 7214   *
7193 7215   * Furthermore it
7194 7216   * 1. Doesn't check the status of the zone.
7195 7217   * 2. It will be called even before zone_init is called, in that case the
7196 7218   *    address of zone0 is returned directly, and netstack_zone_create()
7197 7219   *    will only assign a value to zone0.zone_netstack, won't break anything.
7198 7220   * 3. Returns without the zone being held.
7199 7221   */
7200 7222  zone_t *
7201 7223  zone_find_by_id_nolock(zoneid_t zoneid)
7202 7224  {
7203 7225          zone_t *zone;
7204 7226  
7205 7227          mutex_enter(&zonehash_lock);
7206 7228          if (zonehashbyid == NULL)
7207 7229                  zone = &zone0;
7208 7230          else
7209 7231                  zone = zone_find_all_by_id(zoneid);
7210 7232          mutex_exit(&zonehash_lock);
7211 7233          return (zone);
7212 7234  }
7213 7235  
7214 7236  /*
7215 7237   * Walk the datalinks for a given zone
7216 7238   */
7217 7239  int
7218 7240  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
7219 7241      void *data)
7220 7242  {
7221 7243          zone_t          *zone;
7222 7244          zone_dl_t       *zdl;
7223 7245          datalink_id_t   *idarray;
7224 7246          uint_t          idcount = 0;
7225 7247          int             i, ret = 0;
7226 7248  
7227 7249          if ((zone = zone_find_by_id(zoneid)) == NULL)
7228 7250                  return (ENOENT);
7229 7251  
7230 7252          /*
7231 7253           * We first build an array of linkid's so that we can walk these and
7232 7254           * execute the callback with the zone_lock dropped.
7233 7255           */
7234 7256          mutex_enter(&zone->zone_lock);
7235 7257          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7236 7258              zdl = list_next(&zone->zone_dl_list, zdl)) {
7237 7259                  idcount++;
7238 7260          }
7239 7261  
7240 7262          if (idcount == 0) {
7241 7263                  mutex_exit(&zone->zone_lock);
7242 7264                  zone_rele(zone);
7243 7265                  return (0);
7244 7266          }
7245 7267  
7246 7268          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
7247 7269          if (idarray == NULL) {
7248 7270                  mutex_exit(&zone->zone_lock);
7249 7271                  zone_rele(zone);
7250 7272                  return (ENOMEM);
7251 7273          }
7252 7274  
7253 7275          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7254 7276              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
7255 7277                  idarray[i] = zdl->zdl_id;
7256 7278          }
7257 7279  
7258 7280          mutex_exit(&zone->zone_lock);
7259 7281  
7260 7282          for (i = 0; i < idcount && ret == 0; i++) {
7261 7283                  if ((ret = (*cb)(idarray[i], data)) != 0)
7262 7284                          break;
7263 7285          }
7264 7286  
7265 7287          zone_rele(zone);
7266 7288          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
7267 7289          return (ret);
7268 7290  }
7269 7291  
7270 7292  static char *
7271 7293  zone_net_type2name(int type)
7272 7294  {
7273 7295          switch (type) {
7274 7296          case ZONE_NETWORK_ADDRESS:
7275 7297                  return (ZONE_NET_ADDRNAME);
7276 7298          case ZONE_NETWORK_DEFROUTER:
7277 7299                  return (ZONE_NET_RTRNAME);
7278 7300          default:
7279 7301                  return (NULL);
7280 7302          }
7281 7303  }
7282 7304  
7283 7305  static int
7284 7306  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7285 7307  {
7286 7308          zone_t *zone;
7287 7309          zone_dl_t *zdl;
7288 7310          nvlist_t *nvl;
7289 7311          int err = 0;
7290 7312          uint8_t *new = NULL;
7291 7313          char *nvname;
7292 7314          int bufsize;
7293 7315          datalink_id_t linkid = znbuf->zn_linkid;
7294 7316  
7295 7317          if (secpolicy_zone_config(CRED()) != 0)
7296 7318                  return (set_errno(EPERM));
7297 7319  
7298 7320          if (zoneid == GLOBAL_ZONEID)
7299 7321                  return (set_errno(EINVAL));
7300 7322  
7301 7323          nvname = zone_net_type2name(znbuf->zn_type);
7302 7324          bufsize = znbuf->zn_len;
7303 7325          new = znbuf->zn_val;
7304 7326          if (nvname == NULL)
7305 7327                  return (set_errno(EINVAL));
7306 7328  
7307 7329          if ((zone = zone_find_by_id(zoneid)) == NULL) {
7308 7330                  return (set_errno(EINVAL));
7309 7331          }
7310 7332  
7311 7333          mutex_enter(&zone->zone_lock);
7312 7334          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7313 7335                  err = ENXIO;
7314 7336                  goto done;
7315 7337          }
7316 7338          if ((nvl = zdl->zdl_net) == NULL) {
7317 7339                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
7318 7340                          err = ENOMEM;
7319 7341                          goto done;
7320 7342                  } else {
7321 7343                          zdl->zdl_net = nvl;
7322 7344                  }
7323 7345          }
7324 7346          if (nvlist_exists(nvl, nvname)) {
7325 7347                  err = EINVAL;
7326 7348                  goto done;
7327 7349          }
7328 7350          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
7329 7351          ASSERT(err == 0);
7330 7352  done:
7331 7353          mutex_exit(&zone->zone_lock);
7332 7354          zone_rele(zone);
7333 7355          if (err != 0)
7334 7356                  return (set_errno(err));
7335 7357          else
7336 7358                  return (0);
7337 7359  }
7338 7360  
7339 7361  static int
7340 7362  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
7341 7363  {
7342 7364          zone_t *zone;
7343 7365          zone_dl_t *zdl;
7344 7366          nvlist_t *nvl;
7345 7367          uint8_t *ptr;
7346 7368          uint_t psize;
7347 7369          int err = 0;
7348 7370          char *nvname;
7349 7371          int bufsize;
7350 7372          void *buf;
7351 7373          datalink_id_t linkid = znbuf->zn_linkid;
7352 7374  
7353 7375          if (zoneid == GLOBAL_ZONEID)
7354 7376                  return (set_errno(EINVAL));
7355 7377  
7356 7378          nvname = zone_net_type2name(znbuf->zn_type);
7357 7379          bufsize = znbuf->zn_len;
7358 7380          buf = znbuf->zn_val;
7359 7381  
7360 7382          if (nvname == NULL)
7361 7383                  return (set_errno(EINVAL));
7362 7384          if ((zone = zone_find_by_id(zoneid)) == NULL)
7363 7385                  return (set_errno(EINVAL));
7364 7386  
7365 7387          mutex_enter(&zone->zone_lock);
7366 7388          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7367 7389                  err = ENXIO;
7368 7390                  goto done;
7369 7391          }
7370 7392          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
7371 7393                  err = ENOENT;
7372 7394                  goto done;
7373 7395          }
7374 7396          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
7375 7397          ASSERT(err == 0);
7376 7398  
7377 7399          if (psize > bufsize) {
7378 7400                  err = ENOBUFS;
7379 7401                  goto done;
7380 7402          }
7381 7403          znbuf->zn_len = psize;
7382 7404          bcopy(ptr, buf, psize);
7383 7405  done:
7384 7406          mutex_exit(&zone->zone_lock);
7385 7407          zone_rele(zone);
7386 7408          if (err != 0)
7387 7409                  return (set_errno(err));
7388 7410          else
7389 7411                  return (0);
7390 7412  }
  
    | 
      ↓ open down ↓ | 
    2008 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX