VZONEROOT Wdiff usr/src/uts/common/os/zone.c

Print this page

Add VZONEROOT flag because not all zone roots have VROOT set.

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2019, Joyent, Inc.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  27   27   */
  28   28  
  29   29  /*
  30   30   * Zones
  31   31   *
  32   32   *   A zone is a named collection of processes, namespace constraints,
  33   33   *   and other system resources which comprise a secure and manageable
  34   34   *   application containment facility.
  35   35   *
  36   36   *   Zones (represented by the reference counted zone_t) are tracked in
  37   37   *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  38   38   *   (zoneid_t) are used to track zone association.  Zone IDs are
  39   39   *   dynamically generated when the zone is created; if a persistent
  40   40   *   identifier is needed (core files, accounting logs, audit trail,
  41   41   *   etc.), the zone name should be used.
  42   42   *
  43   43   *
  44   44   *   Global Zone:
  45   45   *
  46   46   *   The global zone (zoneid 0) is automatically associated with all
  47   47   *   system resources that have not been bound to a user-created zone.
  48   48   *   This means that even systems where zones are not in active use
  49   49   *   have a global zone, and all processes, mounts, etc. are
  50   50   *   associated with that zone.  The global zone is generally
  51   51   *   unconstrained in terms of privileges and access, though the usual
  52   52   *   credential and privilege based restrictions apply.
  53   53   *
  54   54   *
  55   55   *   Zone States:
  56   56   *
  57   57   *   The states that a zone may be in and the transitions are as
  58   58   *   follows:
  59   59   *
  60   60   *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
  61   61   *   initialized zone is added to the list of active zones on the system but
  62   62   *   isn't accessible.
  63   63   *
  64   64   *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
  65   65   *   not yet completed. Not possible to enter the zone, but attributes can
  66   66   *   be retrieved.
  67   67   *
  68   68   *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  69   69   *   ready.  The zone is made visible after the ZSD constructor callbacks are
  70   70   *   executed.  A zone remains in this state until it transitions into
  71   71   *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
  72   72   *
  73   73   *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
  74   74   *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
  75   75   *   state.
  76   76   *
  77   77   *   ZONE_IS_RUNNING: The zone is open for business: zsched has
  78   78   *   successfully started init.   A zone remains in this state until
  79   79   *   zone_shutdown() is called.
  80   80   *
  81   81   *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
  82   82   *   killing all processes running in the zone. The zone remains
  83   83   *   in this state until there are no more user processes running in the zone.
  84   84   *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
  85   85   *   Since zone_shutdown() is restartable, it may be called successfully
  86   86   *   multiple times for the same zone_t.  Setting of the zone's state to
  87   87   *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
  88   88   *   the zone's status without worrying about it being a moving target.
  89   89   *
  90   90   *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
  91   91   *   are no more user processes in the zone.  The zone remains in this
  92   92   *   state until there are no more kernel threads associated with the
  93   93   *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
  94   94   *   fail.
  95   95   *
  96   96   *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
  97   97   *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
  98   98   *   join the zone or create kernel threads therein.
  99   99   *
 100  100   *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 101  101   *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 102  102   *   return NULL from now on.
 103  103   *
 104  104   *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 105  105   *   processes or threads doing work on behalf of the zone.  The zone is
 106  106   *   removed from the list of active zones.  zone_destroy() returns, and
 107  107   *   the zone can be recreated.
 108  108   *
 109  109   *   ZONE_IS_FREE (internal state): All references have been dropped and
 110  110   *   the zone_t is no longer in the zone_active nor zone_deathrow lists.
 111  111   *   The zone_t is in the process of being freed.  This state exists
 112  112   *   only for publishing a sysevent to indicate that the zone by this
 113  113   *   name can be booted again.
 114  114   *
 115  115   *   Threads can wait for the zone to enter a requested state (other than
 116  116   *   ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait()
 117  117   *   with the desired state passed in as an argument.  Zone state transitions
 118  118   *   are uni-directional; it is not possible to move back to an earlier state.
 119  119   *
 120  120   *
 121  121   *   Zone-Specific Data:
 122  122   *
 123  123   *   Subsystems needing to maintain zone-specific data can store that
 124  124   *   data using the ZSD mechanism.  This provides a zone-specific data
 125  125   *   store, similar to thread-specific data (see pthread_getspecific(3C)
 126  126   *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 127  127   *   to register callbacks to be invoked when a zone is created, shut
 128  128   *   down, or destroyed.  This can be used to initialize zone-specific
 129  129   *   data for new zones and to clean up when zones go away.
 130  130   *
 131  131   *
 132  132   *   Data Structures:
 133  133   *
 134  134   *   The per-zone structure (zone_t) is reference counted, and freed
 135  135   *   when all references are released.  zone_hold and zone_rele can be
 136  136   *   used to adjust the reference count.  In addition, reference counts
 137  137   *   associated with the cred_t structure are tracked separately using
 138  138   *   zone_cred_hold and zone_cred_rele.
 139  139   *
 140  140   *   Pointers to active zone_t's are stored in two hash tables; one
 141  141   *   for searching by id, the other for searching by name.  Lookups
 142  142   *   can be performed on either basis, using zone_find_by_id and
 143  143   *   zone_find_by_name.  Both return zone_t pointers with the zone
 144  144   *   held, so zone_rele should be called when the pointer is no longer
 145  145   *   needed.  Zones can also be searched by path; zone_find_by_path
 146  146   *   returns the zone with which a path name is associated (global
 147  147   *   zone if the path is not within some other zone's file system
 148  148   *   hierarchy).  This currently requires iterating through each zone,
 149  149   *   so it is slower than an id or name search via a hash table.
 150  150   *
 151  151   *
 152  152   *   Locking:
 153  153   *
 154  154   *   zonehash_lock: This is a top-level global lock used to protect the
 155  155   *       zone hash tables and lists.  Zones cannot be created or destroyed
 156  156   *       while this lock is held.
 157  157   *   zone_status_lock: This is a global lock protecting zone state.
 158  158   *       Zones cannot change state while this lock is held.  It also
 159  159   *       protects the list of kernel threads associated with a zone.
 160  160   *   zone_lock: This is a per-zone lock used to protect several fields of
 161  161   *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 162  162   *       this lock means that the zone cannot go away.
 163  163   *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 164  164   *       related to the zone.max-lwps rctl.
 165  165   *   zone_mem_lock: This is a per-zone lock used to protect the fields
 166  166   *       related to the zone.max-locked-memory and zone.max-swap rctls.
 167  167   *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 168  168   *       currently just max_lofi
 169  169   *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 170  170   *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 171  171   *       list (a list of zones in the ZONE_IS_DEAD state).
 172  172   *
 173  173   *   Ordering requirements:
 174  174   *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 175  175   *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 176  176   *
 177  177   *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 178  178   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 179  179   *      zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 180  180   *
 181  181   *   Blocking memory allocations are permitted while holding any of the
 182  182   *   zone locks.
 183  183   *
 184  184   *
 185  185   *   System Call Interface:
 186  186   *
 187  187   *   The zone subsystem can be managed and queried from user level with
 188  188   *   the following system calls (all subcodes of the primary "zone"
 189  189   *   system call):
 190  190   *   - zone_create: creates a zone with selected attributes (name,
 191  191   *     root path, privileges, resource controls, ZFS datasets)
 192  192   *   - zone_enter: allows the current process to enter a zone
 193  193   *   - zone_getattr: reports attributes of a zone
 194  194   *   - zone_setattr: set attributes of a zone
 195  195   *   - zone_boot: set 'init' running for the zone
 196  196   *   - zone_list: lists all zones active in the system
 197  197   *   - zone_lookup: looks up zone id based on name
 198  198   *   - zone_shutdown: initiates shutdown process (see states above)
 199  199   *   - zone_destroy: completes shutdown process (see states above)
 200  200   *
 201  201   */
 202  202  
 203  203  #include <sys/priv_impl.h>
 204  204  #include <sys/cred.h>
 205  205  #include <c2/audit.h>
 206  206  #include <sys/debug.h>
 207  207  #include <sys/file.h>
 208  208  #include <sys/kmem.h>
 209  209  #include <sys/kstat.h>
 210  210  #include <sys/mutex.h>
 211  211  #include <sys/note.h>
 212  212  #include <sys/pathname.h>
 213  213  #include <sys/proc.h>
 214  214  #include <sys/project.h>
 215  215  #include <sys/sysevent.h>
 216  216  #include <sys/task.h>
 217  217  #include <sys/systm.h>
 218  218  #include <sys/types.h>
 219  219  #include <sys/utsname.h>
 220  220  #include <sys/vnode.h>
 221  221  #include <sys/vfs.h>
 222  222  #include <sys/systeminfo.h>
 223  223  #include <sys/policy.h>
 224  224  #include <sys/cred_impl.h>
 225  225  #include <sys/contract_impl.h>
 226  226  #include <sys/contract/process_impl.h>
 227  227  #include <sys/class.h>
 228  228  #include <sys/pool.h>
 229  229  #include <sys/pool_pset.h>
 230  230  #include <sys/pset.h>
 231  231  #include <sys/strlog.h>
 232  232  #include <sys/sysmacros.h>
 233  233  #include <sys/callb.h>
 234  234  #include <sys/vmparam.h>
 235  235  #include <sys/corectl.h>
 236  236  #include <sys/ipc_impl.h>
 237  237  #include <sys/klpd.h>
 238  238  
 239  239  #include <sys/door.h>
 240  240  #include <sys/cpuvar.h>
 241  241  #include <sys/sdt.h>
 242  242  
 243  243  #include <sys/uadmin.h>
 244  244  #include <sys/session.h>
 245  245  #include <sys/cmn_err.h>
 246  246  #include <sys/modhash.h>
 247  247  #include <sys/sunddi.h>
 248  248  #include <sys/nvpair.h>
 249  249  #include <sys/rctl.h>
 250  250  #include <sys/fss.h>
 251  251  #include <sys/brand.h>
 252  252  #include <sys/zone.h>
 253  253  #include <net/if.h>
 254  254  #include <sys/cpucaps.h>
 255  255  #include <vm/seg.h>
 256  256  #include <sys/mac.h>
 257  257  #include <sys/rt.h>
 258  258  #include <sys/fx.h>
 259  259  
 260  260  /*
 261  261   * This constant specifies the number of seconds that threads waiting for
 262  262   * subsystems to release a zone's general-purpose references will wait before
 263  263   * they log the zone's reference counts.  The constant's value shouldn't
 264  264   * be so small that reference counts are unnecessarily reported for zones
 265  265   * whose references are slowly released.  On the other hand, it shouldn't be so
 266  266   * large that users reboot their systems out of frustration over hung zones
 267  267   * before the system logs the zones' reference counts.
 268  268   */
 269  269  #define ZONE_DESTROY_TIMEOUT_SECS       60
 270  270  
 271  271  /* Entry in the list of data link IDs which are accessible from the zone */
 272  272  typedef struct zone_dl {
 273  273          datalink_id_t   zdl_id;         /* data link ID */
 274  274          nvlist_t        *zdl_net;       /* link's net data (assumed; confirm) */
 275  275          list_node_t     zdl_linkage;    /* list linkage */
 276  276  } zone_dl_t;
 277  277  
 278  278  /*
 279  279   * cv used to signal that all references to the zone have been released.  This
 280  280   * needs to be global since there may be multiple waiters, and the first to
 281  281   * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 282  282   */
 283  283  static kcondvar_t zone_destroy_cv;
 284  284  /*
 285  285   * Lock used to serialize access to zone_cv.  This could have been per-zone,
 286  286   * but then we'd need another lock for zone_destroy_cv, and why bother?
 287  287   */
 288  288  static kmutex_t zone_status_lock;
 289  289  
 290  290  /*
 291  291   * ZSD-related global variables.
 292  292   */
 293  293  static kmutex_t zsd_key_lock;   /* protects the following two */
 294  294  /*
 295  295   * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 296  296   */
 297  297  static zone_key_t zsd_keyval = 0;
 298  298  /*
 299  299   * Global list of registered keys.  We use this when a new zone is created.
 300  300   */
 301  301  static list_t zsd_registered_keys;
 302  302  
 303  303  int zone_hash_size = 256;
 304  304  static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
 305  305  static kmutex_t zonehash_lock;
 306  306  static uint_t zonecount;
 307  307  static id_space_t *zoneid_space;
 308  308  
 309  309  /*
 310  310   * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 311  311   * kernel proper runs, and which manages all other zones.
 312  312   *
 313  313   * Although not declared as static, the variable "zone0" should not be used
 314  314   * except for by code that needs to reference the global zone early on in boot,
 315  315   * before it is fully initialized.  All other consumers should use
 316  316   * 'global_zone'.
 317  317   */
 318  318  zone_t zone0;
 319  319  zone_zfs_io_t zone0_zp_zfs;
 320  320  zone_t *global_zone = NULL;     /* Set when the global zone is initialized */
 321  321  
 322  322  /*
 323  323   * List of active zones, protected by zonehash_lock.
 324  324   */
 325  325  static list_t zone_active;
 326  326  
 327  327  /*
 328  328   * List of destroyed zones that still have outstanding cred references.
 329  329   * Used for debugging.  Uses a separate lock to avoid lock ordering
 330  330   * problems in zone_free.
 331  331   */
 332  332  static list_t zone_deathrow;
 333  333  static kmutex_t zone_deathrow_lock;
 334  334  
 335  335  /* This can be dynamically reduced if various subsystems hit internal limits. */
 336  336  uint_t maxzones = MAX_ZONES;
 337  337  
 338  338  /* Event channel to send zone state change notifications */
 339  339  evchan_t *zone_event_chan;
 340  340  
 341  341  /*
 342  342   * This table holds the mapping from kernel zone states (one entry per
 343  343   * state, as noted beside each entry) to states visible in the state
 344  344   * notification API.  We only expose "obvious" states; states which are just
 345  345   * implementation details reuse another state's event (booting shows as ready).
 346  346   */
 347  347  const char  *zone_status_table[] = {
 348  348          ZONE_EVENT_UNINITIALIZED,       /* uninitialized */
 349  349          ZONE_EVENT_INITIALIZED,         /* initialized */
 350  350          ZONE_EVENT_READY,               /* ready */
 351  351          ZONE_EVENT_READY,               /* booting */
 352  352          ZONE_EVENT_RUNNING,             /* running */
 353  353          ZONE_EVENT_SHUTTING_DOWN,       /* shutting_down */
 354  354          ZONE_EVENT_SHUTTING_DOWN,       /* empty */
 355  355          ZONE_EVENT_SHUTTING_DOWN,       /* down */
 356  356          ZONE_EVENT_SHUTTING_DOWN,       /* dying */
 357  357          ZONE_EVENT_UNINITIALIZED,       /* dead */
 358  358          ZONE_EVENT_FREE,                /* free */
 359  359  };
 360  360  
 361  361  /*
 362  362   * Names of the subsystems listed in zone_ref_subsys_t (see sys/zone.h), one
 363  363   * per enumerator; presumably indexed by that enum, so keep both in sync.
 364  364   */
 365  365  static char *zone_ref_subsys_names[] = {
 366  366          "NFS",          /* ZONE_REF_NFS */
 367  367          "NFSv4",        /* ZONE_REF_NFSV4 */
 368  368          "SMBFS",        /* ZONE_REF_SMBFS */
 369  369          "MNTFS",        /* ZONE_REF_MNTFS */
 370  370          "LOFI",         /* ZONE_REF_LOFI */
 371  371          "VFS",          /* ZONE_REF_VFS */
 372  372          "IPC"           /* ZONE_REF_IPC */
 373  373  };
 374  374  
 375  375  /*
 376  376   * Handles for the zone rctls.  These aren't static so lint doesn't complain.
 377  377   */
 378  378  rctl_hndl_t rc_zone_cpu_shares;
 379  379  rctl_hndl_t rc_zone_locked_mem;
 380  380  rctl_hndl_t rc_zone_max_swap;
 381  381  rctl_hndl_t rc_zone_phys_mem;
 382  382  rctl_hndl_t rc_zone_max_lofi;
 383  383  rctl_hndl_t rc_zone_cpu_cap;
 384  384  rctl_hndl_t rc_zone_cpu_baseline;
 385  385  rctl_hndl_t rc_zone_cpu_burst_time;
 386  386  rctl_hndl_t rc_zone_zfs_io_pri;
 387  387  rctl_hndl_t rc_zone_nlwps;
 388  388  rctl_hndl_t rc_zone_nprocs;
 389  389  rctl_hndl_t rc_zone_shmmax;
 390  390  rctl_hndl_t rc_zone_shmmni;
 391  391  rctl_hndl_t rc_zone_semmni;
 392  392  rctl_hndl_t rc_zone_msgmni;
 393  393  
 394  394  const char * const zone_default_initname = "/sbin/init";
 395  395  static char * const zone_prefix = "/zone/";
 396  396  static int zone_shutdown(zoneid_t zoneid);
 397  397  static int zone_add_datalink(zoneid_t, datalink_id_t);
 398  398  static int zone_remove_datalink(zoneid_t, datalink_id_t);
 399  399  static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 400  400  static int zone_set_network(zoneid_t, zone_net_data_t *);
 401  401  static int zone_get_network(zoneid_t, zone_net_data_t *);
 402  402  static void zone_status_set(zone_t *, zone_status_t);
 403  403  
 404  404  typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 405  405  
 406  406  static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 407  407  static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
 408  408  static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 409  409  static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
 410  410      zone_key_t);
 411  411  static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 412  412  static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
 413  413      kmutex_t *);
 414  414  static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 415  415      kmutex_t *);
 416  416  
 417  417  /*
 418  418   * Bump this number when you alter the zone syscall interfaces; this is
 419  419   * because we need to have support for previous API versions in libc
 420  420   * to support patching; libc calls into the kernel to determine this number.
 421  421   *
 422  422   * Version 1 of the API is the version originally shipped with Solaris 10
 423  423   * Version 2 alters the zone_create system call in order to support more
 424  424   *     arguments by moving the args into a structure; and to do better
 425  425   *     error reporting when zone_create() fails.
 426  426   * Version 3 alters the zone_create system call in order to support the
 427  427   *     import of ZFS datasets to zones.
 428  428   * Version 4 alters the zone_create system call in order to support
 429  429   *     Trusted Extensions.
 430  430   * Version 5 alters the zone_boot system call, and converts its old
 431  431   *     bootargs parameter to be set by the zone_setattr API instead.
 432  432   * Version 6 adds the flag argument to zone_create.
 433  433   * Version 7 adds the requested zoneid to zone_create.
 434  434   */
 435  435  static const int ZONE_SYSCALL_API_VERSION = 7;
 436  436  
 437  437  /*
 438  438   * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent"
 439  439   * data which can be referenced independently of the zone_t structure. This
 440  440   * data falls into two categories:
 441  441   *   1) pages and RSS data associated with processes inside a zone
 442  442   *   2) in-flight ZFS I/O data
 443  443   *
 444  444   * Each member of zone_persist_t stores the zone's current page usage, its page
 445  445   * limit, a flag indicating if the zone is over its physical memory cap and
 446  446   * various page-related statistics. The zpers_over flag is the interface for
 447  447   * the page scanner to use when reclaiming pages for zones that are over their
 448  448   * cap. The zone_persist_t structure also includes a mutex and a reference to a
 449  449   * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data.
 450  450   *
 451  451   * All zone physical memory cap data is stored in this array instead of within
 452  452   * the zone structure itself. This is because zone structures come and go, but
 453  453   * paging-related work can be asynchronous to any particular zone. In
 454  454   * particular:
 455  455   * 1) Page scanning to reclaim pages occurs from a kernel thread that is not
 456  456   *    associated with any zone.
 457  457   * 2) Freeing segkp pages can occur long after the zone which first
 458  458   *    instantiated those pages has gone away.
 459  459   * We want to be able to account for pages/zone without constantly having to
 460  460   * take extra locks and finding the relevant zone structure, particularly during
 461  461   * page scanning.
 462  462   *
 463  463   * The page scanner can run when "zone_num_over_cap" is non-zero. It can
 464  464   * do a direct lookup of a zoneid into the "zone_pdata" array to determine
 465  465   * if that zone is over its cap.
 466  466   *
 467  467   * There is no locking for the page scanner to perform these two checks.
 468  468   * We cannot have the page scanner blocking normal paging activity for
 469  469   * running processes. Because the physical memory cap is a soft cap, it is
 470  470   * fine for the scanner to simply read the current state of the counter and
 471  471   * the zone's zpers_over entry in the array. The scanner should never modify
 472  472   * either of these items. Internally the entries and the counter are managed
 473  473   * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. We
 474  474   * take care to ensure that we only take the zone_physcap_lock mutex when a
 475  475   * zone is transitioning over/under its physical memory cap.
 476  476   *
 477  477   * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage
 478  478   * the "zone_pdata" array and associated counter.
 479  479   *
 480  480   * The zone_persist_t structure tracks the zone's physical cap and physical
 481  481   * usage in terms of pages. These values are currently defined as uint32. Thus,
 482  482   * the maximum number of pages we can track is UINT_MAX-1 (4,294,967,294)
 483  483   * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a
 484  484   * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size.
 485  485   * In the future we may need to expand these counters to 64-bit, but for now
 486  486   * we're using 32-bit to conserve memory, since this array is statically
 487  487   * allocated within the kernel based on the maximum number of zones supported.
 488  488   *
 489  489   * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under
 490  490   * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we
 491  491   * had to continuously find the zone structure associated with an I/O that has
 492  492   * just completed. To avoid that overhead, we track the I/O data within the
 493  493   * zone_zfs_io_t instead. We can directly access that data without having to
 494  494   * lookup the full zone_t structure.
 495  495   */
 496  496  uint_t zone_num_over_cap;
 497  497  zone_persist_t zone_pdata[MAX_ZONES];
 498  498  static kmutex_t zone_physcap_lock;
 499  499  
 500  500  /*
 501  501   * Certain filesystems (such as NFS and autofs) need to know which zone
 502  502   * the mount is being placed in.  Because of this, we need to be able to
 503  503   * ensure that a zone isn't in the process of being created/destroyed such
 504  504   * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 505  505   * it gets added to the list of mounted zones, it ends up on the wrong zone's
 506  506   * mount list. Since a zone can't reside on an NFS file system, we don't
 507  507   * have to worry about the zonepath itself.
 508  508   *
 509  509   * The following functions: block_mounts()/resume_mounts() and
 510  510   * mount_in_progress()/mount_completed() are used by zones and the VFS
 511  511   * layer (respectively) to synchronize zone state transitions and new
 512  512   * mounts within a zone. This synchronization is on a per-zone basis, so
 513  513   * activity for one zone will not interfere with activity for another zone.
 514  514   *
 515  515   * The semantics are like a reader-reader lock such that there may
 516  516   * either be multiple mounts (or zone state transitions, if that weren't
 517  517   * serialized by zonehash_lock) in progress at the same time, but not
 518  518   * both.
 519  519   *
 520  520   * We use cv's so the user can ctrl-C out of the operation if it's
 521  521   * taking too long.
 522  522   *
 523  523   * The semantics are such that there is unfair bias towards the
 524  524   * "current" operation.  This means that zone halt may starve if
 525  525   * there is a rapid succession of new mounts coming in to the zone.
 526  526   */
 527  527  /*
 528  528   * Block new mounts in zp before they reach the point of calling
 529  529   * VFS_MOUNT(), first waiting for mounts already in this "region" to
 530  530   * complete.  Returns 1 once blocked, 0 if the wait was cut short by a signal.
 531  531   */
 532  532  static int
 533  533  block_mounts(zone_t *zp)
 534  534  {
 535  535          int retval = 0;         /* stays 0 if we bail out via "signaled" */
 536  536  
 537  537          /*
 538  538           * Since it may block for a long time, block_mounts() shouldn't be
 539  539           * called with zonehash_lock held.
 540  540           */
 541  541          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
 542  542          mutex_enter(&zp->zone_mount_lock);
 543  543          while (zp->zone_mounts_in_progress > 0) {
 544  544                  if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0)
 545  545                          goto signaled;  /* count is left untouched */
 546  546          }
 547  547          /*
 548  548           * A negative value of mounts_in_progress indicates that mounts
 549  549           * have been blocked by (-mounts_in_progress) different callers
 550  550           * (remotely possible if two threads enter zone_shutdown at the same
 551  551           * time).
 552  552           */
 553  553          zp->zone_mounts_in_progress--;
 554  554          retval = 1;
 555  555  signaled:
 556  556          mutex_exit(&zp->zone_mount_lock);
 557  557          return (retval);
 558  558  }
 559  559  
 560  560  /*
 561  561   * Undo one block_mounts() call on zp.  If the count climbs back to zero we
 562  562   * were the last obstacle, so wake every thread waiting on zone_mount_cv.
 563  563   */
 564  564  static void
 565  565  resume_mounts(zone_t *zp)
 566  566  {
 567  567          mutex_enter(&zp->zone_mount_lock);
 568  568          if (++zp->zone_mounts_in_progress == 0)
 569  569                  cv_broadcast(&zp->zone_mount_cv);
 570  570          mutex_exit(&zp->zone_mount_lock);
 571  571  }
 572  572  
 573  573  /*
 574  574   * The VFS layer is starting a mount in zp.  Wait with cv_wait() while the
 575  575   * count is negative (mounts blocked), then count this mount as in progress.
 576  576   */
 577  577  void
 578  578  mount_in_progress(zone_t *zp)
 579  579  {
 580  580          mutex_enter(&zp->zone_mount_lock);
 581  581          while (zp->zone_mounts_in_progress < 0)
 582  582                  cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock);
 583  583          zp->zone_mounts_in_progress++;
 584  584          mutex_exit(&zp->zone_mount_lock);
 585  585  }
 586  586  
 587  587  /*
 588  588   * VFS is done with one mount in zp: drop the in-progress count and, if this
 589  589   * was the last mount (count is now zero), wake any waiting block_mounts().
 590  590   */
 591  591  void
 592  592  mount_completed(zone_t *zp)
 593  593  {
 594  594          mutex_enter(&zp->zone_mount_lock);
 595  595          if (--zp->zone_mounts_in_progress == 0)
 596  596                  cv_broadcast(&zp->zone_mount_cv);
 597  597          mutex_exit(&zp->zone_mount_lock);
 598  598  }
 599  599  
 600  600  /*
 601  601   * ZSD routines.
 602  602   *
 603  603   * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 604  604   * defined by the pthread_key_create() and related interfaces.
 605  605   *
 606  606   * Kernel subsystems may register one or more data items and/or
 607  607   * callbacks to be executed when a zone is created, shutdown, or
 608  608   * destroyed.
 609  609   *
 610  610   * Unlike the thread counterpart, destructor callbacks will be executed
 611  611   * even if the data pointer is NULL and/or there are no constructor
 612  612   * callbacks, so it is the responsibility of such callbacks to check for
 613  613   * NULL data values if necessary.
 614  614   *
 615  615   * The locking strategy and overall picture is as follows:
 616  616   *
 617  617   * When someone calls zone_key_create(), a template ZSD entry is added to the
 618  618   * global list "zsd_registered_keys", protected by zsd_key_lock.  That lock
 619  619   * is then dropped and, under zonehash_lock, a copy of the ZSD entry is
 620  620   * added to the per-zone zone_zsd list (protected by zone_lock) of each
 621  621   * existing zone and marked as ZSD_CREATE_NEEDED. The global list is updated
 622  622   * first (under zsd_key_lock) to make sure that newly created zones use the
 623  623   * most recent list of keys; only afterwards do we walk the zones and mark
 624  624   * them.  Similar locking is used in zone_key_delete().
 625  625   *
 626  626   * The actual create, shutdown, and destroy callbacks are done without
 627  627   * holding any lock. And zsd_flags are used to ensure that the operations
 628  628   * completed so that when zone_key_create (and zone_create) is done, as well as
 629  629   * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 630  630   * are completed.
 631  631   *
 632  632   * When new zones are created constructor callbacks for all registered ZSD
 633  633   * entries will be called. That also uses the above two phases of marking
 634  634   * what needs to be done, and then running the callbacks without holding
 635  635   * any locks.
 636  636   *
 637  637   * The framework does not provide any locking around zone_getspecific() and
 638  638   * zone_setspecific() apart from that needed for internal consistency, so
 639  639   * callers interested in atomic "test-and-set" semantics will need to provide
 640  640   * their own locking.
 641  641   */
 642  642  
 643  643  /*
 644  644   * Helper function to find the zsd_entry associated with the key in the
 645  645   * given list.
            *
            * Walks the list from head to tail comparing zsd_key and returns the
            * first matching entry, or NULL when no entry carries that key.  The
            * list itself is left untouched and no lock is taken here.
 646  646   */
 647  647  static struct zsd_entry *
 648  648  zsd_find(list_t *l, zone_key_t key)
 649  649  {
 650  650          struct zsd_entry *zsd;
 651  651  
 652  652          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 653  653                  if (zsd->zsd_key == key) {
 654  654                          return (zsd);
 655  655                  }
 656  656          }
 657  657          return (NULL);
 658  658  }
 659  659  
 660  660  /*
 661  661   * Helper function to find the zsd_entry associated with the key in the
 662  662   * given list. Move it to the front of the list.
            *
            * Returns the matching entry, or NULL when the key is not present (in
            * which case the list is not modified).  Unlike zsd_find() a successful
            * lookup may reorder the list.  No lock is taken here.
 663  663   */
 664  664  static struct zsd_entry *
 665  665  zsd_find_mru(list_t *l, zone_key_t key)
 666  666  {
 667  667          struct zsd_entry *zsd;
 668  668  
 669  669          for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
 670  670                  if (zsd->zsd_key == key) {
 671  671                          /*
 672  672                           * Move to head of list to keep list in MRU order.
                                    * Nothing to do if the entry is already the head.
 673  673                           */
 674  674                          if (zsd != list_head(l)) {
 675  675                                  list_remove(l, zsd);
 676  676                                  list_insert_head(l, zsd);
 677  677                          }
 678  678                          return (zsd);
 679  679                  }
 680  680          }
 681  681          return (NULL);
 682  682  }
 683  683  
 684  684  void
 685  685  zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
 686  686      void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
 687  687  {
 688  688          struct zsd_entry *zsdp;
 689  689          struct zsd_entry *t;
 690  690          struct zone *zone;
 691  691          zone_key_t  key;
 692  692  
                   /*
                    * Register a new ZSD key with its create, shutdown and destroy
                    * callbacks (create is NULL-checked below before it is used).
                    * The new key is stored through keyp only at the very end, once
                    * the create callback has been applied to the zones.
                    *
                    * Start by building the template entry for the global list.
                    */
 693  693          zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
 694  694          zsdp->zsd_data = NULL;
 695  695          zsdp->zsd_create = create;
 696  696          zsdp->zsd_shutdown = shutdown;
 697  697          zsdp->zsd_destroy = destroy;
 698  698  
 699  699          /*
 700  700           * Insert in global list of callbacks. Makes future zone creations
 701  701           * see it.
                    * The key value is the next value of the zsd_keyval counter,
                    * assigned under zsd_key_lock; the ASSERT checks that the
                    * counter has not wrapped around to zero.
 702  702           */
 703  703          mutex_enter(&zsd_key_lock);
 704  704          key = zsdp->zsd_key = ++zsd_keyval;
 705  705          ASSERT(zsd_keyval != 0);
 706  706          list_insert_tail(&zsd_registered_keys, zsdp);
 707  707          mutex_exit(&zsd_key_lock);
 708  708  
 709  709          /*
 710  710           * Insert for all existing zones and mark them as needing
 711  711           * a create callback.
 712  712           */
 713  713          mutex_enter(&zonehash_lock);    /* stop the world */
 714  714          for (zone = list_head(&zone_active); zone != NULL;
 715  715              zone = list_next(&zone_active, zone)) {
 716  716                  zone_status_t status;
 717  717  
 718  718                  mutex_enter(&zone->zone_lock);
 719  719  
 720  720                  /* Skip zones that are on the way down or not yet up */
 721  721                  status = zone_status_get(zone);
 722  722                  if (status >= ZONE_IS_DOWN ||
 723  723                      status == ZONE_IS_UNINITIALIZED) {
 724  724                          mutex_exit(&zone->zone_lock);
 725  725                          continue;
 726  726                  }
 727  727  
 728  728                  t = zsd_find_mru(&zone->zone_zsd, key);
 729  729                  if (t != NULL) {
 730  730                          /*
 731  731                           * A zsd_configure already inserted it after
 732  732                           * we dropped zsd_key_lock above.
 733  733                           */
 734  734                          mutex_exit(&zone->zone_lock);
 735  735                          continue;
 736  736                  }
                           /* Per-zone copy of the template; zsd_data starts out NULL */
 737  737                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 738  738                  t->zsd_key = key;
 739  739                  t->zsd_create = create;
 740  740                  t->zsd_shutdown = shutdown;
 741  741                  t->zsd_destroy = destroy;
 742  742                  if (create != NULL) {
 743  743                          t->zsd_flags = ZSD_CREATE_NEEDED;
 744  744                          DTRACE_PROBE2(zsd__create__needed,
 745  745                              zone_t *, zone, zone_key_t, key);
 746  746                  }
 747  747                  list_insert_tail(&zone->zone_zsd, t);
 748  748                  mutex_exit(&zone->zone_lock);
 749  749          }
 750  750          mutex_exit(&zonehash_lock);
 751  751  
 752  752          if (create != NULL) {
 753  753                  /* Now call the create callback for this key */
 754  754                  zsd_apply_all_zones(zsd_apply_create, key);
 755  755          }
 756  756          /*
 757  757           * It is safe for consumers to use the key now, make it
 758  758           * globally visible. Specifically zone_getspecific() will
 759  759           * always successfully return the zone specific data associated
 760  760           * with the key.
 761  761           */
 762  762          *keyp = key;
 763  763  
 764  764  }
 765  765  
 766  766  /*
 767  767   * Function called when a module is being unloaded, or otherwise wishes
 768  768   * to unregister its ZSD key and callbacks.
 769  769   *
 770  770   * Remove from the global list and determine the functions that need to
 771  771   * be called under a global lock. Then call the functions without
 772  772   * holding any locks. Finally free up the zone_zsd entries. (The apply
 773  773   * functions need to access the zone_zsd entries to find zsd_data etc.)
            *
            * Returns 0 on success, or -1 if the key is not on the global
            * zsd_registered_keys list.
 774  774   */
 775  775  int
 776  776  zone_key_delete(zone_key_t key)
 777  777  {
 778  778          struct zsd_entry *zsdp = NULL;
 779  779          zone_t *zone;
 780  780  
                   /* Take the template entry off the global list first */
 781  781          mutex_enter(&zsd_key_lock);
 782  782          zsdp = zsd_find_mru(&zsd_registered_keys, key);
 783  783          if (zsdp == NULL) {
 784  784                  mutex_exit(&zsd_key_lock);
 785  785                  return (-1);
 786  786          }
 787  787          list_remove(&zsd_registered_keys, zsdp);
 788  788          mutex_exit(&zsd_key_lock);
 789  789  
                   /*
                    * Flag the per-zone entries whose shutdown and/or destroy
                    * callback still has to run.
                    */
 790  790          mutex_enter(&zonehash_lock);
 791  791          for (zone = list_head(&zone_active); zone != NULL;
 792  792              zone = list_next(&zone_active, zone)) {
 793  793                  struct zsd_entry *del;
 794  794  
 795  795                  mutex_enter(&zone->zone_lock);
 796  796                  del = zsd_find_mru(&zone->zone_zsd, key);
 797  797                  if (del == NULL) {
 798  798                          /*
 799  799                           * Somebody else got here first e.g the zone going
 800  800                           * away.
 801  801                           */
 802  802                          mutex_exit(&zone->zone_lock);
 803  803                          continue;
 804  804                  }
 805  805                  ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
 806  806                  ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
 807  807                  if (del->zsd_shutdown != NULL &&
 808  808                      (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 809  809                          del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 810  810                          DTRACE_PROBE2(zsd__shutdown__needed,
 811  811                              zone_t *, zone, zone_key_t, key);
 812  812                  }
 813  813                  if (del->zsd_destroy != NULL &&
 814  814                      (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 815  815                          del->zsd_flags |= ZSD_DESTROY_NEEDED;
 816  816                          DTRACE_PROBE2(zsd__destroy__needed,
 817  817                              zone_t *, zone, zone_key_t, key);
 818  818                  }
 819  819                  mutex_exit(&zone->zone_lock);
 820  820          }
 821  821          mutex_exit(&zonehash_lock);
                   /*
                    * The template is not referenced past this point; the
                    * per-zone copies carry their own callback pointers.
                    */
 822  822          kmem_free(zsdp, sizeof (*zsdp));
 823  823  
 824  824          /* Now call the shutdown and destroy callback for this key */
 825  825          zsd_apply_all_zones(zsd_apply_shutdown, key);
 826  826          zsd_apply_all_zones(zsd_apply_destroy, key);
 827  827  
 828  828          /* Now we can free up the zsdp structures in each zone */
 829  829          mutex_enter(&zonehash_lock);
 830  830          for (zone = list_head(&zone_active); zone != NULL;
 831  831              zone = list_next(&zone_active, zone)) {
 832  832                  struct zsd_entry *del;
 833  833  
 834  834                  mutex_enter(&zone->zone_lock);
 835  835                  del = zsd_find(&zone->zone_zsd, key);
 836  836                  if (del != NULL) {
 837  837                          list_remove(&zone->zone_zsd, del);
 838  838                          ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
 839  839                          kmem_free(del, sizeof (*del));
 840  840                  }
 841  841                  mutex_exit(&zone->zone_lock);
 842  842          }
 843  843          mutex_exit(&zonehash_lock);
 844  844  
 845  845          return (0);
 846  846  }
 847  847  
 848  848  /*
 849  849   * ZSD counterpart of pthread_setspecific().
 850  850   *
 851  851   * Since all zsd callbacks, including those with no create function,
 852  852   * have an entry in zone_zsd, if the key is registered it is part of
 853  853   * the zone_zsd list.
 854  854   * Return an error if the key wasn't registered.
            *
            * Returns 0 after storing data in the zone's entry for key, or -1 when
            * the zone's zone_zsd list has no entry for key.  The lookup is done
            * under zone_lock and moves the entry to the head of the list.
 855  855   */
 856  856  int
 857  857  zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 858  858  {
 859  859          struct zsd_entry *t;
 860  860  
 861  861          mutex_enter(&zone->zone_lock);
 862  862          t = zsd_find_mru(&zone->zone_zsd, key);
 863  863          if (t != NULL) {
 864  864                  /*
 865  865                   * Replace old value with new
                            * (the cast only drops the const qualifier for storage).
 866  866                   */
 867  867                  t->zsd_data = (void *)data;
 868  868                  mutex_exit(&zone->zone_lock);
 869  869                  return (0);
 870  870          }
 871  871          mutex_exit(&zone->zone_lock);
 872  872          return (-1);
 873  873  }
 874  874  
 875  875  /*
 876  876   * ZSD counterpart of pthread_getspecific().
            *
            * Returns the zsd_data stored in the zone's entry for key, or NULL when
            * the zone's zone_zsd list has no entry for key.  A NULL return can
            * therefore also mean that the stored value itself is NULL.  The lookup
            * is done under zone_lock and moves the entry to the head of the list.
 877  877   */
 878  878  void *
 879  879  zone_getspecific(zone_key_t key, zone_t *zone)
 880  880  {
 881  881          struct zsd_entry *t;
 882  882          void *data;
 883  883  
 884  884          mutex_enter(&zone->zone_lock);
 885  885          t = zsd_find_mru(&zone->zone_zsd, key);
 886  886          data = (t == NULL ? NULL : t->zsd_data);
 887  887          mutex_exit(&zone->zone_lock);
 888  888          return (data);
 889  889  }
 890  890  
 891  891  /*
 892  892   * Function used to initialize a zone's list of ZSD callbacks and data
 893  893   * when the zone is being created.  The callbacks are initialized from
 894  894   * the template list (zsd_registered_keys). The constructor callback is
 895  895   * executed later (once the zone exists and with locks dropped).
            *
            * The caller must hold zonehash_lock and the zone's zone_zsd list must
            * be empty (both are asserted).  zone_lock is taken first, then
            * zsd_key_lock, and both are released before returning.
 896  896   */
 897  897  static void
 898  898  zone_zsd_configure(zone_t *zone)
 899  899  {
 900  900          struct zsd_entry *zsdp;
 901  901          struct zsd_entry *t;
 902  902  
 903  903          ASSERT(MUTEX_HELD(&zonehash_lock));
 904  904          ASSERT(list_head(&zone->zone_zsd) == NULL);
 905  905          mutex_enter(&zone->zone_lock);
 906  906          mutex_enter(&zsd_key_lock);
 907  907          for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 908  908              zsdp = list_next(&zsd_registered_keys, zsdp)) {
 909  909                  /*
 910  910                   * zone_key_create() skips zones that are still
 911  911                   * ZONE_IS_UNINITIALIZED, so it should not have added
                            * anything to this zone.
                            * NOTE(review): the original comment named the state
                            * "ZONE_IS_UNCONFIGURED" -- confirm the intended state.
 912  912                   */
 913  913                  ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
 914  914  
                           /* Per-zone copy of the template; zsd_data starts out NULL */
 915  915                  t = kmem_zalloc(sizeof (*t), KM_SLEEP);
 916  916                  t->zsd_key = zsdp->zsd_key;
 917  917                  t->zsd_create = zsdp->zsd_create;
 918  918                  t->zsd_shutdown = zsdp->zsd_shutdown;
 919  919                  t->zsd_destroy = zsdp->zsd_destroy;
 920  920                  if (zsdp->zsd_create != NULL) {
 921  921                          t->zsd_flags = ZSD_CREATE_NEEDED;
 922  922                          DTRACE_PROBE2(zsd__create__needed,
 923  923                              zone_t *, zone, zone_key_t, zsdp->zsd_key);
 924  924                  }
 925  925                  list_insert_tail(&zone->zone_zsd, t);
 926  926          }
 927  927          mutex_exit(&zsd_key_lock);
 928  928          mutex_exit(&zone->zone_lock);
 929  929  }
 930  930  
 931  931  enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 932  932  
 933  933  /*
 934  934   * Helper function to execute shutdown or destructor callbacks.
            *
            * ct must be ZSD_SHUTDOWN or ZSD_DESTROY; ZSD_CREATE is rejected by the
            * first ASSERT.  The zone must already be at least ZONE_IS_EMPTY for a
            * shutdown pass and at least ZONE_IS_DOWN for a destroy pass (asserted).
 935  935   */
 936  936  static void
 937  937  zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 938  938  {
 939  939          struct zsd_entry *t;
 940  940  
 941  941          ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 942  942          ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 943  943          ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 944  944  
 945  945          /*
 946  946           * Run the callback solely based on what is registered for the zone
 947  947           * in zone_zsd. The global list can change independently of this
 948  948           * as keys are registered and unregistered and we don't register new
 949  949           * callbacks for a zone that is in the process of going away.
 950  950           */
 951  951          mutex_enter(&zone->zone_lock);
 952  952          for (t = list_head(&zone->zone_zsd); t != NULL;
 953  953              t = list_next(&zone->zone_zsd, t)) {
 954  954                  zone_key_t key = t->zsd_key;
 955  955  
 956  956                  /* Only flag entries that have the relevant callback */
 957  957  
 958  958                  if (ct == ZSD_SHUTDOWN) {
 959  959                          if (t->zsd_shutdown != NULL &&
 960  960                              (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
 961  961                                  t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
 962  962                                  DTRACE_PROBE2(zsd__shutdown__needed,
 963  963                                      zone_t *, zone, zone_key_t, key);
 964  964                          }
 965  965                  } else {
 966  966                          if (t->zsd_destroy != NULL &&
 967  967                              (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
 968  968                                  t->zsd_flags |= ZSD_DESTROY_NEEDED;
 969  969                                  DTRACE_PROBE2(zsd__destroy__needed,
 970  970                                      zone_t *, zone, zone_key_t, key);
 971  971                          }
 972  972                  }
 973  973          }
 974  974          mutex_exit(&zone->zone_lock);
 975  975  
 976  976          /*
                    * Now run the flagged callbacks for every key of this zone.
                    * Both passes are made regardless of ct; each apply function
                    * only calls back for entries whose *_NEEDED flag is set.
                    */
 977  977          zsd_apply_all_keys(zsd_apply_shutdown, zone);
 978  978          zsd_apply_all_keys(zsd_apply_destroy, zone);
 979  979  
 980  980  }
 981  981  
 982  982  /*
 983  983   * Called when the zone is going away; free ZSD-related memory, and
 984  984   * destroy the zone_zsd list.
            *
            * Runs entirely under zone_lock.  No callbacks are invoked here; every
            * entry is asserted to have no *_INPROGRESS flag set when it is freed.
 985  985   */
 986  986  static void
 987  987  zone_free_zsd(zone_t *zone)
 988  988  {
 989  989          struct zsd_entry *t, *next;
 990  990  
 991  991          /*
 992  992           * Free all the zsd_entry's we had on this zone.
                    * The successor is fetched before t is removed and freed.
 993  993           */
 994  994          mutex_enter(&zone->zone_lock);
 995  995          for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 996  996                  next = list_next(&zone->zone_zsd, t);
 997  997                  list_remove(&zone->zone_zsd, t);
 998  998                  ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 999  999                  kmem_free(t, sizeof (*t));
1000 1000          }
1001 1001          list_destroy(&zone->zone_zsd);
1002 1002          mutex_exit(&zone->zone_lock);
1003 1003  
1004 1004  }
1005 1005  
1006 1006  /*
1007 1007   * Apply a function to all zones for particular key value.
1008 1008   *
1009 1009   * The applyfn has to drop zonehash_lock if it does some work, and
1010 1010   * then reacquire it before it returns.
1011 1011   * When the lock is dropped we don't follow list_next even
1012 1012   * if it is possible to do so without any hazards. This is
1013 1013   * because we want the design to allow for the list of zones
1014 1014   * to change in any arbitrary way during the time the
1015 1015   * lock was dropped.
1016 1016   *
1017 1017   * It is safe to restart the loop at list_head since the applyfn
1018 1018   * changes the zsd_flags as it does work, so a subsequent
1019 1019   * pass through will have no effect in applyfn, hence the loop will terminate
1020 1020   * in at worst O(N^2).
            *
            * The applyfn is called with zonehash_lock as its lockp argument and
            * with zone_lock_held set to B_FALSE; a B_TRUE return value is taken to
            * mean that zonehash_lock was dropped during the call.
1021 1021   */
1022 1022  static void
1023 1023  zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
1024 1024  {
1025 1025          zone_t *zone;
1026 1026  
1027 1027          mutex_enter(&zonehash_lock);
1028 1028          zone = list_head(&zone_active);
1029 1029          while (zone != NULL) {
1030 1030                  if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
1031 1031                          /* Lock dropped - restart at head */
1032 1032                          zone = list_head(&zone_active);
1033 1033                  } else {
1034 1034                          zone = list_next(&zone_active, zone);
1035 1035                  }
1036 1036          }
1037 1037          mutex_exit(&zonehash_lock);
1038 1038  }
1039 1039  
1040 1040  /*
1041 1041   * Apply a function to all keys for a particular zone.
1042 1042   *
1043 1043   * The applyfn has to drop zone_lock if it does some work, and
1044 1044   * then reacquire it before it returns.
1045 1045   * When the lock is dropped we don't follow list_next even
1046 1046   * if it is possible to do so without any hazards. This is
1047 1047   * because we want the design to allow for the list of zsd callbacks
1048 1048   * to change in any arbitrary way during the time the
1049 1049   * lock was dropped.
1050 1050   *
1051 1051   * It is safe to restart the loop at list_head since the applyfn
1052 1052   * changes the zsd_flags as it does work, so a subsequent
1053 1053   * pass through will have no effect in applyfn, hence the loop will terminate
1054 1054   * in at worst O(N^2).
            *
            * Here the applyfn is called with a NULL lockp and with zone_lock_held
            * set to B_TRUE: only zone_lock is held across the walk, not
            * zonehash_lock.  A B_TRUE return value is taken to mean that zone_lock
            * was dropped during the call.
1055 1055   */
1056 1056  static void
1057 1057  zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
1058 1058  {
1059 1059          struct zsd_entry *t;
1060 1060  
1061 1061          mutex_enter(&zone->zone_lock);
1062 1062          t = list_head(&zone->zone_zsd);
1063 1063          while (t != NULL) {
1064 1064                  if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
1065 1065                          /* Lock dropped - restart at head */
1066 1066                          t = list_head(&zone->zone_zsd);
1067 1067                  } else {
1068 1068                          t = list_next(&zone->zone_zsd, t);
1069 1069                  }
1070 1070          }
1071 1071          mutex_exit(&zone->zone_lock);
1072 1072  }
1073 1073  
1074 1074  /*
1075 1075   * Call the create function for the zone and key if CREATE_NEEDED
1076 1076   * is set.
1077 1077   * If some other thread gets here first and sets CREATE_INPROGRESS, then
1078 1078   * we wait for that thread to complete so that we can ensure that
1079 1079   * all the callbacks are done when we've looped over all zones/keys.
1080 1080   *
1081 1081   * When we call the create function, we drop the global lock held by the
1082 1082   * caller, and return true to tell the caller it needs to re-evaluate the
1083 1083   * state.
1084 1084   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1085 1085   * remains held on exit.
            *
            * lockp is the global lock held by the caller, or NULL if there is
            * none.  Returns B_TRUE if any lock was dropped while waiting or while
            * running the callback, B_FALSE otherwise (including when the zone has
            * no entry for key).
1086 1086   */
1087 1087  static boolean_t
1088 1088  zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
1089 1089      zone_t *zone, zone_key_t key)
1090 1090  {
1091 1091          void *result;
1092 1092          struct zsd_entry *t;
1093 1093          boolean_t dropped;
1094 1094  
1095 1095          if (lockp != NULL) {
1096 1096                  ASSERT(MUTEX_HELD(lockp));
1097 1097          }
1098 1098          if (zone_lock_held) {
1099 1099                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1100 1100          } else {
1101 1101                  mutex_enter(&zone->zone_lock);
1102 1102          }
1103 1103  
1104 1104          t = zsd_find(&zone->zone_zsd, key);
1105 1105          if (t == NULL) {
1106 1106                  /*
1107 1107                   * Somebody else got here first e.g the zone going
1108 1108                   * away.
1109 1109                   */
1110 1110                  if (!zone_lock_held)
1111 1111                          mutex_exit(&zone->zone_lock);
1112 1112                  return (B_FALSE);
1113 1113          }
1114 1114          dropped = B_FALSE;
1115 1115          if (zsd_wait_for_inprogress(zone, t, lockp))
1116 1116                  dropped = B_TRUE;
1117 1117  
1118 1118          if (t->zsd_flags & ZSD_CREATE_NEEDED) {
                           /*
                            * Claim the work by swapping NEEDED for INPROGRESS
                            * before the locks are released for the callback.
                            */
1119 1119                  t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1120 1120                  t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1121 1121                  DTRACE_PROBE2(zsd__create__inprogress,
1122 1122                      zone_t *, zone, zone_key_t, key);
1123 1123                  mutex_exit(&zone->zone_lock);
1124 1124                  if (lockp != NULL)
1125 1125                          mutex_exit(lockp);
1126 1126  
1127 1127                  dropped = B_TRUE;
1128 1128                  ASSERT(t->zsd_create != NULL);
1129 1129                  DTRACE_PROBE2(zsd__create__start,
1130 1130                      zone_t *, zone, zone_key_t, key);
1131 1131  
1132 1132                  result = (*t->zsd_create)(zone->zone_id);
1133 1133  
1134 1134                  DTRACE_PROBE2(zsd__create__end,
1135 1135                      zone_t *, zone, void *, result);
1136 1136  
1137 1137                  ASSERT(result != NULL);
                           /* Reacquire in the order lockp first, then zone_lock */
1138 1138                  if (lockp != NULL)
1139 1139                          mutex_enter(lockp);
1140 1140                  mutex_enter(&zone->zone_lock);
1141 1141                  t->zsd_data = result;
1142 1142                  t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1143 1143                  t->zsd_flags |= ZSD_CREATE_COMPLETED;
                           /* Wake any thread sleeping on this entry's zsd_cv */
1144 1144                  cv_broadcast(&t->zsd_cv);
1145 1145                  DTRACE_PROBE2(zsd__create__completed,
1146 1146                      zone_t *, zone, zone_key_t, key);
1147 1147          }
1148 1148          if (!zone_lock_held)
1149 1149                  mutex_exit(&zone->zone_lock);
1150 1150          return (dropped);
1151 1151  }
1152 1152  
1153 1153  /*
1154 1154   * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1155 1155   * is set.
1156 1156   * If some other thread gets here first and sets *_INPROGRESS, then
1157 1157   * we wait for that thread to complete so that we can ensure that
1158 1158   * all the callbacks are done when we've looped over all zones/keys.
1159 1159   *
1160 1160   * When we call the shutdown function, we drop the global lock held by the
1161 1161   * caller, and return true to tell the caller it needs to re-evaluate the
1162 1162   * state.
1163 1163   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1164 1164   * remains held on exit.
            *
            * lockp is the global lock held by the caller, or NULL if there is
            * none.  Returns B_TRUE if any lock was dropped while waiting or while
            * running the callback, B_FALSE otherwise (including when the zone has
            * no entry for key).
1165 1165   */
1166 1166  static boolean_t
1167 1167  zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1168 1168      zone_t *zone, zone_key_t key)
1169 1169  {
1170 1170          struct zsd_entry *t;
1171 1171          void *data;
1172 1172          boolean_t dropped;
1173 1173  
1174 1174          if (lockp != NULL) {
1175 1175                  ASSERT(MUTEX_HELD(lockp));
1176 1176          }
1177 1177          if (zone_lock_held) {
1178 1178                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1179 1179          } else {
1180 1180                  mutex_enter(&zone->zone_lock);
1181 1181          }
1182 1182  
1183 1183          t = zsd_find(&zone->zone_zsd, key);
1184 1184          if (t == NULL) {
1185 1185                  /*
1186 1186                   * Somebody else got here first e.g the zone going
1187 1187                   * away.
1188 1188                   */
1189 1189                  if (!zone_lock_held)
1190 1190                          mutex_exit(&zone->zone_lock);
1191 1191                  return (B_FALSE);
1192 1192          }
1193 1193          dropped = B_FALSE;
                   /* A pending create must run before the shutdown callback */
1194 1194          if (zsd_wait_for_creator(zone, t, lockp))
1195 1195                  dropped = B_TRUE;
1196 1196  
1197 1197          if (zsd_wait_for_inprogress(zone, t, lockp))
1198 1198                  dropped = B_TRUE;
1199 1199  
1200 1200          if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
                           /*
                            * Claim the work by swapping NEEDED for INPROGRESS
                            * before the locks are released for the callback.
                            */
1201 1201                  t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1202 1202                  t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1203 1203                  DTRACE_PROBE2(zsd__shutdown__inprogress,
1204 1204                      zone_t *, zone, zone_key_t, key);
1205 1205                  mutex_exit(&zone->zone_lock);
1206 1206                  if (lockp != NULL)
1207 1207                          mutex_exit(lockp);
1208 1208                  dropped = B_TRUE;
1209 1209  
                           /*
                            * NOTE(review): t is read here without zone_lock held;
                            * presumably the INPROGRESS flag keeps the entry from
                            * being freed meanwhile -- confirm.
                            */
1210 1210                  ASSERT(t->zsd_shutdown != NULL);
1211 1211                  data = t->zsd_data;
1212 1212  
1213 1213                  DTRACE_PROBE2(zsd__shutdown__start,
1214 1214                      zone_t *, zone, zone_key_t, key);
1215 1215  
1216 1216                  (t->zsd_shutdown)(zone->zone_id, data);
1217 1217                  DTRACE_PROBE2(zsd__shutdown__end,
1218 1218                      zone_t *, zone, zone_key_t, key);
1219 1219  
                           /* Reacquire in the order lockp first, then zone_lock */
1220 1220                  if (lockp != NULL)
1221 1221                          mutex_enter(lockp);
1222 1222                  mutex_enter(&zone->zone_lock);
1223 1223                  t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1224 1224                  t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1225 1225                  cv_broadcast(&t->zsd_cv);
1226 1226                  DTRACE_PROBE2(zsd__shutdown__completed,
1227 1227                      zone_t *, zone, zone_key_t, key);
1228 1228          }
1229 1229          if (!zone_lock_held)
1230 1230                  mutex_exit(&zone->zone_lock);
1231 1231          return (dropped);
1232 1232  }
1233 1233  
1234 1234  /*
1235 1235   * Call the destroy function for the zone and key if DESTROY_NEEDED
1236 1236   * is set.
1237 1237   * If some other thread gets here first and sets *_INPROGRESS, then
1238 1238   * we wait for that thread to complete so that we can ensure that
1239 1239   * all the callbacks are done when we've looped over all zones/keys.
1240 1240   *
1241 1241   * When we call the destroy function, we drop the global lock held by the
1242 1242   * caller, and return true to tell the caller it needs to re-evaluate the
1243 1243   * state.
1244 1244   * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1245 1245   * remains held on exit.
            *
            * lockp is the global lock held by the caller, or NULL if there is
            * none.  Returns B_TRUE if any lock was dropped while waiting or while
            * running the callback, B_FALSE otherwise (including when the zone has
            * no entry for key).  After the callback returns, the entry's zsd_data
            * is reset to NULL.
1246 1246   */
1247 1247  static boolean_t
1248 1248  zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1249 1249      zone_t *zone, zone_key_t key)
1250 1250  {
1251 1251          struct zsd_entry *t;
1252 1252          void *data;
1253 1253          boolean_t dropped;
1254 1254  
1255 1255          if (lockp != NULL) {
1256 1256                  ASSERT(MUTEX_HELD(lockp));
1257 1257          }
1258 1258          if (zone_lock_held) {
1259 1259                  ASSERT(MUTEX_HELD(&zone->zone_lock));
1260 1260          } else {
1261 1261                  mutex_enter(&zone->zone_lock);
1262 1262          }
1263 1263  
1264 1264          t = zsd_find(&zone->zone_zsd, key);
1265 1265          if (t == NULL) {
1266 1266                  /*
1267 1267                   * Somebody else got here first e.g the zone going
1268 1268                   * away.
1269 1269                   */
1270 1270                  if (!zone_lock_held)
1271 1271                          mutex_exit(&zone->zone_lock);
1272 1272                  return (B_FALSE);
1273 1273          }
1274 1274          dropped = B_FALSE;
                   /* A pending create must run before the destroy callback */
1275 1275          if (zsd_wait_for_creator(zone, t, lockp))
1276 1276                  dropped = B_TRUE;
1277 1277  
1278 1278          if (zsd_wait_for_inprogress(zone, t, lockp))
1279 1279                  dropped = B_TRUE;
1280 1280  
1281 1281          if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
                           /*
                            * Claim the work by swapping NEEDED for INPROGRESS
                            * before the locks are released for the callback.
                            */
1282 1282                  t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1283 1283                  t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1284 1284                  DTRACE_PROBE2(zsd__destroy__inprogress,
1285 1285                      zone_t *, zone, zone_key_t, key);
1286 1286                  mutex_exit(&zone->zone_lock);
1287 1287                  if (lockp != NULL)
1288 1288                          mutex_exit(lockp);
1289 1289                  dropped = B_TRUE;
1290 1290  
                           /*
                            * NOTE(review): t is read here without zone_lock held;
                            * presumably the INPROGRESS flag keeps the entry from
                            * being freed meanwhile -- confirm.
                            */
1291 1291                  ASSERT(t->zsd_destroy != NULL);
1292 1292                  data = t->zsd_data;
1293 1293                  DTRACE_PROBE2(zsd__destroy__start,
1294 1294                      zone_t *, zone, zone_key_t, key);
1295 1295  
1296 1296                  (t->zsd_destroy)(zone->zone_id, data);
1297 1297                  DTRACE_PROBE2(zsd__destroy__end,
1298 1298                      zone_t *, zone, zone_key_t, key);
1299 1299  
                           /* Reacquire in the order lockp first, then zone_lock */
1300 1300                  if (lockp != NULL)
1301 1301                          mutex_enter(lockp);
1302 1302                  mutex_enter(&zone->zone_lock);
1303 1303                  t->zsd_data = NULL;
1304 1304                  t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1305 1305                  t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1306 1306                  cv_broadcast(&t->zsd_cv);
1307 1307                  DTRACE_PROBE2(zsd__destroy__completed,
1308 1308                      zone_t *, zone, zone_key_t, key);
1309 1309          }
1310 1310          if (!zone_lock_held)
1311 1311                  mutex_exit(&zone->zone_lock);
1312 1312          return (dropped);
1313 1313  }
1314 1314  
1315 1315  /*
1316 1316   * Wait for any CREATE_NEEDED flag to be cleared.
1317 1317   * Returns true if lockp was temporarily dropped while waiting.
1318 1318   */
1319 1319  static boolean_t
1320 1320  zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1321 1321  {
1322 1322          boolean_t dropped = B_FALSE;
1323 1323  
1324 1324          while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1325 1325                  DTRACE_PROBE2(zsd__wait__for__creator,
1326 1326                      zone_t *, zone, struct zsd_entry *, t);
1327 1327                  if (lockp != NULL) {
1328 1328                          dropped = B_TRUE;
1329 1329                          mutex_exit(lockp);
1330 1330                  }
1331 1331                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1332 1332                  if (lockp != NULL) {
1333 1333                          /* First drop zone_lock to preserve order */
1334 1334                          mutex_exit(&zone->zone_lock);
1335 1335                          mutex_enter(lockp);
1336 1336                          mutex_enter(&zone->zone_lock);
1337 1337                  }
1338 1338          }
1339 1339          return (dropped);
1340 1340  }
1341 1341  
1342 1342  /*
1343 1343   * Wait for any INPROGRESS flag to be cleared.
1344 1344   * Returns true if lockp was temporarily dropped while waiting.
1345 1345   */
1346 1346  static boolean_t
1347 1347  zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1348 1348  {
1349 1349          boolean_t dropped = B_FALSE;
1350 1350  
1351 1351          while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1352 1352                  DTRACE_PROBE2(zsd__wait__for__inprogress,
1353 1353                      zone_t *, zone, struct zsd_entry *, t);
1354 1354                  if (lockp != NULL) {
1355 1355                          dropped = B_TRUE;
1356 1356                          mutex_exit(lockp);
1357 1357                  }
1358 1358                  cv_wait(&t->zsd_cv, &zone->zone_lock);
1359 1359                  if (lockp != NULL) {
1360 1360                          /* First drop zone_lock to preserve order */
1361 1361                          mutex_exit(&zone->zone_lock);
1362 1362                          mutex_enter(lockp);
1363 1363                          mutex_enter(&zone->zone_lock);
1364 1364                  }
1365 1365          }
1366 1366          return (dropped);
1367 1367  }
1368 1368  
1369 1369  /*
1370 1370   * Frees memory associated with the zone dataset list.
1371 1371   */
1372 1372  static void
1373 1373  zone_free_datasets(zone_t *zone)
1374 1374  {
1375 1375          zone_dataset_t *t, *next;
1376 1376  
1377 1377          for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1378 1378                  next = list_next(&zone->zone_datasets, t);
1379 1379                  list_remove(&zone->zone_datasets, t);
1380 1380                  kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1381 1381                  kmem_free(t, sizeof (*t));
1382 1382          }
1383 1383          list_destroy(&zone->zone_datasets);
1384 1384  }
1385 1385  
1386 1386  /*
1387 1387   * zone.cpu-shares resource control support.
1388 1388   */
1389 1389  /*ARGSUSED*/
1390 1390  static rctl_qty_t
1391 1391  zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1392 1392  {
1393 1393          ASSERT(MUTEX_HELD(&p->p_lock));
1394 1394          return (p->p_zone->zone_shares);
1395 1395  }
1396 1396  
1397 1397  /*ARGSUSED*/
1398 1398  static int
1399 1399  zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1400 1400      rctl_qty_t nv)
1401 1401  {
1402 1402          ASSERT(MUTEX_HELD(&p->p_lock));
1403 1403          ASSERT(e->rcep_t == RCENTITY_ZONE);
1404 1404          if (e->rcep_p.zone == NULL)
1405 1405                  return (0);
1406 1406  
1407 1407          e->rcep_p.zone->zone_shares = nv;
1408 1408          return (0);
1409 1409  }
1410 1410  
/*
 * Operations table for the zone.cpu-shares resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_cpu_shares_ops = {
        rcop_no_action,
        zone_cpu_shares_usage,
        zone_cpu_shares_set,
        rcop_no_test
};
1417 1417  
1418 1418  /*
1419 1419   * zone.cpu-cap resource control support.
1420 1420   */
1421 1421  /*ARGSUSED*/
1422 1422  static rctl_qty_t
1423 1423  zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1424 1424  {
1425 1425          ASSERT(MUTEX_HELD(&p->p_lock));
1426 1426          return (cpucaps_zone_get(p->p_zone));
1427 1427  }
1428 1428  
1429 1429  /*ARGSUSED*/
1430 1430  static int
1431 1431  zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1432 1432      rctl_qty_t nv)
1433 1433  {
1434 1434          zone_t *zone = e->rcep_p.zone;
1435 1435  
1436 1436          ASSERT(MUTEX_HELD(&p->p_lock));
1437 1437          ASSERT(e->rcep_t == RCENTITY_ZONE);
1438 1438  
1439 1439          if (zone == NULL)
1440 1440                  return (0);
1441 1441  
1442 1442          /*
1443 1443           * set cap to the new value.
1444 1444           */
1445 1445          return (cpucaps_zone_set(zone, nv));
1446 1446  }
1447 1447  
/*
 * Operations table for the zone.cpu-cap resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,
        rcop_no_test
};
1454 1454  
1455 1455  /*ARGSUSED*/
1456 1456  static rctl_qty_t
1457 1457  zone_cpu_base_get(rctl_t *rctl, struct proc *p)
1458 1458  {
1459 1459          ASSERT(MUTEX_HELD(&p->p_lock));
1460 1460          return (cpucaps_zone_get_base(p->p_zone));
1461 1461  }
1462 1462  
1463 1463  /*
1464 1464   * The zone cpu base is used to set the baseline CPU for the zone
1465 1465   * so we can track when the zone is bursting.
1466 1466   */
1467 1467  /*ARGSUSED*/
1468 1468  static int
1469 1469  zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1470 1470      rctl_qty_t nv)
1471 1471  {
1472 1472          zone_t *zone = e->rcep_p.zone;
1473 1473  
1474 1474          ASSERT(MUTEX_HELD(&p->p_lock));
1475 1475          ASSERT(e->rcep_t == RCENTITY_ZONE);
1476 1476  
1477 1477          if (zone == NULL)
1478 1478                  return (0);
1479 1479  
1480 1480          return (cpucaps_zone_set_base(zone, nv));
1481 1481  }
1482 1482  
/*
 * Operations table for the zone CPU base resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_cpu_base_ops = {
        rcop_no_action,
        zone_cpu_base_get,
        zone_cpu_base_set,
        rcop_no_test
};
1489 1489  
1490 1490  /*ARGSUSED*/
1491 1491  static rctl_qty_t
1492 1492  zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
1493 1493  {
1494 1494          ASSERT(MUTEX_HELD(&p->p_lock));
1495 1495          return (cpucaps_zone_get_burst_time(p->p_zone));
1496 1496  }
1497 1497  
1498 1498  /*
1499 1499   * The zone cpu burst time is used to set the amount of time CPU(s) can be
1500 1500   * bursting for the zone.
1501 1501   */
1502 1502  /*ARGSUSED*/
1503 1503  static int
1504 1504  zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1505 1505      rctl_qty_t nv)
1506 1506  {
1507 1507          zone_t *zone = e->rcep_p.zone;
1508 1508  
1509 1509          ASSERT(MUTEX_HELD(&p->p_lock));
1510 1510          ASSERT(e->rcep_t == RCENTITY_ZONE);
1511 1511  
1512 1512          if (zone == NULL)
1513 1513                  return (0);
1514 1514  
1515 1515          return (cpucaps_zone_set_burst_time(zone, nv));
1516 1516  }
1517 1517  
/*
 * Operations table for the zone CPU burst time resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_cpu_burst_time_ops = {
        rcop_no_action,
        zone_cpu_burst_time_get,
        zone_cpu_burst_time_set,
        rcop_no_test
};
1524 1524  
1525 1525  /*
1526 1526   * zone.zfs-io-pri resource control support (IO priority).
1527 1527   */
1528 1528  /*ARGSUSED*/
1529 1529  static rctl_qty_t
1530 1530  zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1531 1531  {
1532 1532          zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
1533 1533          rctl_qty_t r = 0;
1534 1534  
1535 1535          ASSERT(MUTEX_HELD(&p->p_lock));
1536 1536          mutex_enter(&zp->zpers_zfs_lock);
1537 1537          if (zp->zpers_zfsp != NULL)
1538 1538                  r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri;
1539 1539          mutex_exit(&zp->zpers_zfs_lock);
1540 1540  
1541 1541          return (r);
1542 1542  }
1543 1543  
1544 1544  /*ARGSUSED*/
1545 1545  static int
1546 1546  zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1547 1547      rctl_qty_t nv)
1548 1548  {
1549 1549          zone_t *zone = e->rcep_p.zone;
1550 1550          zone_persist_t *zp;
1551 1551  
1552 1552          ASSERT(MUTEX_HELD(&p->p_lock));
1553 1553          ASSERT(e->rcep_t == RCENTITY_ZONE);
1554 1554  
1555 1555          if (zone == NULL)
1556 1556                  return (0);
1557 1557  
1558 1558          /*
1559 1559           * set priority to the new value.
1560 1560           */
1561 1561          zp = &zone_pdata[zone->zone_id];
1562 1562          mutex_enter(&zp->zpers_zfs_lock);
1563 1563          if (zp->zpers_zfsp != NULL)
1564 1564                  zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv;
1565 1565          mutex_exit(&zp->zpers_zfs_lock);
1566 1566          return (0);
1567 1567  }
1568 1568  
/*
 * Operations table for the zone.zfs-io-pri resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_zfs_io_pri_ops = {
        rcop_no_action,
        zone_zfs_io_pri_get,
        zone_zfs_io_pri_set,
        rcop_no_test
};
1575 1575  
1576 1576  /*ARGSUSED*/
1577 1577  static rctl_qty_t
1578 1578  zone_lwps_usage(rctl_t *r, proc_t *p)
1579 1579  {
1580 1580          rctl_qty_t nlwps;
1581 1581          zone_t *zone = p->p_zone;
1582 1582  
1583 1583          ASSERT(MUTEX_HELD(&p->p_lock));
1584 1584  
1585 1585          mutex_enter(&zone->zone_nlwps_lock);
1586 1586          nlwps = zone->zone_nlwps;
1587 1587          mutex_exit(&zone->zone_nlwps_lock);
1588 1588  
1589 1589          return (nlwps);
1590 1590  }
1591 1591  
1592 1592  /*ARGSUSED*/
1593 1593  static int
1594 1594  zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1595 1595      rctl_qty_t incr, uint_t flags)
1596 1596  {
1597 1597          rctl_qty_t nlwps;
1598 1598  
1599 1599          ASSERT(MUTEX_HELD(&p->p_lock));
1600 1600          ASSERT(e->rcep_t == RCENTITY_ZONE);
1601 1601          if (e->rcep_p.zone == NULL)
1602 1602                  return (0);
1603 1603          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1604 1604          nlwps = e->rcep_p.zone->zone_nlwps;
1605 1605  
1606 1606          if (nlwps + incr > rcntl->rcv_value)
1607 1607                  return (1);
1608 1608  
1609 1609          return (0);
1610 1610  }
1611 1611  
1612 1612  /*ARGSUSED*/
1613 1613  static int
1614 1614  zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1615 1615  {
1616 1616          ASSERT(MUTEX_HELD(&p->p_lock));
1617 1617          ASSERT(e->rcep_t == RCENTITY_ZONE);
1618 1618          if (e->rcep_p.zone == NULL)
1619 1619                  return (0);
1620 1620          e->rcep_p.zone->zone_nlwps_ctl = nv;
1621 1621          return (0);
1622 1622  }
1623 1623  
/*
 * Operations table for the per-zone LWP count resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_lwps_ops = {
        rcop_no_action,
        zone_lwps_usage,
        zone_lwps_set,
        zone_lwps_test,
};
1630 1630  
1631 1631  /*ARGSUSED*/
1632 1632  static rctl_qty_t
1633 1633  zone_procs_usage(rctl_t *r, proc_t *p)
1634 1634  {
1635 1635          rctl_qty_t nprocs;
1636 1636          zone_t *zone = p->p_zone;
1637 1637  
1638 1638          ASSERT(MUTEX_HELD(&p->p_lock));
1639 1639  
1640 1640          mutex_enter(&zone->zone_nlwps_lock);
1641 1641          nprocs = zone->zone_nprocs;
1642 1642          mutex_exit(&zone->zone_nlwps_lock);
1643 1643  
1644 1644          return (nprocs);
1645 1645  }
1646 1646  
1647 1647  /*ARGSUSED*/
1648 1648  static int
1649 1649  zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1650 1650      rctl_qty_t incr, uint_t flags)
1651 1651  {
1652 1652          rctl_qty_t nprocs;
1653 1653  
1654 1654          ASSERT(MUTEX_HELD(&p->p_lock));
1655 1655          ASSERT(e->rcep_t == RCENTITY_ZONE);
1656 1656          if (e->rcep_p.zone == NULL)
1657 1657                  return (0);
1658 1658          ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1659 1659          nprocs = e->rcep_p.zone->zone_nprocs;
1660 1660  
1661 1661          if (nprocs + incr > rcntl->rcv_value)
1662 1662                  return (1);
1663 1663  
1664 1664          return (0);
1665 1665  }
1666 1666  
1667 1667  /*ARGSUSED*/
1668 1668  static int
1669 1669  zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1670 1670  {
1671 1671          ASSERT(MUTEX_HELD(&p->p_lock));
1672 1672          ASSERT(e->rcep_t == RCENTITY_ZONE);
1673 1673          if (e->rcep_p.zone == NULL)
1674 1674                  return (0);
1675 1675          e->rcep_p.zone->zone_nprocs_ctl = nv;
1676 1676          return (0);
1677 1677  }
1678 1678  
/*
 * Operations table for the per-zone process count resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_procs_ops = {
        rcop_no_action,
        zone_procs_usage,
        zone_procs_set,
        zone_procs_test,
};
1685 1685  
1686 1686  /*ARGSUSED*/
1687 1687  static rctl_qty_t
1688 1688  zone_shmmax_usage(rctl_t *rctl, struct proc *p)
1689 1689  {
1690 1690          ASSERT(MUTEX_HELD(&p->p_lock));
1691 1691          return (p->p_zone->zone_shmmax);
1692 1692  }
1693 1693  
1694 1694  /*ARGSUSED*/
1695 1695  static int
1696 1696  zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1697 1697      rctl_qty_t incr, uint_t flags)
1698 1698  {
1699 1699          rctl_qty_t v;
1700 1700          ASSERT(MUTEX_HELD(&p->p_lock));
1701 1701          ASSERT(e->rcep_t == RCENTITY_ZONE);
1702 1702          v = e->rcep_p.zone->zone_shmmax + incr;
1703 1703          if (v > rval->rcv_value)
1704 1704                  return (1);
1705 1705          return (0);
1706 1706  }
1707 1707  
/*
 * Operations table for the per-zone shmmax resource control; the set
 * slot is rcop_no_set.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_shmmax_ops = {
        rcop_no_action,
        zone_shmmax_usage,
        rcop_no_set,
        zone_shmmax_test
};
1714 1714  
1715 1715  /*ARGSUSED*/
1716 1716  static rctl_qty_t
1717 1717  zone_shmmni_usage(rctl_t *rctl, struct proc *p)
1718 1718  {
1719 1719          ASSERT(MUTEX_HELD(&p->p_lock));
1720 1720          return (p->p_zone->zone_ipc.ipcq_shmmni);
1721 1721  }
1722 1722  
1723 1723  /*ARGSUSED*/
1724 1724  static int
1725 1725  zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1726 1726      rctl_qty_t incr, uint_t flags)
1727 1727  {
1728 1728          rctl_qty_t v;
1729 1729          ASSERT(MUTEX_HELD(&p->p_lock));
1730 1730          ASSERT(e->rcep_t == RCENTITY_ZONE);
1731 1731          v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1732 1732          if (v > rval->rcv_value)
1733 1733                  return (1);
1734 1734          return (0);
1735 1735  }
1736 1736  
/*
 * Operations table for the per-zone shmmni resource control; the set
 * slot is rcop_no_set.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_shmmni_ops = {
        rcop_no_action,
        zone_shmmni_usage,
        rcop_no_set,
        zone_shmmni_test
};
1743 1743  
1744 1744  /*ARGSUSED*/
1745 1745  static rctl_qty_t
1746 1746  zone_semmni_usage(rctl_t *rctl, struct proc *p)
1747 1747  {
1748 1748          ASSERT(MUTEX_HELD(&p->p_lock));
1749 1749          return (p->p_zone->zone_ipc.ipcq_semmni);
1750 1750  }
1751 1751  
1752 1752  /*ARGSUSED*/
1753 1753  static int
1754 1754  zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1755 1755      rctl_qty_t incr, uint_t flags)
1756 1756  {
1757 1757          rctl_qty_t v;
1758 1758          ASSERT(MUTEX_HELD(&p->p_lock));
1759 1759          ASSERT(e->rcep_t == RCENTITY_ZONE);
1760 1760          v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1761 1761          if (v > rval->rcv_value)
1762 1762                  return (1);
1763 1763          return (0);
1764 1764  }
1765 1765  
/*
 * Operations table for the per-zone semmni resource control; the set
 * slot is rcop_no_set.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_semmni_ops = {
        rcop_no_action,
        zone_semmni_usage,
        rcop_no_set,
        zone_semmni_test
};
1772 1772  
1773 1773  /*ARGSUSED*/
1774 1774  static rctl_qty_t
1775 1775  zone_msgmni_usage(rctl_t *rctl, struct proc *p)
1776 1776  {
1777 1777          ASSERT(MUTEX_HELD(&p->p_lock));
1778 1778          return (p->p_zone->zone_ipc.ipcq_msgmni);
1779 1779  }
1780 1780  
1781 1781  /*ARGSUSED*/
1782 1782  static int
1783 1783  zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1784 1784      rctl_qty_t incr, uint_t flags)
1785 1785  {
1786 1786          rctl_qty_t v;
1787 1787          ASSERT(MUTEX_HELD(&p->p_lock));
1788 1788          ASSERT(e->rcep_t == RCENTITY_ZONE);
1789 1789          v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1790 1790          if (v > rval->rcv_value)
1791 1791                  return (1);
1792 1792          return (0);
1793 1793  }
1794 1794  
/*
 * Operations table for the per-zone msgmni resource control; the set
 * slot is rcop_no_set.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_msgmni_ops = {
        rcop_no_action,
        zone_msgmni_usage,
        rcop_no_set,
        zone_msgmni_test
};
1801 1801  
1802 1802  /*ARGSUSED*/
1803 1803  static rctl_qty_t
1804 1804  zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1805 1805  {
1806 1806          rctl_qty_t q;
1807 1807          ASSERT(MUTEX_HELD(&p->p_lock));
1808 1808          mutex_enter(&p->p_zone->zone_mem_lock);
1809 1809          q = p->p_zone->zone_locked_mem;
1810 1810          mutex_exit(&p->p_zone->zone_mem_lock);
1811 1811          return (q);
1812 1812  }
1813 1813  
1814 1814  /*ARGSUSED*/
1815 1815  static int
1816 1816  zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1817 1817      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1818 1818  {
1819 1819          rctl_qty_t q;
1820 1820          zone_t *z;
1821 1821  
1822 1822          z = e->rcep_p.zone;
1823 1823          ASSERT(MUTEX_HELD(&p->p_lock));
1824 1824          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1825 1825          q = z->zone_locked_mem;
1826 1826          if (q + incr > rcntl->rcv_value)
1827 1827                  return (1);
1828 1828          return (0);
1829 1829  }
1830 1830  
1831 1831  /*ARGSUSED*/
1832 1832  static int
1833 1833  zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1834 1834      rctl_qty_t nv)
1835 1835  {
1836 1836          ASSERT(MUTEX_HELD(&p->p_lock));
1837 1837          ASSERT(e->rcep_t == RCENTITY_ZONE);
1838 1838          if (e->rcep_p.zone == NULL)
1839 1839                  return (0);
1840 1840          e->rcep_p.zone->zone_locked_mem_ctl = nv;
1841 1841          return (0);
1842 1842  }
1843 1843  
/*
 * Operations table for the per-zone locked memory resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_locked_mem_ops = {
        rcop_no_action,
        zone_locked_mem_usage,
        zone_locked_mem_set,
        zone_locked_mem_test
};
1850 1850  
1851 1851  /*ARGSUSED*/
1852 1852  static rctl_qty_t
1853 1853  zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1854 1854  {
1855 1855          rctl_qty_t q;
1856 1856          zone_t *z = p->p_zone;
1857 1857  
1858 1858          ASSERT(MUTEX_HELD(&p->p_lock));
1859 1859          mutex_enter(&z->zone_mem_lock);
1860 1860          q = z->zone_max_swap;
1861 1861          mutex_exit(&z->zone_mem_lock);
1862 1862          return (q);
1863 1863  }
1864 1864  
1865 1865  /*ARGSUSED*/
1866 1866  static int
1867 1867  zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1868 1868      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1869 1869  {
1870 1870          rctl_qty_t q;
1871 1871          zone_t *z;
1872 1872  
1873 1873          z = e->rcep_p.zone;
1874 1874          ASSERT(MUTEX_HELD(&p->p_lock));
1875 1875          ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1876 1876          q = z->zone_max_swap;
1877 1877          if (q + incr > rcntl->rcv_value)
1878 1878                  return (1);
1879 1879          return (0);
1880 1880  }
1881 1881  
1882 1882  /*ARGSUSED*/
1883 1883  static int
1884 1884  zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1885 1885      rctl_qty_t nv)
1886 1886  {
1887 1887          ASSERT(MUTEX_HELD(&p->p_lock));
1888 1888          ASSERT(e->rcep_t == RCENTITY_ZONE);
1889 1889          if (e->rcep_p.zone == NULL)
1890 1890                  return (0);
1891 1891          e->rcep_p.zone->zone_max_swap_ctl = nv;
1892 1892          return (0);
1893 1893  }
1894 1894  
/*
 * Operations table for the per-zone max swap resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_max_swap_ops = {
        rcop_no_action,
        zone_max_swap_usage,
        zone_max_swap_set,
        zone_max_swap_test
};
1901 1901  
1902 1902  /*ARGSUSED*/
1903 1903  static rctl_qty_t
1904 1904  zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1905 1905  {
1906 1906          rctl_qty_t q;
1907 1907          zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id];
1908 1908  
1909 1909          ASSERT(MUTEX_HELD(&p->p_lock));
1910 1910          q = ptob(zp->zpers_pg_cnt);
1911 1911          return (q);
1912 1912  }
1913 1913  
1914 1914  /*ARGSUSED*/
1915 1915  static int
1916 1916  zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1917 1917      rctl_qty_t nv)
1918 1918  {
1919 1919          zoneid_t zid;
1920 1920          uint_t pg_val;
1921 1921  
1922 1922          ASSERT(MUTEX_HELD(&p->p_lock));
1923 1923          ASSERT(e->rcep_t == RCENTITY_ZONE);
1924 1924          if (e->rcep_p.zone == NULL)
1925 1925                  return (0);
1926 1926          zid = e->rcep_p.zone->zone_id;
1927 1927          if (nv == UINT64_MAX) {
1928 1928                  pg_val = UINT32_MAX;
1929 1929          } else {
1930 1930                  uint64_t pages = btop(nv);
1931 1931  
1932 1932                  /*
1933 1933                   * Return from RCTLOP_SET is always ignored so just clamp an
1934 1934                   * out-of-range value to our largest "limited" value.
1935 1935                   */
1936 1936                  if (pages >= UINT32_MAX) {
1937 1937                          pg_val = UINT32_MAX - 1;
1938 1938                  } else {
1939 1939                          pg_val = (uint_t)pages;
1940 1940                  }
1941 1941          }
1942 1942          zone_pdata[zid].zpers_pg_limit = pg_val;
1943 1943          return (0);
1944 1944  }
1945 1945  
/*
 * Operations table for the per-zone physical memory resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_phys_mem_ops = {
        rcop_no_action,
        zone_phys_mem_usage,
        zone_phys_mem_set,
        rcop_no_test
};
1952 1952  
1953 1953  /*ARGSUSED*/
1954 1954  static rctl_qty_t
1955 1955  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1956 1956  {
1957 1957          rctl_qty_t q;
1958 1958          zone_t *z = p->p_zone;
1959 1959  
1960 1960          ASSERT(MUTEX_HELD(&p->p_lock));
1961 1961          mutex_enter(&z->zone_rctl_lock);
1962 1962          q = z->zone_max_lofi;
1963 1963          mutex_exit(&z->zone_rctl_lock);
1964 1964          return (q);
1965 1965  }
1966 1966  
1967 1967  /*ARGSUSED*/
1968 1968  static int
1969 1969  zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1970 1970      rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1971 1971  {
1972 1972          rctl_qty_t q;
1973 1973          zone_t *z;
1974 1974  
1975 1975          z = e->rcep_p.zone;
1976 1976          ASSERT(MUTEX_HELD(&p->p_lock));
1977 1977          ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1978 1978          q = z->zone_max_lofi;
1979 1979          if (q + incr > rcntl->rcv_value)
1980 1980                  return (1);
1981 1981          return (0);
1982 1982  }
1983 1983  
1984 1984  /*ARGSUSED*/
1985 1985  static int
1986 1986  zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1987 1987      rctl_qty_t nv)
1988 1988  {
1989 1989          ASSERT(MUTEX_HELD(&p->p_lock));
1990 1990          ASSERT(e->rcep_t == RCENTITY_ZONE);
1991 1991          if (e->rcep_p.zone == NULL)
1992 1992                  return (0);
1993 1993          e->rcep_p.zone->zone_max_lofi_ctl = nv;
1994 1994          return (0);
1995 1995  }
1996 1996  
/*
 * Operations table for the per-zone max lofi resource control.
 * NOTE(review): slot roles (action, usage, set, test) are inferred from
 * the handler names used in the tables in this file; confirm against the
 * rctl_ops_t definition.
 */
static rctl_ops_t zone_max_lofi_ops = {
        rcop_no_action,
        zone_max_lofi_usage,
        zone_max_lofi_set,
        zone_max_lofi_test
};
2003 2003  
2004 2004  /*
2005 2005   * Helper function to brand the zone with a unique ID.
2006 2006   */
2007 2007  static void
2008 2008  zone_uniqid(zone_t *zone)
2009 2009  {
2010 2010          static uint64_t uniqid = 0;
2011 2011  
2012 2012          ASSERT(MUTEX_HELD(&zonehash_lock));
2013 2013          zone->zone_uniqid = uniqid++;
2014 2014  }
2015 2015  
2016 2016  /*
2017 2017   * Returns a held pointer to the "kcred" for the specified zone.
2018 2018   */
2019 2019  struct cred *
2020 2020  zone_get_kcred(zoneid_t zoneid)
2021 2021  {
2022 2022          zone_t *zone;
2023 2023          cred_t *cr;
2024 2024  
2025 2025          if ((zone = zone_find_by_id(zoneid)) == NULL)
2026 2026                  return (NULL);
2027 2027          cr = zone->zone_kcred;
2028 2028          crhold(cr);
2029 2029          zone_rele(zone);
2030 2030          return (cr);
2031 2031  }
2032 2032  
2033 2033  static int
2034 2034  zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
2035 2035  {
2036 2036          zone_t *zone = ksp->ks_private;
2037 2037          zone_kstat_t *zk = ksp->ks_data;
2038 2038  
2039 2039          if (rw == KSTAT_WRITE)
2040 2040                  return (EACCES);
2041 2041  
2042 2042          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
2043 2043          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
2044 2044          return (0);
2045 2045  }
2046 2046  
2047 2047  static int
2048 2048  zone_physmem_kstat_update(kstat_t *ksp, int rw)
2049 2049  {
2050 2050          zone_t *zone = ksp->ks_private;
2051 2051          zone_kstat_t *zk = ksp->ks_data;
2052 2052          zone_persist_t *zp = &zone_pdata[zone->zone_id];
2053 2053  
2054 2054          if (rw == KSTAT_WRITE)
2055 2055                  return (EACCES);
2056 2056  
2057 2057          zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt);
2058 2058          zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit);
2059 2059          return (0);
2060 2060  }
2061 2061  
2062 2062  static int
2063 2063  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
2064 2064  {
2065 2065          zone_t *zone = ksp->ks_private;
2066 2066          zone_kstat_t *zk = ksp->ks_data;
2067 2067  
2068 2068          if (rw == KSTAT_WRITE)
2069 2069                  return (EACCES);
2070 2070  
2071 2071          zk->zk_usage.value.ui64 = zone->zone_nprocs;
2072 2072          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
2073 2073          return (0);
2074 2074  }
2075 2075  
2076 2076  static int
2077 2077  zone_swapresv_kstat_update(kstat_t *ksp, int rw)
2078 2078  {
2079 2079          zone_t *zone = ksp->ks_private;
2080 2080          zone_kstat_t *zk = ksp->ks_data;
2081 2081  
2082 2082          if (rw == KSTAT_WRITE)
2083 2083                  return (EACCES);
2084 2084  
2085 2085          zk->zk_usage.value.ui64 = zone->zone_max_swap;
2086 2086          zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
2087 2087          return (0);
2088 2088  }
2089 2089  
2090 2090  static kstat_t *
2091 2091  zone_rctl_kstat_create_common(zone_t *zone, char *name,
2092 2092      int (*updatefunc) (kstat_t *, int))
2093 2093  {
2094 2094          kstat_t *ksp;
2095 2095          zone_kstat_t *zk;
2096 2096  
2097 2097          ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
2098 2098              sizeof (zone_kstat_t) / sizeof (kstat_named_t),
2099 2099              KSTAT_FLAG_VIRTUAL);
2100 2100  
2101 2101          if (ksp == NULL)
2102 2102                  return (NULL);
2103 2103  
2104 2104          zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
2105 2105          ksp->ks_data_size += strlen(zone->zone_name) + 1;
2106 2106          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
2107 2107          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
2108 2108          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
2109 2109          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
2110 2110          ksp->ks_update = updatefunc;
2111 2111          ksp->ks_private = zone;
2112 2112          kstat_install(ksp);
2113 2113          return (ksp);
2114 2114  }
2115 2115  
2116 2116  static int
2117 2117  zone_vfs_kstat_update(kstat_t *ksp, int rw)
2118 2118  {
2119 2119          zone_t *zone = ksp->ks_private;
2120 2120          zone_vfs_kstat_t *zvp = ksp->ks_data;
2121 2121          kstat_io_t *kiop = &zone->zone_vfs_rwstats;
2122 2122  
2123 2123          if (rw == KSTAT_WRITE)
2124 2124                  return (EACCES);
2125 2125  
2126 2126          /*
2127 2127           * Extract the VFS statistics from the kstat_io_t structure used by
2128 2128           * kstat_runq_enter() and related functions.  Since the slow ops
2129 2129           * counters are updated directly by the VFS layer, there's no need to
2130 2130           * copy those statistics here.
2131 2131           *
2132 2132           * Note that kstat_runq_enter() and the related functions use
2133 2133           * gethrtime_unscaled(), so scale the time here.
2134 2134           */
2135 2135          zvp->zv_nread.value.ui64 = kiop->nread;
2136 2136          zvp->zv_reads.value.ui64 = kiop->reads;
2137 2137          zvp->zv_rtime.value.ui64 = kiop->rtime;
2138 2138          zvp->zv_rcnt.value.ui64 = kiop->rcnt;
2139 2139          zvp->zv_rlentime.value.ui64 = kiop->rlentime;
2140 2140          zvp->zv_nwritten.value.ui64 = kiop->nwritten;
2141 2141          zvp->zv_writes.value.ui64 = kiop->writes;
2142 2142          zvp->zv_wtime.value.ui64 = kiop->wtime;
2143 2143          zvp->zv_wcnt.value.ui64 = kiop->wcnt;
2144 2144          zvp->zv_wlentime.value.ui64 = kiop->wlentime;
2145 2145  
2146 2146          scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
2147 2147          scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
2148 2148          scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
2149 2149          scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
2150 2150  
2151 2151          return (0);
2152 2152  }
2153 2153  
2154 2154  static kstat_t *
2155 2155  zone_vfs_kstat_create(zone_t *zone)
2156 2156  {
2157 2157          kstat_t *ksp;
2158 2158          zone_vfs_kstat_t *zvp;
2159 2159  
2160 2160          if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
2161 2161              zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
2162 2162              sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
2163 2163              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2164 2164                  return (NULL);
2165 2165  
2166 2166          if (zone->zone_id != GLOBAL_ZONEID)
2167 2167                  kstat_zone_add(ksp, GLOBAL_ZONEID);
2168 2168  
2169 2169          zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
2170 2170          ksp->ks_data_size += strlen(zone->zone_name) + 1;
2171 2171          ksp->ks_lock = &zone->zone_vfs_lock;
2172 2172          zone->zone_vfs_stats = zvp;
2173 2173  
2174 2174          /* The kstat "name" field is not large enough for a full zonename */
2175 2175          kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
2176 2176          kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
2177 2177          kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
2178 2178          kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
2179 2179          kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
2180 2180          kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
2181 2181          kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
2182 2182          kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
2183 2183          kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
2184 2184          kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
2185 2185          kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
2186 2186          kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
2187 2187          kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
2188 2188          kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
2189 2189          kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
2190 2190          kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
2191 2191          kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
2192 2192          kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
2193 2193  
2194 2194          ksp->ks_update = zone_vfs_kstat_update;
2195 2195          ksp->ks_private = zone;
2196 2196  
2197 2197          kstat_install(ksp);
2198 2198          return (ksp);
2199 2199  }
2200 2200  
2201 2201  static int
2202 2202  zone_zfs_kstat_update(kstat_t *ksp, int rw)
2203 2203  {
2204 2204          zone_t *zone = ksp->ks_private;
2205 2205          zone_zfs_kstat_t *zzp = ksp->ks_data;
2206 2206          zone_persist_t *zp = &zone_pdata[zone->zone_id];
2207 2207  
2208 2208          if (rw == KSTAT_WRITE)
2209 2209                  return (EACCES);
2210 2210  
2211 2211          mutex_enter(&zp->zpers_zfs_lock);
2212 2212          if (zp->zpers_zfsp == NULL) {
2213 2213                  zzp->zz_nread.value.ui64 = 0;
2214 2214                  zzp->zz_reads.value.ui64 = 0;
2215 2215                  zzp->zz_rtime.value.ui64 = 0;
2216 2216                  zzp->zz_rlentime.value.ui64 = 0;
2217 2217                  zzp->zz_nwritten.value.ui64 = 0;
2218 2218                  zzp->zz_writes.value.ui64 = 0;
2219 2219                  zzp->zz_waittime.value.ui64 = 0;
2220 2220          } else {
2221 2221                  kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats;
2222 2222  
2223 2223                  /*
2224 2224                   * Extract the ZFS statistics from the kstat_io_t structure
2225 2225                   * used by kstat_runq_enter() and related functions. Since the
2226 2226                   * I/O throttle counters are updated directly by the ZFS layer,
2227 2227                   * there's no need to copy those statistics here.
2228 2228                   *
2229 2229                   * Note that kstat_runq_enter() and the related functions use
2230 2230                   * gethrtime_unscaled(), so scale the time here.
2231 2231                   */
2232 2232                  zzp->zz_nread.value.ui64 = kiop->nread;
2233 2233                  zzp->zz_reads.value.ui64 = kiop->reads;
2234 2234                  zzp->zz_rtime.value.ui64 = kiop->rtime;
2235 2235                  zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2236 2236                  zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2237 2237                  zzp->zz_writes.value.ui64 = kiop->writes;
2238 2238                  zzp->zz_waittime.value.ui64 =
2239 2239                      zp->zpers_zfsp->zpers_zfs_rd_waittime;
2240 2240          }
2241 2241          mutex_exit(&zp->zpers_zfs_lock);
2242 2242  
2243 2243          scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2244 2244          scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2245 2245  
2246 2246          return (0);
2247 2247  }
2248 2248  
2249 2249  static kstat_t *
2250 2250  zone_zfs_kstat_create(zone_t *zone)
2251 2251  {
2252 2252          kstat_t *ksp;
2253 2253          zone_zfs_kstat_t *zzp;
2254 2254  
2255 2255          if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
2256 2256              zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
2257 2257              sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
2258 2258              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2259 2259                  return (NULL);
2260 2260  
2261 2261          if (zone->zone_id != GLOBAL_ZONEID)
2262 2262                  kstat_zone_add(ksp, GLOBAL_ZONEID);
2263 2263  
2264 2264          zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
2265 2265          ksp->ks_data_size += strlen(zone->zone_name) + 1;
2266 2266          ksp->ks_lock = &zone->zone_zfs_lock;
2267 2267          zone->zone_zfs_stats = zzp;
2268 2268  
2269 2269          /* The kstat "name" field is not large enough for a full zonename */
2270 2270          kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
2271 2271          kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
2272 2272          kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
2273 2273          kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
2274 2274          kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
2275 2275          kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
2276 2276          kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
2277 2277          kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
2278 2278          kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
2279 2279  
2280 2280          ksp->ks_update = zone_zfs_kstat_update;
2281 2281          ksp->ks_private = zone;
2282 2282  
2283 2283          kstat_install(ksp);
2284 2284          return (ksp);
2285 2285  }
2286 2286  
2287 2287  static int
2288 2288  zone_mcap_kstat_update(kstat_t *ksp, int rw)
2289 2289  {
2290 2290          zone_t *zone = ksp->ks_private;
2291 2291          zone_mcap_kstat_t *zmp = ksp->ks_data;
2292 2292          zone_persist_t *zp;
2293 2293  
2294 2294          if (rw == KSTAT_WRITE)
2295 2295                  return (EACCES);
2296 2296  
2297 2297          zp = &zone_pdata[zone->zone_id];
2298 2298  
2299 2299          zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt);
2300 2300          zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit);
2301 2301          zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2302 2302          zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2303 2303          zmp->zm_nover.value.ui64 = zp->zpers_nover;
2304 2304  #ifndef DEBUG
2305 2305          zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out);
2306 2306  #else
2307 2307          zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty +
2308 2308              zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty);
2309 2309  #endif
2310 2310          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2311 2311          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2312 2312          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2313 2313          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2314 2314          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2315 2315  
2316 2316          return (0);
2317 2317  }
2318 2318  
2319 2319  static kstat_t *
2320 2320  zone_mcap_kstat_create(zone_t *zone)
2321 2321  {
2322 2322          kstat_t *ksp;
2323 2323          zone_mcap_kstat_t *zmp;
2324 2324  
2325 2325          if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2326 2326              zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2327 2327              sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2328 2328              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2329 2329                  return (NULL);
2330 2330  
2331 2331          if (zone->zone_id != GLOBAL_ZONEID)
2332 2332                  kstat_zone_add(ksp, GLOBAL_ZONEID);
2333 2333  
2334 2334          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2335 2335          ksp->ks_data_size += strlen(zone->zone_name) + 1;
2336 2336          ksp->ks_lock = &zone->zone_mcap_lock;
2337 2337          zone->zone_mcap_stats = zmp;
2338 2338  
2339 2339          /* The kstat "name" field is not large enough for a full zonename */
2340 2340          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2341 2341          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2342 2342          kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2343 2343          kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2344 2344          kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2345 2345          kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2346 2346          kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2347 2347          kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2348 2348          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2349 2349          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2350 2350          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2351 2351          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2352 2352          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2353 2353              KSTAT_DATA_UINT64);
2354 2354  
2355 2355          ksp->ks_update = zone_mcap_kstat_update;
2356 2356          ksp->ks_private = zone;
2357 2357  
2358 2358          kstat_install(ksp);
2359 2359          return (ksp);
2360 2360  }
2361 2361  
2362 2362  static int
2363 2363  zone_misc_kstat_update(kstat_t *ksp, int rw)
2364 2364  {
2365 2365          zone_t *zone = ksp->ks_private;
2366 2366          zone_misc_kstat_t *zmp = ksp->ks_data;
2367 2367          hrtime_t hrtime;
2368 2368          uint64_t tmp;
2369 2369  
2370 2370          if (rw == KSTAT_WRITE)
2371 2371                  return (EACCES);
2372 2372  
2373 2373          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
2374 2374          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
2375 2375          scalehrtime(&hrtime);
2376 2376          zmp->zm_stime.value.ui64 = hrtime;
2377 2377  
2378 2378          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
2379 2379          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
2380 2380          scalehrtime(&hrtime);
2381 2381          zmp->zm_utime.value.ui64 = hrtime;
2382 2382  
2383 2383          tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
2384 2384          hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
2385 2385          scalehrtime(&hrtime);
2386 2386          zmp->zm_wtime.value.ui64 = hrtime;
2387 2387  
2388 2388          zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2389 2389          zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2390 2390          zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2391 2391  
2392 2392          zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2393 2393          zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2394 2394          zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2395 2395          zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2396 2396  
2397 2397          zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
2398 2398  
2399 2399          zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2400 2400  
2401 2401          zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2402 2402          zmp->zm_init_restarts.value.ui32 = zone->zone_proc_init_restarts;
2403 2403          zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2404 2404  
2405 2405          return (0);
2406 2406  }
2407 2407  
2408 2408  static kstat_t *
2409 2409  zone_misc_kstat_create(zone_t *zone)
2410 2410  {
2411 2411          kstat_t *ksp;
2412 2412          zone_misc_kstat_t *zmp;
2413 2413  
2414 2414          if ((ksp = kstat_create_zone("zones", zone->zone_id,
2415 2415              zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2416 2416              sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2417 2417              KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2418 2418                  return (NULL);
2419 2419  
2420 2420          if (zone->zone_id != GLOBAL_ZONEID)
2421 2421                  kstat_zone_add(ksp, GLOBAL_ZONEID);
2422 2422  
2423 2423          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2424 2424          ksp->ks_data_size += strlen(zone->zone_name) + 1;
2425 2425          ksp->ks_lock = &zone->zone_misc_lock;
2426 2426          zone->zone_misc_stats = zmp;
2427 2427  
2428 2428          /* The kstat "name" field is not large enough for a full zonename */
2429 2429          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2430 2430          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2431 2431          kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2432 2432          kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2433 2433          kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2434 2434          kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2435 2435          kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2436 2436          kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2437 2437              KSTAT_DATA_UINT32);
2438 2438          kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2439 2439          kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2440 2440              KSTAT_DATA_UINT32);
2441 2441          kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2442 2442          kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2443 2443          kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
2444 2444              KSTAT_DATA_UINT32);
2445 2445          kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2446 2446              KSTAT_DATA_UINT32);
2447 2447          kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2448 2448          kstat_named_init(&zmp->zm_init_restarts, "init_restarts",
2449 2449              KSTAT_DATA_UINT32);
2450 2450          kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2451 2451  
2452 2452          ksp->ks_update = zone_misc_kstat_update;
2453 2453          ksp->ks_private = zone;
2454 2454  
2455 2455          kstat_install(ksp);
2456 2456          return (ksp);
2457 2457  }
2458 2458  
2459 2459  static void
2460 2460  zone_kstat_create(zone_t *zone)
2461 2461  {
2462 2462          zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
2463 2463              "lockedmem", zone_lockedmem_kstat_update);
2464 2464          zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
2465 2465              "swapresv", zone_swapresv_kstat_update);
2466 2466          zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
2467 2467              "physicalmem", zone_physmem_kstat_update);
2468 2468          zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
2469 2469              "nprocs", zone_nprocs_kstat_update);
2470 2470  
2471 2471          if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2472 2472                  zone->zone_vfs_stats = kmem_zalloc(
2473 2473                      sizeof (zone_vfs_kstat_t), KM_SLEEP);
2474 2474          }
2475 2475  
2476 2476          if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
2477 2477                  zone->zone_zfs_stats = kmem_zalloc(
2478 2478                      sizeof (zone_zfs_kstat_t), KM_SLEEP);
2479 2479          }
2480 2480  
2481 2481          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2482 2482                  zone->zone_mcap_stats = kmem_zalloc(
2483 2483                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
2484 2484          }
2485 2485  
2486 2486          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2487 2487                  zone->zone_misc_stats = kmem_zalloc(
2488 2488                      sizeof (zone_misc_kstat_t), KM_SLEEP);
2489 2489          }
2490 2490  }
2491 2491  
2492 2492  static void
2493 2493  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2494 2494  {
2495 2495          void *data;
2496 2496  
2497 2497          if (*pkstat != NULL) {
2498 2498                  data = (*pkstat)->ks_data;
2499 2499                  kstat_delete(*pkstat);
2500 2500                  kmem_free(data, datasz);
2501 2501                  *pkstat = NULL;
2502 2502          }
2503 2503  }
2504 2504  
2505 2505  static void
2506 2506  zone_kstat_delete(zone_t *zone)
2507 2507  {
2508 2508          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2509 2509              sizeof (zone_kstat_t));
2510 2510          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2511 2511              sizeof (zone_kstat_t));
2512 2512          zone_kstat_delete_common(&zone->zone_physmem_kstat,
2513 2513              sizeof (zone_kstat_t));
2514 2514          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2515 2515              sizeof (zone_kstat_t));
2516 2516  
2517 2517          zone_kstat_delete_common(&zone->zone_vfs_ksp,
2518 2518              sizeof (zone_vfs_kstat_t));
2519 2519          zone_kstat_delete_common(&zone->zone_zfs_ksp,
2520 2520              sizeof (zone_zfs_kstat_t));
2521 2521          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2522 2522              sizeof (zone_mcap_kstat_t));
2523 2523          zone_kstat_delete_common(&zone->zone_misc_ksp,
2524 2524              sizeof (zone_misc_kstat_t));
2525 2525  }
2526 2526  
2527 2527  /*
2528 2528   * Called very early on in boot to initialize the ZSD list so that
2529 2529   * zone_key_create() can be called before zone_init().  It also initializes
2530 2530   * portions of zone0 which may be used before zone_init() is called.  The
2531 2531   * variable "global_zone" will be set when zone0 is fully initialized by
2532 2532   * zone_init().
2533 2533   */
2534 2534  void
2535 2535  zone_zsd_init(void)
2536 2536  {
2537 2537          mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2538 2538          mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2539 2539          list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2540 2540              offsetof(struct zsd_entry, zsd_linkage));
2541 2541          list_create(&zone_active, sizeof (zone_t),
2542 2542              offsetof(zone_t, zone_linkage));
2543 2543          list_create(&zone_deathrow, sizeof (zone_t),
2544 2544              offsetof(zone_t, zone_linkage));
2545 2545  
2546 2546          mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2547 2547          mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2548 2548          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2549 2549          zone0.zone_shares = 1;
2550 2550          zone0.zone_nlwps = 0;
2551 2551          zone0.zone_nlwps_ctl = INT_MAX;
2552 2552          zone0.zone_nprocs = 0;
2553 2553          zone0.zone_nprocs_ctl = INT_MAX;
2554 2554          zone0.zone_locked_mem = 0;
2555 2555          zone0.zone_locked_mem_ctl = UINT64_MAX;
2556 2556          ASSERT(zone0.zone_max_swap == 0);
2557 2557          zone0.zone_max_swap_ctl = UINT64_MAX;
2558 2558          zone0.zone_max_lofi = 0;
2559 2559          zone0.zone_max_lofi_ctl = UINT64_MAX;
2560 2560          zone0.zone_shmmax = 0;
2561 2561          zone0.zone_ipc.ipcq_shmmni = 0;
2562 2562          zone0.zone_ipc.ipcq_semmni = 0;
2563 2563          zone0.zone_ipc.ipcq_msgmni = 0;
2564 2564          zone0.zone_name = GLOBAL_ZONENAME;
2565 2565          zone0.zone_nodename = utsname.nodename;
2566 2566          zone0.zone_domain = srpc_domain;
2567 2567          zone0.zone_hostid = HW_INVALID_HOSTID;
2568 2568          zone0.zone_fs_allowed = NULL;
2569 2569          psecflags_default(&zone0.zone_secflags);
2570 2570          zone0.zone_ref = 1;
2571 2571          zone0.zone_id = GLOBAL_ZONEID;
2572 2572          zone0.zone_status = ZONE_IS_RUNNING;
2573 2573          zone0.zone_rootpath = "/";
2574 2574          zone0.zone_rootpathlen = 2;
2575 2575          zone0.zone_psetid = ZONE_PS_INVAL;
2576 2576          zone0.zone_ncpus = 0;
2577 2577          zone0.zone_ncpus_online = 0;
2578 2578          zone0.zone_proc_initpid = 1;
2579 2579          zone0.zone_initname = initname;
2580 2580          zone0.zone_lockedmem_kstat = NULL;
2581 2581          zone0.zone_swapresv_kstat = NULL;
2582 2582          zone0.zone_physmem_kstat = NULL;
2583 2583          zone0.zone_nprocs_kstat = NULL;
2584 2584  
2585 2585          zone_pdata[0].zpers_zfsp = &zone0_zp_zfs;
2586 2586          zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1;
2587 2587  
2588 2588          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2589 2589              offsetof(zone_ref_t, zref_linkage));
2590 2590          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2591 2591              offsetof(struct zsd_entry, zsd_linkage));
2592 2592          list_insert_head(&zone_active, &zone0);
2593 2593  
2594 2594          /*
2595 2595           * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2596 2596           * to anything meaningful.  It is assigned to be 'rootdir' in
2597 2597           * vfs_mountroot().
2598 2598           */
2599 2599          zone0.zone_rootvp = NULL;
2600 2600          zone0.zone_vfslist = NULL;
2601 2601          zone0.zone_bootargs = initargs;
2602 2602          zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2603 2603          /*
2604 2604           * The global zone has all privileges
2605 2605           */
2606 2606          priv_fillset(zone0.zone_privset);
2607 2607          /*
2608 2608           * Add p0 to the global zone
2609 2609           */
2610 2610          zone0.zone_zsched = &p0;
2611 2611          p0.p_zone = &zone0;
2612 2612  }
2613 2613  
2614 2614  /*
2615 2615   * Compute a hash value based on the contents of the label and the DOI.  The
2616 2616   * hash algorithm is somewhat arbitrary, but is based on the observation that
2617 2617   * humans will likely pick labels that differ by amounts that work out to be
2618 2618   * multiples of the number of hash chains, and thus stirring in some primes
2619 2619   * should help.
2620 2620   */
2621 2621  static uint_t
2622 2622  hash_bylabel(void *hdata, mod_hash_key_t key)
2623 2623  {
2624 2624          const ts_label_t *lab = (ts_label_t *)key;
2625 2625          const uint32_t *up, *ue;
2626 2626          uint_t hash;
2627 2627          int i;
2628 2628  
2629 2629          _NOTE(ARGUNUSED(hdata));
2630 2630  
2631 2631          hash = lab->tsl_doi + (lab->tsl_doi << 1);
2632 2632          /* we depend on alignment of label, but not representation */
2633 2633          up = (const uint32_t *)&lab->tsl_label;
2634 2634          ue = up + sizeof (lab->tsl_label) / sizeof (*up);
2635 2635          i = 1;
2636 2636          while (up < ue) {
2637 2637                  /* using 2^n + 1, 1 <= n <= 16 as source of many primes */
2638 2638                  hash += *up + (*up << ((i % 16) + 1));
2639 2639                  up++;
2640 2640                  i++;
2641 2641          }
2642 2642          return (hash);
2643 2643  }
2644 2644  
2645 2645  /*
2646 2646   * All that mod_hash cares about here is zero (equal) versus non-zero (not
2647 2647   * equal).  This may need to be changed if less than / greater than is ever
2648 2648   * needed.
2649 2649   */
2650 2650  static int
2651 2651  hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
2652 2652  {
2653 2653          ts_label_t *lab1 = (ts_label_t *)key1;
2654 2654          ts_label_t *lab2 = (ts_label_t *)key2;
2655 2655  
2656 2656          return (label_equal(lab1, lab2) ? 0 : 1);
2657 2657  }
2658 2658  
2659 2659  /*
2660 2660   * Called by main() to initialize the zones framework.
2661 2661   */
2662 2662  void
2663 2663  zone_init(void)
2664 2664  {
2665 2665          rctl_dict_entry_t *rde;
2666 2666          rctl_val_t *dval;
2667 2667          rctl_set_t *set;
2668 2668          rctl_alloc_gp_t *gp;
2669 2669          rctl_entity_p_t e;
2670 2670          int res;
2671 2671  
2672 2672          ASSERT(curproc == &p0);
2673 2673  
2674 2674          /*
2675 2675           * Create ID space for zone IDs.  ID 0 is reserved for the
2676 2676           * global zone.
2677 2677           */
2678 2678          zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2679 2679  
2680 2680          /*
2681 2681           * Initialize generic zone resource controls, if any.
2682 2682           */
2683 2683          rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2684 2684              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2685 2685              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2686 2686              FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2687 2687  
2688 2688          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2689 2689              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2690 2690              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2691 2691              RCTL_GLOBAL_INFINITE,
2692 2692              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2693 2693  
2694 2694          rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
2695 2695              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2696 2696              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2697 2697              MAXCAP, MAXCAP, &zone_cpu_base_ops);
2698 2698  
2699 2699          rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
2700 2700              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2701 2701              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2702 2702              INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
2703 2703  
2704 2704          rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2705 2705              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2706 2706              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2707 2707              16384, 16384, &zone_zfs_io_pri_ops);
2708 2708  
2709 2709          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2710 2710              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2711 2711              INT_MAX, INT_MAX, &zone_lwps_ops);
2712 2712  
2713 2713          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2714 2714              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2715 2715              INT_MAX, INT_MAX, &zone_procs_ops);
2716 2716  
2717 2717          /*
2718 2718           * System V IPC resource controls
2719 2719           */
2720 2720          rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2721 2721              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2722 2722              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2723 2723  
2724 2724          rc_zone_semmni = rctl_register("zone.max-sem-ids",
2725 2725              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2726 2726              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2727 2727  
2728 2728          rc_zone_shmmni = rctl_register("zone.max-shm-ids",
2729 2729              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2730 2730              RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2731 2731  
2732 2732          rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2733 2733              RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2734 2734              RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2735 2735  
2736 2736          /*
2737 2737           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2738 2738           * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2739 2739           */
2740 2740          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2741 2741          bzero(dval, sizeof (rctl_val_t));
2742 2742          dval->rcv_value = 1;
2743 2743          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2744 2744          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2745 2745          dval->rcv_action_recip_pid = -1;
2746 2746  
2747 2747          rde = rctl_dict_lookup("zone.cpu-shares");
2748 2748          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2749 2749  
2750 2750          /*
2751 2751           * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2752 2752           * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'.
2753 2753           */
2754 2754          dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2755 2755          bzero(dval, sizeof (rctl_val_t));
2756 2756          dval->rcv_value = 1;
2757 2757          dval->rcv_privilege = RCPRIV_PRIVILEGED;
2758 2758          dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2759 2759          dval->rcv_action_recip_pid = -1;
2760 2760  
2761 2761          rde = rctl_dict_lookup("zone.zfs-io-priority");
2762 2762          (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2763 2763  
2764 2764          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2765 2765              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2766 2766              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2767 2767              &zone_locked_mem_ops);
2768 2768  
2769 2769          rc_zone_max_swap = rctl_register("zone.max-swap",
2770 2770              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2771 2771              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2772 2772              &zone_max_swap_ops);
2773 2773  
2774 2774          rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2775 2775              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2776 2776              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2777 2777              &zone_phys_mem_ops);
2778 2778  
2779 2779          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2780 2780              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2781 2781              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2782 2782              &zone_max_lofi_ops);
2783 2783  
2784 2784          /*
2785 2785           * Initialize the ``global zone''.
2786 2786           */
2787 2787          set = rctl_set_create();
2788 2788          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2789 2789          mutex_enter(&p0.p_lock);
2790 2790          e.rcep_p.zone = &zone0;
2791 2791          e.rcep_t = RCENTITY_ZONE;
2792 2792          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2793 2793              gp);
2794 2794  
2795 2795          zone0.zone_nlwps = p0.p_lwpcnt;
2796 2796          zone0.zone_nprocs = 1;
2797 2797          zone0.zone_ntasks = 1;
2798 2798          mutex_exit(&p0.p_lock);
2799 2799          zone0.zone_restart_init = B_TRUE;
2800 2800          zone0.zone_reboot_on_init_exit = B_FALSE;
2801 2801          zone0.zone_restart_init_0 = B_FALSE;
2802 2802          zone0.zone_init_status = -1;
2803 2803          zone0.zone_brand = &native_brand;
2804 2804          rctl_prealloc_destroy(gp);
2805 2805          /*
2806 2806           * pool_default hasn't been initialized yet, so we let pool_init()
2807 2807           * take care of making sure the global zone is in the default pool.
2808 2808           */
2809 2809  
2810 2810          /*
2811 2811           * Initialize global zone kstats
2812 2812           */
2813 2813          zone_kstat_create(&zone0);
2814 2814  
2815 2815          /*
2816 2816           * Initialize zone label.
2817 2817           * mlp are initialized when tnzonecfg is loaded.
2818 2818           */
2819 2819          zone0.zone_slabel = l_admin_low;
2820 2820          rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2821 2821          label_hold(l_admin_low);
2822 2822  
2823 2823          /*
2824 2824           * Initialise the lock for the database structure used by mntfs.
2825 2825           */
2826 2826          rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
2827 2827  
2828 2828          zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
2829 2829  
2830 2830          mutex_enter(&zonehash_lock);
2831 2831          zone_uniqid(&zone0);
2832 2832          ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
2833 2833  
2834 2834          zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
2835 2835              mod_hash_null_valdtor);
2836 2836          zonehashbyname = mod_hash_create_strhash("zone_by_name",
2837 2837              zone_hash_size, mod_hash_null_valdtor);
2838 2838          /*
2839 2839           * maintain zonehashbylabel only for labeled systems
2840 2840           */
2841 2841          if (is_system_labeled())
2842 2842                  zonehashbylabel = mod_hash_create_extended("zone_by_label",
2843 2843                      zone_hash_size, mod_hash_null_keydtor,
2844 2844                      mod_hash_null_valdtor, hash_bylabel, NULL,
2845 2845                      hash_labelkey_cmp, KM_SLEEP);
2846 2846          zonecount = 1;
2847 2847  
2848 2848          (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
2849 2849              (mod_hash_val_t)&zone0);
2850 2850          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
2851 2851              (mod_hash_val_t)&zone0);
2852 2852          if (is_system_labeled()) {
2853 2853                  zone0.zone_flags |= ZF_HASHED_LABEL;
2854 2854                  (void) mod_hash_insert(zonehashbylabel,
2855 2855                      (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
2856 2856          }
2857 2857          mutex_exit(&zonehash_lock);
2858 2858  
2859 2859          /*
2860 2860           * We avoid setting zone_kcred until now, since kcred is initialized
2861 2861           * sometime after zone_zsd_init() and before zone_init().
2862 2862           */
2863 2863          zone0.zone_kcred = kcred;
2864 2864          /*
2865 2865           * The global zone is fully initialized (except for zone_rootvp which
2866 2866           * will be set when the root filesystem is mounted).
2867 2867           */
2868 2868          global_zone = &zone0;
2869 2869  
2870 2870          /*
2871 2871           * Setup an event channel to send zone status change notifications on
2872 2872           */
2873 2873          res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2874 2874              EVCH_CREAT);
2875 2875  
2876 2876          if (res)
2877 2877                  panic("Sysevent_evc_bind failed during zone setup.\n");
2878 2878  
2879 2879  }
2880 2880  
2881 2881  static void
2882 2882  zone_free(zone_t *zone)
2883 2883  {
2884 2884          zone_dl_t *zdl;
2885 2885  
2886 2886          ASSERT(zone != global_zone);
2887 2887          ASSERT(zone->zone_ntasks == 0);
2888 2888          ASSERT(zone->zone_nlwps == 0);
2889 2889          ASSERT(zone->zone_nprocs == 0);
2890 2890          ASSERT(zone->zone_cred_ref == 0);
2891 2891          ASSERT(zone->zone_kcred == NULL);
2892 2892          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2893 2893              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2894 2894          ASSERT(list_is_empty(&zone->zone_ref_list));
2895 2895  
2896 2896          /*
2897 2897           * Remove any zone caps.
2898 2898           */
2899 2899          cpucaps_zone_remove(zone);
2900 2900  
2901 2901          /* Clear physical memory capping data. */
2902 2902          bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t));
2903 2903  
2904 2904          ASSERT(zone->zone_cpucap == NULL);
2905 2905  
2906 2906          /* remove from deathrow list */
2907 2907          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2908 2908                  ASSERT(zone->zone_ref == 0);
2909 2909                  mutex_enter(&zone_deathrow_lock);
2910 2910                  list_remove(&zone_deathrow, zone);
2911 2911                  mutex_exit(&zone_deathrow_lock);
2912 2912          }
2913 2913  
2914 2914          list_destroy(&zone->zone_ref_list);
2915 2915          zone_free_zsd(zone);
2916 2916          zone_free_datasets(zone);
2917 2917  
2918 2918          /*
2919 2919           * While dlmgmtd should have removed all of these, it could have left
2920 2920           * something behind or crashed. In which case it's not safe for us to
2921 2921           * assume that the list is empty which list_destroy() will ASSERT. We
2922 2922           * clean up for our userland comrades which may have crashed, or worse,
2923 2923           * been disabled by SMF.
2924 2924           */
2925 2925          while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2926 2926                  if (zdl->zdl_net != NULL)
2927 2927                          nvlist_free(zdl->zdl_net);
2928 2928                  kmem_free(zdl, sizeof (zone_dl_t));
2929 2929          }
2930 2930          list_destroy(&zone->zone_dl_list);
2931 2931  
2932 2932          /*

↓ open down ↓

2932 lines elided

↑ open up ↑

2933 2933           * This zone_t can no longer inhibit creation of another zone_t
2934 2934           * with the same name or debug ID.  Generate a sysevent so that
2935 2935           * userspace tools know it is safe to carry on.
2936 2936           */
2937 2937          mutex_enter(&zone_status_lock);
2938 2938          zone_status_set(zone, ZONE_IS_FREE);
2939 2939          mutex_exit(&zone_status_lock);
2940 2940  
2941 2941          cpu_uarray_free(zone->zone_ustate);
2942 2942  
2943      -        if (zone->zone_rootvp != NULL)
2944      -                VN_RELE(zone->zone_rootvp);
     2943 +        if (zone->zone_rootvp != NULL) {
     2944 +                vnode_t *vp = zone->zone_rootvp;
     2945 +
     2946 +                mutex_enter(&vp->v_lock);
     2947 +                vp->v_flag &= ~VZONEROOT;
     2948 +                mutex_exit(&vp->v_lock);
     2949 +                VN_RELE(vp);
     2950 +                /* zone is freed below, so zone_rootvp need not be NULLed. */
     2951 +        }
2945 2952          if (zone->zone_rootpath)
2946 2953                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2947 2954          if (zone->zone_name != NULL)
2948 2955                  kmem_free(zone->zone_name, ZONENAME_MAX);
2949 2956          if (zone->zone_slabel != NULL)
2950 2957                  label_rele(zone->zone_slabel);
2951 2958          if (zone->zone_nodename != NULL)
2952 2959                  kmem_free(zone->zone_nodename, _SYS_NMLN);
2953 2960          if (zone->zone_domain != NULL)
2954 2961                  kmem_free(zone->zone_domain, _SYS_NMLN);

2955 2962          if (zone->zone_privset != NULL)
2956 2963                  kmem_free(zone->zone_privset, sizeof (priv_set_t));
2957 2964          if (zone->zone_rctls != NULL)
2958 2965                  rctl_set_free(zone->zone_rctls);
2959 2966          if (zone->zone_bootargs != NULL)
2960 2967                  strfree(zone->zone_bootargs);
2961 2968          if (zone->zone_initname != NULL)
2962 2969                  strfree(zone->zone_initname);
2963 2970          if (zone->zone_fs_allowed != NULL)
2964 2971                  strfree(zone->zone_fs_allowed);
2965 2972          if (zone->zone_pfexecd != NULL)
2966 2973                  klpd_freelist(&zone->zone_pfexecd);
2967 2974          id_free(zoneid_space, zone->zone_id);
2968 2975          mutex_destroy(&zone->zone_lock);
2969 2976          cv_destroy(&zone->zone_cv);
2970 2977          rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2971 2978          rw_destroy(&zone->zone_mntfs_db_lock);
2972 2979          kmem_free(zone, sizeof (zone_t));
2973 2980  }
2974 2981  
2975 2982  /*
2976 2983   * See block comment at the top of this file for information about zone
2977 2984   * status values.
2978 2985   */
2979 2986  /*
2980 2987   * Convenience function for setting zone status.
2981 2988   */
2982 2989  static void
2983 2990  zone_status_set(zone_t *zone, zone_status_t status)
2984 2991  {
2985 2992          timestruc_t now;
2986 2993          uint64_t t;
2987 2994  
2988 2995          nvlist_t *nvl = NULL;
2989 2996          ASSERT(MUTEX_HELD(&zone_status_lock));
2990 2997          ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE ||
2991 2998              status == ZONE_IS_FREE) && status >= zone_status_get(zone));
2992 2999  
2993 3000          /* Time since Jan 1 1970, in nanoseconds as consumers expect */
2994 3001          gethrestime(&now);
2995 3002          t = (now.tv_sec * NANOSEC) + now.tv_nsec;
2996 3003  
2997 3004          if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2998 3005              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2999 3006              nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
3000 3007              zone_status_table[status]) ||
3001 3008              nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
3002 3009              zone_status_table[zone->zone_status]) ||
3003 3010              nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
3004 3011              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
3005 3012              sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
3006 3013              ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
3007 3014  #ifdef DEBUG
3008 3015                  (void) printf(
3009 3016                      "Failed to allocate and send zone state change event.\n");
3010 3017  #else
3011 3018                  /* EMPTY */
3012 3019  #endif
3013 3020          }
3014 3021          nvlist_free(nvl);
3015 3022  
3016 3023          zone->zone_status = status;
3017 3024  
3018 3025          cv_broadcast(&zone->zone_cv);
3019 3026  }
3020 3027  
3021 3028  /*
3022 3029   * Public function to retrieve the zone status.  The zone status may
3023 3030   * change after it is retrieved.
3024 3031   */
3025 3032  zone_status_t
3026 3033  zone_status_get(zone_t *zone)
3027 3034  {
3028 3035          return (zone->zone_status);
3029 3036  }
3030 3037  
3031 3038  /*
3032 3039   * Publish a zones-related sysevent for purposes other than zone state changes.
3033 3040   * While it is unfortunate that zone_event_chan is associated with
3034 3041   * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be
3035 3042   * the only ones with class "status" and subclass "change".
3036 3043   */
3037 3044  void
3038 3045  zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass,
3039 3046      nvlist_t *ev_nvl)
3040 3047  {
3041 3048          nvlist_t *nvl = NULL;
3042 3049          timestruc_t now;
3043 3050          uint64_t t;
3044 3051  
3045 3052          gethrestime(&now);
3046 3053          t = (now.tv_sec * NANOSEC) + now.tv_nsec;
3047 3054  
3048 3055          if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 ||
3049 3056              nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 ||
3050 3057              nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 ||
3051 3058              nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 ||
3052 3059              sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com",
3053 3060              "kernel", nvl, EVCH_SLEEP) != 0) {
3054 3061  #ifdef DEBUG
3055 3062                  (void) printf("Failed to allocate and send zone misc event.\n");
3056 3063  #else
3057 3064                  /* EMPTY */
3058 3065  #endif
3059 3066          }
3060 3067          nvlist_free(nvl);
3061 3068  }
3062 3069  
3063 3070  static int
3064 3071  zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
3065 3072  {
3066 3073          char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
3067 3074          int err = 0;
3068 3075  
3069 3076          ASSERT(zone != global_zone);
3070 3077          if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
3071 3078                  goto done;      /* EFAULT or ENAMETOOLONG */
3072 3079  
3073 3080          if (zone->zone_bootargs != NULL)
3074 3081                  strfree(zone->zone_bootargs);
3075 3082  
3076 3083          zone->zone_bootargs = strdup(buf);
3077 3084  
3078 3085  done:
3079 3086          kmem_free(buf, BOOTARGS_MAX);
3080 3087          return (err);
3081 3088  }
3082 3089  
3083 3090  static int
3084 3091  zone_set_brand(zone_t *zone, const char *brand)
3085 3092  {
3086 3093          struct brand_attr *attrp;
3087 3094          brand_t *bp;
3088 3095  
3089 3096          attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
3090 3097          if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
3091 3098                  kmem_free(attrp, sizeof (struct brand_attr));
3092 3099                  return (EFAULT);
3093 3100          }
3094 3101  
3095 3102          bp = brand_register_zone(attrp);
3096 3103          kmem_free(attrp, sizeof (struct brand_attr));
3097 3104          if (bp == NULL)
3098 3105                  return (EINVAL);
3099 3106  
3100 3107          /*
3101 3108           * This is the only place where a zone can change its brand.
3102 3109           * We already need to hold zone_status_lock to check the zone
3103 3110           * status, so we'll just use that lock to serialize zone
3104 3111           * branding requests as well.
3105 3112           */
3106 3113          mutex_enter(&zone_status_lock);
3107 3114  
3108 3115          /* Re-branding is not allowed; the zone must not have begun booting */
3109 3116          if ((ZONE_IS_BRANDED(zone)) ||
3110 3117              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
3111 3118                  mutex_exit(&zone_status_lock);
3112 3119                  brand_unregister_zone(bp);
3113 3120                  return (EINVAL);
3114 3121          }
3115 3122  
3116 3123          /*
3117 3124           * Set up the brand specific data.
3118 3125           * Note that it's possible that the hook has to drop the
3119 3126           * zone_status_lock and reacquire it before returning so we can't
3120 3127           * assume the lock has been held the entire time.
3121 3128           */
3122 3129          zone->zone_brand = bp;
3123 3130          ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
3124 3131  
3125 3132          mutex_exit(&zone_status_lock);
3126 3133          return (0);
3127 3134  }
3128 3135  
3129 3136  static int
3130 3137  zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags)
3131 3138  {
3132 3139          int err = 0;
3133 3140          psecflags_t psf;
3134 3141  
3135 3142          ASSERT(zone != global_zone);
3136 3143  
3137 3144          if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0)
3138 3145                  return (err);
3139 3146  
3140 3147          if (zone_status_get(zone) > ZONE_IS_READY)
3141 3148                  return (EINVAL);
3142 3149  
3143 3150          if (!psecflags_validate(&psf))
3144 3151                  return (EINVAL);
3145 3152  
3146 3153          (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf));
3147 3154  
3148 3155          /* Set security flags on the zone's zsched */
3149 3156          (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags,
3150 3157              sizeof (zone->zone_zsched->p_secflags));
3151 3158  
3152 3159          return (0);
3153 3160  }
3154 3161  
3155 3162  static int
3156 3163  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
3157 3164  {
3158 3165          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
3159 3166          int err = 0;
3160 3167  
3161 3168          ASSERT(zone != global_zone);
3162 3169          if ((err = copyinstr(zone_fs_allowed, buf,
3163 3170              ZONE_FS_ALLOWED_MAX, NULL)) != 0)
3164 3171                  goto done;
3165 3172  
3166 3173          if (zone->zone_fs_allowed != NULL)
3167 3174                  strfree(zone->zone_fs_allowed);
3168 3175  
3169 3176          zone->zone_fs_allowed = strdup(buf);
3170 3177  
3171 3178  done:
3172 3179          kmem_free(buf, ZONE_FS_ALLOWED_MAX);
3173 3180          return (err);
3174 3181  }
3175 3182  
3176 3183  static int
3177 3184  zone_set_initname(zone_t *zone, const char *zone_initname)
3178 3185  {
3179 3186          char initname[INITNAME_SZ];
3180 3187          size_t len;
3181 3188          int err = 0;
3182 3189  
3183 3190          ASSERT(zone != global_zone);
3184 3191          if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
3185 3192                  return (err);   /* EFAULT or ENAMETOOLONG */
3186 3193  
3187 3194          if (zone->zone_initname != NULL)
3188 3195                  strfree(zone->zone_initname);
3189 3196  
3190 3197          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
3191 3198          (void) strcpy(zone->zone_initname, initname);
3192 3199          return (0);
3193 3200  }
3194 3201  
3195 3202  static int
3196 3203  zone_set_sched_class(zone_t *zone, const char *new_class)
3197 3204  {
3198 3205          char sched_class[PC_CLNMSZ];
3199 3206          id_t classid;
3200 3207          int err;
3201 3208  
3202 3209          ASSERT(zone != global_zone);
3203 3210          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
3204 3211                  return (err);   /* EFAULT or ENAMETOOLONG */
3205 3212  
3206 3213          if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
3207 3214                  return (set_errno(EINVAL));
3208 3215          zone->zone_defaultcid = classid;
3209 3216          ASSERT(zone->zone_defaultcid > 0 &&
3210 3217              zone->zone_defaultcid < loaded_classes);
3211 3218  
3212 3219          return (0);
3213 3220  }
3214 3221  
3215 3222  /*
3216 3223   * Block indefinitely waiting for (zone_status >= status)
3217 3224   */
3218 3225  void
3219 3226  zone_status_wait(zone_t *zone, zone_status_t status)
3220 3227  {
3221 3228          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3222 3229  
3223 3230          mutex_enter(&zone_status_lock);
3224 3231          while (zone->zone_status < status) {
3225 3232                  cv_wait(&zone->zone_cv, &zone_status_lock);
3226 3233          }
3227 3234          mutex_exit(&zone_status_lock);
3228 3235  }
3229 3236  
3230 3237  /*
3231 3238   * Private CPR-safe version of zone_status_wait().
3232 3239   */
3233 3240  static void
3234 3241  zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
3235 3242  {
3236 3243          callb_cpr_t cprinfo;
3237 3244  
3238 3245          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3239 3246  
3240 3247          CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
3241 3248              str);
3242 3249          mutex_enter(&zone_status_lock);
3243 3250          while (zone->zone_status < status) {
3244 3251                  CALLB_CPR_SAFE_BEGIN(&cprinfo);
3245 3252                  cv_wait(&zone->zone_cv, &zone_status_lock);
3246 3253                  CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
3247 3254          }
3248 3255          /*
3249 3256           * zone_status_lock is implicitly released by the following.
3250 3257           */
3251 3258          CALLB_CPR_EXIT(&cprinfo);
3252 3259  }
3253 3260  
3254 3261  /*
3255 3262   * Block until zone enters requested state or signal is received.  Return (0)
3256 3263   * if signaled, non-zero otherwise.
3257 3264   */
3258 3265  int
3259 3266  zone_status_wait_sig(zone_t *zone, zone_status_t status)
3260 3267  {
3261 3268          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3262 3269  
3263 3270          mutex_enter(&zone_status_lock);
3264 3271          while (zone->zone_status < status) {
3265 3272                  if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
3266 3273                          mutex_exit(&zone_status_lock);
3267 3274                          return (0);
3268 3275                  }
3269 3276          }
3270 3277          mutex_exit(&zone_status_lock);
3271 3278          return (1);
3272 3279  }
3273 3280  
3274 3281  /*
3275 3282   * Block until the zone enters the requested state or the timeout expires,
3276 3283   * whichever happens first.  Return (-1) if operation timed out, time remaining
3277 3284   * otherwise.
3278 3285   */
3279 3286  clock_t
3280 3287  zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
3281 3288  {
3282 3289          clock_t timeleft = 0;
3283 3290  
3284 3291          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3285 3292  
3286 3293          mutex_enter(&zone_status_lock);
3287 3294          while (zone->zone_status < status && timeleft != -1) {
3288 3295                  timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
3289 3296          }
3290 3297          mutex_exit(&zone_status_lock);
3291 3298          return (timeleft);
3292 3299  }
3293 3300  
3294 3301  /*
3295 3302   * Block until the zone enters the requested state, the current process is
3296 3303   * signaled, or the timeout expires, whichever happens first.  Return (-1) if
3297 3304   * operation timed out, 0 if signaled, time remaining otherwise.
3298 3305   */
3299 3306  clock_t
3300 3307  zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
3301 3308  {
3302 3309          clock_t timeleft = tim - ddi_get_lbolt();
3303 3310  
3304 3311          ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
3305 3312  
3306 3313          mutex_enter(&zone_status_lock);
3307 3314          while (zone->zone_status < status) {
3308 3315                  timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
3309 3316                      tim);
3310 3317                  if (timeleft <= 0)
3311 3318                          break;
3312 3319          }
3313 3320          mutex_exit(&zone_status_lock);
3314 3321          return (timeleft);
3315 3322  }
3316 3323  
3317 3324  /*
3318 3325   * Zones have two reference counts: one for references from credential
3319 3326   * structures (zone_cred_ref), and one (zone_ref) for everything else.
3320 3327   * This is so we can allow a zone to be rebooted while there are still
3321 3328   * outstanding cred references, since certain drivers cache dblks (which
3322 3329   * implicitly results in cached creds).  We wait for zone_ref to drop to
3323 3330   * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
3324 3331   * later freed when the zone_cred_ref drops to 0, though nothing other
3325 3332   * than the zone id and privilege set should be accessed once the zone
3326 3333   * is "dead".
3327 3334   *
3328 3335   * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
3329 3336   * to force halt/reboot to block waiting for the zone_cred_ref to drop
3330 3337   * to 0.  This can be useful to flush out other sources of cached creds
3331 3338   * that may be less innocuous than the driver case.
3332 3339   *
3333 3340   * Zones also provide a tracked reference counting mechanism in which zone
3334 3341   * references are represented by "crumbs" (zone_ref structures).  Crumbs help
3335 3342   * debuggers determine the sources of leaked zone references.  See
3336 3343   * zone_hold_ref() and zone_rele_ref() below for more information.
3337 3344   */
3338 3345  
3339 3346  int zone_wait_for_cred = 0;
3340 3347  
3341 3348  static void
3342 3349  zone_hold_locked(zone_t *z)
3343 3350  {
3344 3351          ASSERT(MUTEX_HELD(&z->zone_lock));
3345 3352          z->zone_ref++;
3346 3353          ASSERT(z->zone_ref != 0);
3347 3354  }
3348 3355  
3349 3356  /*
3350 3357   * Increment the specified zone's reference count.  The zone's zone_t structure
3351 3358   * will not be freed as long as the zone's reference count is nonzero.
3352 3359   * Decrement the zone's reference count via zone_rele().
3353 3360   *
3354 3361   * NOTE: This function should only be used to hold zones for short periods of
3355 3362   * time.  Use zone_hold_ref() if the zone must be held for a long time.
3356 3363   */
3357 3364  void
3358 3365  zone_hold(zone_t *z)
3359 3366  {
3360 3367          mutex_enter(&z->zone_lock);
3361 3368          zone_hold_locked(z);
3362 3369          mutex_exit(&z->zone_lock);
3363 3370  }
3364 3371  
3365 3372  /*
3366 3373   * If the non-cred ref count drops to 1 and either the cred ref count
3367 3374   * is 0 or we aren't waiting for cred references, the zone is ready to
3368 3375   * be destroyed.
3369 3376   */
3370 3377  #define ZONE_IS_UNREF(zone)     ((zone)->zone_ref == 1 && \
3371 3378              (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
3372 3379  
3373 3380  /*
3374 3381   * Common zone reference release function invoked by zone_rele() and
3375 3382   * zone_rele_ref().  If subsys is ZONE_REF_NUM_SUBSYS, then the specified
3376 3383   * zone's subsystem-specific reference counters are not affected by the
3377 3384   * release.  If ref is not NULL, then the zone_ref_t to which it refers is
3378 3385   * removed from the specified zone's reference list.  ref must be non-NULL iff
3379 3386   * subsys is not ZONE_REF_NUM_SUBSYS.
3380 3387   */
3381 3388  static void
3382 3389  zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3383 3390  {
3384 3391          boolean_t wakeup;
3385 3392  
3386 3393          mutex_enter(&z->zone_lock);
3387 3394          ASSERT(z->zone_ref != 0);
3388 3395          z->zone_ref--;
3389 3396          if (subsys != ZONE_REF_NUM_SUBSYS) {
3390 3397                  ASSERT(z->zone_subsys_ref[subsys] != 0);
3391 3398                  z->zone_subsys_ref[subsys]--;
3392 3399                  list_remove(&z->zone_ref_list, ref);
3393 3400          }
3394 3401          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3395 3402                  /* no more refs, free the structure */
3396 3403                  mutex_exit(&z->zone_lock);
3397 3404                  zone_free(z);
3398 3405                  return;
3399 3406          }
3400 3407          /* signal zone_destroy so the zone can finish halting */
3401 3408          wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
3402 3409          mutex_exit(&z->zone_lock);
3403 3410  
3404 3411          if (wakeup) {
3405 3412                  /*
3406 3413                   * Grabbing zonehash_lock here effectively synchronizes with
3407 3414                   * zone_destroy() to avoid missed signals.
3408 3415                   */
3409 3416                  mutex_enter(&zonehash_lock);
3410 3417                  cv_broadcast(&zone_destroy_cv);
3411 3418                  mutex_exit(&zonehash_lock);
3412 3419          }
3413 3420  }
3414 3421  
3415 3422  /*
3416 3423   * Decrement the specified zone's reference count.  The specified zone will
3417 3424   * cease to exist after this function returns if the reference count drops to
3418 3425   * zero.  This function should be paired with zone_hold().
3419 3426   */
3420 3427  void
3421 3428  zone_rele(zone_t *z)
3422 3429  {
3423 3430          zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
3424 3431  }
3425 3432  
3426 3433  /*
3427 3434   * Initialize a zone reference structure.  This function must be invoked for
3428 3435   * a reference structure before the structure is passed to zone_hold_ref().
3429 3436   */
3430 3437  void
3431 3438  zone_init_ref(zone_ref_t *ref)
3432 3439  {
3433 3440          ref->zref_zone = NULL;
3434 3441          list_link_init(&ref->zref_linkage);
3435 3442  }
3436 3443  
3437 3444  /*
3438 3445   * Acquire a reference to zone z.  The caller must specify the
3439 3446   * zone_ref_subsys_t constant associated with its subsystem.  The specified
3440 3447   * zone_ref_t structure will represent a reference to the specified zone.  Use
3441 3448   * zone_rele_ref() to release the reference.
3442 3449   *
3443 3450   * The referenced zone_t structure will not be freed as long as the zone_t's
3444 3451   * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
3445 3452   * references.
3446 3453   *
3447 3454   * NOTE: The zone_ref_t structure must be initialized before it is used.
3448 3455   * See zone_init_ref() above.
3449 3456   */
3450 3457  void
3451 3458  zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
3452 3459  {
3453 3460          ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
3454 3461  
3455 3462          /*
3456 3463           * Prevent consumers from reusing a reference structure before
3457 3464           * releasing it.
3458 3465           */
3459 3466          VERIFY(ref->zref_zone == NULL);
3460 3467  
3461 3468          ref->zref_zone = z;
3462 3469          mutex_enter(&z->zone_lock);
3463 3470          zone_hold_locked(z);
3464 3471          z->zone_subsys_ref[subsys]++;
3465 3472          ASSERT(z->zone_subsys_ref[subsys] != 0);
3466 3473          list_insert_head(&z->zone_ref_list, ref);
3467 3474          mutex_exit(&z->zone_lock);
3468 3475  }
3469 3476  
3470 3477  /*
3471 3478   * Release the zone reference represented by the specified zone_ref_t.
3472 3479   * The reference is invalid after it's released; however, the zone_ref_t
3473 3480   * structure can be reused without having to invoke zone_init_ref().
3474 3481   * subsys should be the same value that was passed to zone_hold_ref()
3475 3482   * when the reference was acquired.
3476 3483   */
3477 3484  void
3478 3485  zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
3479 3486  {
3480 3487          zone_rele_common(ref->zref_zone, ref, subsys);
3481 3488  
3482 3489          /*
3483 3490           * Set the zone_ref_t's zref_zone field to NULL to generate panics
3484 3491           * when consumers dereference the reference.  This helps us catch
3485 3492           * consumers who use released references.  Furthermore, this lets
3486 3493           * consumers reuse the zone_ref_t structure without having to
3487 3494           * invoke zone_init_ref().
3488 3495           */
3489 3496          ref->zref_zone = NULL;
3490 3497  }
3491 3498  
3492 3499  void
3493 3500  zone_cred_hold(zone_t *z)
3494 3501  {
3495 3502          mutex_enter(&z->zone_lock);
3496 3503          z->zone_cred_ref++;
3497 3504          ASSERT(z->zone_cred_ref != 0);
3498 3505          mutex_exit(&z->zone_lock);
3499 3506  }
3500 3507  
3501 3508  void
3502 3509  zone_cred_rele(zone_t *z)
3503 3510  {
3504 3511          boolean_t wakeup;
3505 3512  
3506 3513          mutex_enter(&z->zone_lock);
3507 3514          ASSERT(z->zone_cred_ref != 0);
3508 3515          z->zone_cred_ref--;
3509 3516          if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
3510 3517                  /* no more refs, free the structure */
3511 3518                  mutex_exit(&z->zone_lock);
3512 3519                  zone_free(z);
3513 3520                  return;
3514 3521          }
3515 3522          /*
3516 3523           * If zone_destroy is waiting for the cred references to drain
3517 3524           * out, and they have, signal it.
3518 3525           */
3519 3526          wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
3520 3527              zone_status_get(z) >= ZONE_IS_DEAD);
3521 3528          mutex_exit(&z->zone_lock);
3522 3529  
3523 3530          if (wakeup) {
3524 3531                  /*
3525 3532                   * Grabbing zonehash_lock here effectively synchronizes with
3526 3533                   * zone_destroy() to avoid missed signals.
3527 3534                   */
3528 3535                  mutex_enter(&zonehash_lock);
3529 3536                  cv_broadcast(&zone_destroy_cv);
3530 3537                  mutex_exit(&zonehash_lock);
3531 3538          }
3532 3539  }
3533 3540  
3534 3541  void
3535 3542  zone_task_hold(zone_t *z)
3536 3543  {
3537 3544          mutex_enter(&z->zone_lock);
3538 3545          z->zone_ntasks++;
3539 3546          ASSERT(z->zone_ntasks != 0);
3540 3547          mutex_exit(&z->zone_lock);
3541 3548  }
3542 3549  
3543 3550  void
3544 3551  zone_task_rele(zone_t *zone)
3545 3552  {
3546 3553          uint_t refcnt;
3547 3554  
3548 3555          mutex_enter(&zone->zone_lock);
3549 3556          ASSERT(zone->zone_ntasks != 0);
3550 3557          refcnt = --zone->zone_ntasks;
3551 3558          if (refcnt > 1) {       /* Common case */
3552 3559                  mutex_exit(&zone->zone_lock);
3553 3560                  return;
3554 3561          }
3555 3562          zone_hold_locked(zone); /* so we can use the zone_t later */
3556 3563          mutex_exit(&zone->zone_lock);
3557 3564          if (refcnt == 1) {
3558 3565                  /*
3559 3566                   * See if the zone is shutting down.
3560 3567                   */
3561 3568                  mutex_enter(&zone_status_lock);
3562 3569                  if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
3563 3570                          goto out;
3564 3571                  }
3565 3572  
3566 3573                  /*
3567 3574                   * Make sure the ntasks didn't change since we
3568 3575                   * dropped zone_lock.
3569 3576                   */
3570 3577                  mutex_enter(&zone->zone_lock);
3571 3578                  if (refcnt != zone->zone_ntasks) {
3572 3579                          mutex_exit(&zone->zone_lock);
3573 3580                          goto out;
3574 3581                  }
3575 3582                  mutex_exit(&zone->zone_lock);
3576 3583  
3577 3584                  /*
3578 3585                   * No more user processes in the zone.  The zone is empty.
3579 3586                   */
3580 3587                  zone_status_set(zone, ZONE_IS_EMPTY);
3581 3588                  goto out;
3582 3589          }
3583 3590  
3584 3591          ASSERT(refcnt == 0);
3585 3592          /*
3586 3593           * zsched has exited; the zone is dead.
3587 3594           */
3588 3595          zone->zone_zsched = NULL;               /* paranoia */
3589 3596          mutex_enter(&zone_status_lock);
3590 3597          zone_status_set(zone, ZONE_IS_DEAD);
3591 3598  out:
3592 3599          mutex_exit(&zone_status_lock);
3593 3600          zone_rele(zone);
3594 3601  }
3595 3602  
3596 3603  zoneid_t
3597 3604  getzoneid(void)
3598 3605  {
3599 3606          return (curproc->p_zone->zone_id);
3600 3607  }
3601 3608  
3602 3609  zoneid_t
3603 3610  getzonedid(void)
3604 3611  {
3605 3612          return (curproc->p_zone->zone_did);
3606 3613  }
3607 3614  
3608 3615  /*
3609 3616   * Internal versions of zone_find_by_*().  These don't zone_hold() or
3610 3617   * check the validity of a zone's state.
3611 3618   */
3612 3619  static zone_t *
3613 3620  zone_find_all_by_id(zoneid_t zoneid)
3614 3621  {
3615 3622          mod_hash_val_t hv;
3616 3623          zone_t *zone = NULL;
3617 3624  
3618 3625          ASSERT(MUTEX_HELD(&zonehash_lock));
3619 3626  
3620 3627          if (mod_hash_find(zonehashbyid,
3621 3628              (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3622 3629                  zone = (zone_t *)hv;
3623 3630          return (zone);
3624 3631  }
3625 3632  
3626 3633  static zone_t *
3627 3634  zone_find_all_by_label(const ts_label_t *label)
3628 3635  {
3629 3636          mod_hash_val_t hv;
3630 3637          zone_t *zone = NULL;
3631 3638  
3632 3639          ASSERT(MUTEX_HELD(&zonehash_lock));
3633 3640  
3634 3641          /*
3635 3642           * zonehashbylabel is not maintained for unlabeled systems
3636 3643           */
3637 3644          if (!is_system_labeled())
3638 3645                  return (NULL);
3639 3646          if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
3640 3647                  zone = (zone_t *)hv;
3641 3648          return (zone);
3642 3649  }
3643 3650  
3644 3651  static zone_t *
3645 3652  zone_find_all_by_name(char *name)
3646 3653  {
3647 3654          mod_hash_val_t hv;
3648 3655          zone_t *zone = NULL;
3649 3656  
3650 3657          ASSERT(MUTEX_HELD(&zonehash_lock));
3651 3658  
3652 3659          if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
3653 3660                  zone = (zone_t *)hv;
3654 3661          return (zone);
3655 3662  }
3656 3663  
3657 3664  /*
3658 3665   * Public interface for looking up a zone by zoneid.  Only returns the zone if
3659 3666   * it is fully initialized, and has not yet begun the zone_destroy() sequence.
3660 3667   * Caller must call zone_rele() once it is done with the zone.
3661 3668   *
3662 3669   * The zone may begin the zone_destroy() sequence immediately after this
3663 3670   * function returns, but may be safely used until zone_rele() is called.
3664 3671   */
3665 3672  zone_t *
3666 3673  zone_find_by_id(zoneid_t zoneid)
3667 3674  {
3668 3675          zone_t *zone;
3669 3676          zone_status_t status;
3670 3677  
3671 3678          mutex_enter(&zonehash_lock);
3672 3679          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3673 3680                  mutex_exit(&zonehash_lock);
3674 3681                  return (NULL);
3675 3682          }
3676 3683          status = zone_status_get(zone);
3677 3684          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3678 3685                  /*
3679 3686                   * For all practical purposes the zone doesn't exist.
3680 3687                   */
3681 3688                  mutex_exit(&zonehash_lock);
3682 3689                  return (NULL);
3683 3690          }
3684 3691          zone_hold(zone);
3685 3692          mutex_exit(&zonehash_lock);
3686 3693          return (zone);
3687 3694  }
3688 3695  
3689 3696  /*
3690 3697   * Similar to zone_find_by_id, but using zone label as the key.
3691 3698   */
3692 3699  zone_t *
3693 3700  zone_find_by_label(const ts_label_t *label)
3694 3701  {
3695 3702          zone_t *zone;
3696 3703          zone_status_t status;
3697 3704  
3698 3705          mutex_enter(&zonehash_lock);
3699 3706          if ((zone = zone_find_all_by_label(label)) == NULL) {
3700 3707                  mutex_exit(&zonehash_lock);
3701 3708                  return (NULL);
3702 3709          }
3703 3710  
3704 3711          status = zone_status_get(zone);
3705 3712          if (status > ZONE_IS_DOWN) {
3706 3713                  /*
3707 3714                   * For all practical purposes the zone doesn't exist.
3708 3715                   */
3709 3716                  mutex_exit(&zonehash_lock);
3710 3717                  return (NULL);
3711 3718          }
3712 3719          zone_hold(zone);
3713 3720          mutex_exit(&zonehash_lock);
3714 3721          return (zone);
3715 3722  }
3716 3723  
3717 3724  /*
3718 3725   * Similar to zone_find_by_id, but using zone name as the key.
3719 3726   */
3720 3727  zone_t *
3721 3728  zone_find_by_name(char *name)
3722 3729  {
3723 3730          zone_t *zone;
3724 3731          zone_status_t status;
3725 3732  
3726 3733          mutex_enter(&zonehash_lock);
3727 3734          if ((zone = zone_find_all_by_name(name)) == NULL) {
3728 3735                  mutex_exit(&zonehash_lock);
3729 3736                  return (NULL);
3730 3737          }
3731 3738          status = zone_status_get(zone);
3732 3739          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3733 3740                  /*
3734 3741                   * For all practical purposes the zone doesn't exist.
3735 3742                   */
3736 3743                  mutex_exit(&zonehash_lock);
3737 3744                  return (NULL);
3738 3745          }
3739 3746          zone_hold(zone);
3740 3747          mutex_exit(&zonehash_lock);
3741 3748          return (zone);
3742 3749  }
3743 3750  
3744 3751  /*
3745 3752   * Similar to zone_find_by_id(), using the path as a key.  For instance,
3746 3753   * if there is a zone "foo" rooted at /foo/root, and the path argument
3747 3754   * is "/foo/root/proc", it will return the held zone_t corresponding to
3748 3755   * zone "foo".
3749 3756   *
3750 3757   * zone_find_by_path() always returns a non-NULL value, since at the
3751 3758   * very least every path will be contained in the global zone.
3752 3759   *
3753 3760   * As with the other zone_find_by_*() functions, the caller is
3754 3761   * responsible for zone_rele()ing the return value of this function.
3755 3762   */
3756 3763  zone_t *
3757 3764  zone_find_by_path(const char *path)
3758 3765  {
3759 3766          zone_t *zone;
3760 3767          zone_t *zret = NULL;
3761 3768          zone_status_t status;
3762 3769  
3763 3770          if (path == NULL) {
3764 3771                  /*
3765 3772                   * Call from rootconf().
3766 3773                   */
3767 3774                  zone_hold(global_zone);
3768 3775                  return (global_zone);
3769 3776          }
3770 3777          ASSERT(*path == '/');
3771 3778          mutex_enter(&zonehash_lock);
3772 3779          for (zone = list_head(&zone_active); zone != NULL;
3773 3780              zone = list_next(&zone_active, zone)) {
3774 3781                  if (ZONE_PATH_VISIBLE(path, zone))
3775 3782                          zret = zone;
3776 3783          }
3777 3784          ASSERT(zret != NULL);
3778 3785          status = zone_status_get(zret);
3779 3786          if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
3780 3787                  /*
3781 3788                   * Zone practically doesn't exist.
3782 3789                   */
3783 3790                  zret = global_zone;
3784 3791          }
3785 3792          zone_hold(zret);
3786 3793          mutex_exit(&zonehash_lock);
3787 3794          return (zret);
3788 3795  }
3789 3796  
3790 3797  /*
3791 3798   * Public interface for updating per-zone load averages.  Called once per
3792 3799   * second.
3793 3800   *
3794 3801   * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
3795 3802   */
3796 3803  void
3797 3804  zone_loadavg_update(void)
3798 3805  {
3799 3806          zone_t *zp;
3800 3807          zone_status_t status;
3801 3808          struct loadavg_s *lavg;
3802 3809          hrtime_t zone_total;
3803 3810          uint64_t tmp;
3804 3811          int i;
3805 3812          hrtime_t hr_avg;
3806 3813          int nrun;
3807 3814          static int64_t f[3] = { 135, 27, 9 };
3808 3815          int64_t q, r;
3809 3816  
3810 3817          mutex_enter(&zonehash_lock);
3811 3818          for (zp = list_head(&zone_active); zp != NULL;
3812 3819              zp = list_next(&zone_active, zp)) {
3813 3820                  mutex_enter(&zp->zone_lock);
3814 3821  
3815 3822                  /* Skip zones that are on the way down or not yet up */
3816 3823                  status = zone_status_get(zp);
3817 3824                  if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) {
3818 3825                          /* For all practical purposes the zone doesn't exist. */
3819 3826                          mutex_exit(&zp->zone_lock);
3820 3827                          continue;
3821 3828                  }
3822 3829  
3823 3830                  /*
3824 3831                   * Update the 10 second moving average data in zone_loadavg.
3825 3832                   */
3826 3833                  lavg = &zp->zone_loadavg;
3827 3834  
3828 3835                  tmp = cpu_uarray_sum_all(zp->zone_ustate);
3829 3836                  zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
3830 3837  
3831 3838                  scalehrtime(&zone_total);
3832 3839  
3833 3840                  /* The zone_total should always be increasing. */
3834 3841                  lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ?
3835 3842                      zone_total - lavg->lg_total : 0;
3836 3843                  lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
3837 3844                  /* lg_total holds the prev. 1 sec. total */
3838 3845                  lavg->lg_total = zone_total;
3839 3846  
3840 3847                  /*
3841 3848                   * To simplify the calculation, we don't calculate the load avg.
3842 3849                   * until the zone has been up for at least 10 seconds and our
3843 3850                   * moving average is thus full.
3844 3851                   */
3845 3852                  if ((lavg->lg_len + 1) < S_LOADAVG_SZ) {
3846 3853                          lavg->lg_len++;
3847 3854                          mutex_exit(&zp->zone_lock);
3848 3855                          continue;
3849 3856                  }
3850 3857  
3851 3858                  /* Now calculate the 1min, 5min, 15 min load avg. */
3852 3859                  hr_avg = 0;
3853 3860                  for (i = 0; i < S_LOADAVG_SZ; i++)
3854 3861                          hr_avg += lavg->lg_loads[i];
3855 3862                  hr_avg = hr_avg / S_LOADAVG_SZ;
3856 3863                  nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
3857 3864  
3858 3865                  /* Compute load avg. See comment in calcloadavg() */
3859 3866                  for (i = 0; i < 3; i++) {
3860 3867                          q = (zp->zone_hp_avenrun[i] >> 16) << 7;
3861 3868                          r = (zp->zone_hp_avenrun[i] & 0xffff) << 7;
3862 3869                          zp->zone_hp_avenrun[i] +=
3863 3870                              ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
3864 3871  
3865 3872                          /* avenrun[] can only hold 31 bits of load avg. */
3866 3873                          if (zp->zone_hp_avenrun[i] <
3867 3874                              ((uint64_t)1<<(31+16-FSHIFT)))
3868 3875                                  zp->zone_avenrun[i] = (int32_t)
3869 3876                                      (zp->zone_hp_avenrun[i] >> (16 - FSHIFT));
3870 3877                          else
3871 3878                                  zp->zone_avenrun[i] = 0x7fffffff;
3872 3879                  }
3873 3880  
3874 3881                  mutex_exit(&zp->zone_lock);
3875 3882          }
3876 3883          mutex_exit(&zonehash_lock);
3877 3884  }
3878 3885  
3879 3886  /*
3880 3887   * Get the number of cpus visible to this zone.  The system-wide global
3881 3888   * 'ncpus' is returned if pools are disabled, the caller is in the
3882 3889   * global zone, or a NULL zone argument is passed in.
3883 3890   */
3884 3891  int
3885 3892  zone_ncpus_get(zone_t *zone)
3886 3893  {
3887 3894          int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
3888 3895  
3889 3896          return (myncpus != 0 ? myncpus : ncpus);
3890 3897  }
3891 3898  
3892 3899  /*
3893 3900   * Get the number of online cpus visible to this zone.  The system-wide
3894 3901   * global 'ncpus_online' is returned if pools are disabled, the caller
3895 3902   * is in the global zone, or a NULL zone argument is passed in.
3896 3903   */
3897 3904  int
3898 3905  zone_ncpus_online_get(zone_t *zone)
3899 3906  {
3900 3907          int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
3901 3908  
3902 3909          return (myncpus_online != 0 ? myncpus_online : ncpus_online);
3903 3910  }
3904 3911  
3905 3912  /*
3906 3913   * Return the pool to which the zone is currently bound.
3907 3914   */
3908 3915  pool_t *
3909 3916  zone_pool_get(zone_t *zone)
3910 3917  {
3911 3918          ASSERT(pool_lock_held());
3912 3919  
3913 3920          return (zone->zone_pool);
3914 3921  }
3915 3922  
3916 3923  /*
3917 3924   * Set the zone's pool pointer and update the zone's visibility to match
3918 3925   * the resources in the new pool.
3919 3926   */
3920 3927  void
3921 3928  zone_pool_set(zone_t *zone, pool_t *pool)
3922 3929  {
3923 3930          ASSERT(pool_lock_held());
3924 3931          ASSERT(MUTEX_HELD(&cpu_lock));
3925 3932  
3926 3933          zone->zone_pool = pool;
3927 3934          zone_pset_set(zone, pool->pool_pset->pset_id);
3928 3935  }
3929 3936  
3930 3937  /*
3931 3938   * Return the cached value of the id of the processor set to which the
3932 3939   * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
3933 3940   * facility is disabled.
3934 3941   */
3935 3942  psetid_t
3936 3943  zone_pset_get(zone_t *zone)
3937 3944  {
3938 3945          ASSERT(MUTEX_HELD(&cpu_lock));
3939 3946  
3940 3947          return (zone->zone_psetid);
3941 3948  }
3942 3949  
3943 3950  /*
3944 3951   * Set the cached value of the id of the processor set to which the zone
3945 3952   * is currently bound.  Also update the zone's visibility to match the
3946 3953   * resources in the new processor set.
3947 3954   */
3948 3955  void
3949 3956  zone_pset_set(zone_t *zone, psetid_t newpsetid)
3950 3957  {
3951 3958          psetid_t oldpsetid;
3952 3959  
3953 3960          ASSERT(MUTEX_HELD(&cpu_lock));
3954 3961          oldpsetid = zone_pset_get(zone);
3955 3962  
3956 3963          if (oldpsetid == newpsetid)
3957 3964                  return;
3958 3965          /*
3959 3966           * Global zone sees all.
3960 3967           */
3961 3968          if (zone != global_zone) {
3962 3969                  zone->zone_psetid = newpsetid;
3963 3970                  if (newpsetid != ZONE_PS_INVAL)
3964 3971                          pool_pset_visibility_add(newpsetid, zone);
3965 3972                  if (oldpsetid != ZONE_PS_INVAL)
3966 3973                          pool_pset_visibility_remove(oldpsetid, zone);
3967 3974          }
3968 3975          /*
3969 3976           * Disabling pools, so we should start using the global values
3970 3977           * for ncpus and ncpus_online.
3971 3978           */
3972 3979          if (newpsetid == ZONE_PS_INVAL) {
3973 3980                  zone->zone_ncpus = 0;
3974 3981                  zone->zone_ncpus_online = 0;
3975 3982          }
3976 3983  }
3977 3984  
3978 3985  /*
3979 3986   * Walk the list of active zones and issue the provided callback for
3980 3987   * each of them.
3981 3988   *
3982 3989   * Caller must not be holding any locks that may be acquired under
3983 3990   * zonehash_lock.  See comment at the beginning of the file for a list of
3984 3991   * common locks and their interactions with zones.
3985 3992   */
3986 3993  int
3987 3994  zone_walk(int (*cb)(zone_t *, void *), void *data)
3988 3995  {
3989 3996          zone_t *zone;
3990 3997          int ret = 0;
3991 3998          zone_status_t status;
3992 3999  
3993 4000          mutex_enter(&zonehash_lock);
3994 4001          for (zone = list_head(&zone_active); zone != NULL;
3995 4002              zone = list_next(&zone_active, zone)) {
3996 4003                  /*
3997 4004                   * Skip zones that shouldn't be externally visible.
3998 4005                   */
3999 4006                  status = zone_status_get(zone);
4000 4007                  if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
4001 4008                          continue;
4002 4009                  /*
4003 4010                   * Bail immediately if any callback invocation returns a
4004 4011                   * non-zero value.
4005 4012                   */
4006 4013                  ret = (*cb)(zone, data);
4007 4014                  if (ret != 0)
4008 4015                          break;
4009 4016          }
4010 4017          mutex_exit(&zonehash_lock);
4011 4018          return (ret);
4012 4019  }
4013 4020  
4014 4021  static int
4015 4022  zone_set_root(zone_t *zone, const char *upath)
4016 4023  {
4017 4024          vnode_t *vp;
4018 4025          int trycount;
4019 4026          int error = 0;
4020 4027          char *path;
4021 4028          struct pathname upn, pn;
4022 4029          size_t pathlen;
4023 4030  
4024 4031          if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
4025 4032                  return (error);
4026 4033  
4027 4034          pn_alloc(&pn);
4028 4035  
4029 4036          /* prevent infinite loop */
4030 4037          trycount = 10;
4031 4038          for (;;) {
4032 4039                  if (--trycount <= 0) {
4033 4040                          error = ESTALE;
4034 4041                          goto out;
4035 4042                  }
4036 4043  
4037 4044                  if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
4038 4045                          /*
4039 4046                           * VOP_ACCESS() may cover 'vp' with a new
4040 4047                           * filesystem, if 'vp' is an autoFS vnode.
4041 4048                           * Get the new 'vp' if so.
4042 4049                           */
4043 4050                          if ((error =
4044 4051                              VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
4045 4052                              (!vn_ismntpt(vp) ||
4046 4053                              (error = traverse(&vp)) == 0)) {
4047 4054                                  pathlen = pn.pn_pathlen + 2;
4048 4055                                  path = kmem_alloc(pathlen, KM_SLEEP);
4049 4056                                  (void) strncpy(path, pn.pn_path,
4050 4057                                      pn.pn_pathlen + 1);
4051 4058                                  path[pathlen - 2] = '/';
4052 4059                                  path[pathlen - 1] = '\0';
4053 4060                                  pn_free(&pn);
4054 4061                                  pn_free(&upn);
4055 4062

↓ open down ↓

1101 lines elided

↑ open up ↑

4056 4063                                  /* Success! */
4057 4064                                  break;
4058 4065                          }
4059 4066                          VN_RELE(vp);
4060 4067                  }
4061 4068                  if (error != ESTALE)
4062 4069                          goto out;
4063 4070          }
4064 4071  
4065 4072          ASSERT(error == 0);
     4073 +        mutex_enter(&vp->v_lock);
     4074 +        if (vp->v_flag & VZONEROOT) {
     4075 +                /* Wow, someone's already using this zone root! */
     4076 +                error = EEXIST; /* XXX KEBE ASKS, better errno? */
     4077 +                mutex_exit(&vp->v_lock);
     4078 +                VN_RELE(vp);
     4079 +                goto out;
     4080 +        }
     4081 +        vp->v_flag |= VZONEROOT;
     4082 +        mutex_exit(&vp->v_lock);
4066 4083          zone->zone_rootvp = vp;         /* we hold a reference to vp */
4067 4084          zone->zone_rootpath = path;
4068 4085          zone->zone_rootpathlen = pathlen;
4069 4086          if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
4070 4087                  zone->zone_flags |= ZF_IS_SCRATCH;
4071 4088          return (0);
4072 4089  
4073 4090  out:
4074 4091          pn_free(&pn);
4075 4092          pn_free(&upn);

4076 4093          return (error);
4077 4094  }
4078 4095  
/*
 * ASCII-only alphanumeric test: true for '0'-'9', 'a'-'z' and 'A'-'Z'.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define isalnum(c)      \
        (('0' <= (c) && (c) <= '9') || \
        ('a' <= (c) && (c) <= 'z') || \
        ('A' <= (c) && (c) <= 'Z'))
4082 4099  
4083 4100  static int
4084 4101  zone_set_name(zone_t *zone, const char *uname)
4085 4102  {
4086 4103          char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
4087 4104          size_t len;
4088 4105          int i, err;
4089 4106  
4090 4107          if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
4091 4108                  kmem_free(kname, ZONENAME_MAX);
4092 4109                  return (err);   /* EFAULT or ENAMETOOLONG */
4093 4110          }
4094 4111  
4095 4112          /* must be less than ZONENAME_MAX */
4096 4113          if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
4097 4114                  kmem_free(kname, ZONENAME_MAX);
4098 4115                  return (EINVAL);
4099 4116          }
4100 4117  
4101 4118          /*
4102 4119           * Name must start with an alphanumeric and must contain only
4103 4120           * alphanumerics, '-', '_' and '.'.
4104 4121           */
4105 4122          if (!isalnum(kname[0])) {
4106 4123                  kmem_free(kname, ZONENAME_MAX);
4107 4124                  return (EINVAL);
4108 4125          }
4109 4126          for (i = 1; i < len - 1; i++) {
4110 4127                  if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
4111 4128                      kname[i] != '.') {
4112 4129                          kmem_free(kname, ZONENAME_MAX);
4113 4130                          return (EINVAL);
4114 4131                  }
4115 4132          }
4116 4133  
4117 4134          zone->zone_name = kname;
4118 4135          return (0);
4119 4136  }
4120 4137  
4121 4138  /*
4122 4139   * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
4123 4140   * is NULL or it points to a zone with no hostid emulation, then the machine's
4124 4141   * hostid (i.e., the global zone's hostid) is returned.  This function returns
4125 4142   * zero if neither the zone nor the host machine (global zone) have hostids.  It
4126 4143   * returns HW_INVALID_HOSTID if the function attempts to return the machine's
4127 4144   * hostid and the machine's hostid is invalid.
4128 4145   */
4129 4146  uint32_t
4130 4147  zone_get_hostid(zone_t *zonep)
4131 4148  {
4132 4149          unsigned long machine_hostid;
4133 4150  
4134 4151          if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
4135 4152                  if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
4136 4153                          return (HW_INVALID_HOSTID);
4137 4154                  return ((uint32_t)machine_hostid);
4138 4155          }
4139 4156          return (zonep->zone_hostid);
4140 4157  }
4141 4158  
4142 4159  /*
4143 4160   * Similar to thread_create(), but makes sure the thread is in the appropriate
4144 4161   * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
4145 4162   */
4146 4163  /*ARGSUSED*/
4147 4164  kthread_t *
4148 4165  zthread_create(
4149 4166      caddr_t stk,
4150 4167      size_t stksize,
4151 4168      void (*proc)(),
4152 4169      void *arg,
4153 4170      size_t len,
4154 4171      pri_t pri)
4155 4172  {
4156 4173          kthread_t *t;
4157 4174          zone_t *zone = curproc->p_zone;
4158 4175          proc_t *pp = zone->zone_zsched;
4159 4176  
4160 4177          zone_hold(zone);        /* Reference to be dropped when thread exits */
4161 4178  
4162 4179          /*
4163 4180           * No-one should be trying to create threads if the zone is shutting
4164 4181           * down and there aren't any kernel threads around.  See comment
4165 4182           * in zthread_exit().
4166 4183           */
4167 4184          ASSERT(!(zone->zone_kthreads == NULL &&
4168 4185              zone_status_get(zone) >= ZONE_IS_EMPTY));
4169 4186          /*
4170 4187           * Create a thread, but don't let it run until we've finished setting
4171 4188           * things up.
4172 4189           */
4173 4190          t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
4174 4191          ASSERT(t->t_forw == NULL);
4175 4192          mutex_enter(&zone_status_lock);
4176 4193          if (zone->zone_kthreads == NULL) {
4177 4194                  t->t_forw = t->t_back = t;
4178 4195          } else {
4179 4196                  kthread_t *tx = zone->zone_kthreads;
4180 4197  
4181 4198                  t->t_forw = tx;
4182 4199                  t->t_back = tx->t_back;
4183 4200                  tx->t_back->t_forw = t;
4184 4201                  tx->t_back = t;
4185 4202          }
4186 4203          zone->zone_kthreads = t;
4187 4204          mutex_exit(&zone_status_lock);
4188 4205  
4189 4206          mutex_enter(&pp->p_lock);
4190 4207          t->t_proc_flag |= TP_ZTHREAD;
4191 4208          project_rele(t->t_proj);
4192 4209          t->t_proj = project_hold(pp->p_task->tk_proj);
4193 4210  
4194 4211          /*
4195 4212           * Setup complete, let it run.
4196 4213           */
4197 4214          thread_lock(t);
4198 4215          t->t_schedflag |= TS_ALLSTART;
4199 4216          setrun_locked(t);
4200 4217          thread_unlock(t);
4201 4218  
4202 4219          mutex_exit(&pp->p_lock);
4203 4220  
4204 4221          return (t);
4205 4222  }
4206 4223  
4207 4224  /*
4208 4225   * Similar to thread_exit().  Must be called by threads created via
4209 4226   * zthread_create().
4210 4227   */
4211 4228  void
4212 4229  zthread_exit(void)
4213 4230  {
4214 4231          kthread_t *t = curthread;
4215 4232          proc_t *pp = curproc;
4216 4233          zone_t *zone = pp->p_zone;
4217 4234  
4218 4235          mutex_enter(&zone_status_lock);
4219 4236  
4220 4237          /*
4221 4238           * Reparent to p0
4222 4239           */
4223 4240          kpreempt_disable();
4224 4241          mutex_enter(&pp->p_lock);
4225 4242          t->t_proc_flag &= ~TP_ZTHREAD;
4226 4243          t->t_procp = &p0;
4227 4244          hat_thread_exit(t);
4228 4245          mutex_exit(&pp->p_lock);
4229 4246          kpreempt_enable();
4230 4247  
4231 4248          if (t->t_back == t) {
4232 4249                  ASSERT(t->t_forw == t);
4233 4250                  /*
4234 4251                   * If the zone is empty, once the thread count
4235 4252                   * goes to zero no further kernel threads can be
4236 4253                   * created.  This is because if the creator is a process
4237 4254                   * in the zone, then it must have exited before the zone
4238 4255                   * state could be set to ZONE_IS_EMPTY.
4239 4256                   * Otherwise, if the creator is a kernel thread in the
4240 4257                   * zone, the thread count is non-zero.
4241 4258                   *
4242 4259                   * This really means that non-zone kernel threads should
4243 4260                   * not create zone kernel threads.
4244 4261                   */
4245 4262                  zone->zone_kthreads = NULL;
4246 4263                  if (zone_status_get(zone) == ZONE_IS_EMPTY) {
4247 4264                          zone_status_set(zone, ZONE_IS_DOWN);
4248 4265                          /*
4249 4266                           * Remove any CPU caps on this zone.
4250 4267                           */
4251 4268                          cpucaps_zone_remove(zone);
4252 4269                  }
4253 4270          } else {
4254 4271                  t->t_forw->t_back = t->t_back;
4255 4272                  t->t_back->t_forw = t->t_forw;
4256 4273                  if (zone->zone_kthreads == t)
4257 4274                          zone->zone_kthreads = t->t_forw;
4258 4275          }
4259 4276          mutex_exit(&zone_status_lock);
4260 4277          zone_rele(zone);
4261 4278          thread_exit();
4262 4279          /* NOTREACHED */
4263 4280  }
4264 4281  
4265 4282  static void
4266 4283  zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
4267 4284  {
4268 4285          vnode_t *oldvp;
4269 4286  
4270 4287          /* we're going to hold a reference here to the directory */
4271 4288          VN_HOLD(vp);
4272 4289  
4273 4290          /* update abs cwd/root path see c2/audit.c */
4274 4291          if (AU_AUDITING())
4275 4292                  audit_chdirec(vp, vpp);
4276 4293  
4277 4294          mutex_enter(&pp->p_lock);
4278 4295          oldvp = *vpp;
4279 4296          *vpp = vp;
4280 4297          mutex_exit(&pp->p_lock);
4281 4298          if (oldvp != NULL)
4282 4299                  VN_RELE(oldvp);
4283 4300  }
4284 4301  
4285 4302  /*
4286 4303   * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
4287 4304   */
4288 4305  static int
4289 4306  nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
4290 4307  {
4291 4308          nvpair_t *nvp = NULL;
4292 4309          boolean_t priv_set = B_FALSE;
4293 4310          boolean_t limit_set = B_FALSE;
4294 4311          boolean_t action_set = B_FALSE;
4295 4312  
4296 4313          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4297 4314                  const char *name;
4298 4315                  uint64_t ui64;
4299 4316  
4300 4317                  name = nvpair_name(nvp);
4301 4318                  if (nvpair_type(nvp) != DATA_TYPE_UINT64)
4302 4319                          return (EINVAL);
4303 4320                  (void) nvpair_value_uint64(nvp, &ui64);
4304 4321                  if (strcmp(name, "privilege") == 0) {
4305 4322                          /*
4306 4323                           * Currently only privileged values are allowed, but
4307 4324                           * this may change in the future.
4308 4325                           */
4309 4326                          if (ui64 != RCPRIV_PRIVILEGED)
4310 4327                                  return (EINVAL);
4311 4328                          rv->rcv_privilege = ui64;
4312 4329                          priv_set = B_TRUE;
4313 4330                  } else if (strcmp(name, "limit") == 0) {
4314 4331                          rv->rcv_value = ui64;
4315 4332                          limit_set = B_TRUE;
4316 4333                  } else if (strcmp(name, "action") == 0) {
4317 4334                          if (ui64 != RCTL_LOCAL_NOACTION &&
4318 4335                              ui64 != RCTL_LOCAL_DENY)
4319 4336                                  return (EINVAL);
4320 4337                          rv->rcv_flagaction = ui64;
4321 4338                          action_set = B_TRUE;
4322 4339                  } else {
4323 4340                          return (EINVAL);
4324 4341                  }
4325 4342          }
4326 4343  
4327 4344          if (!(priv_set && limit_set && action_set))
4328 4345                  return (EINVAL);
4329 4346          rv->rcv_action_signal = 0;
4330 4347          rv->rcv_action_recipient = NULL;
4331 4348          rv->rcv_action_recip_pid = -1;
4332 4349          rv->rcv_firing_time = 0;
4333 4350  
4334 4351          return (0);
4335 4352  }
4336 4353  
4337 4354  /*
4338 4355   * Non-global zone version of start_init.
4339 4356   */
4340 4357  void
4341 4358  zone_start_init(void)
4342 4359  {
4343 4360          proc_t *p = ttoproc(curthread);
4344 4361          zone_t *z = p->p_zone;
4345 4362  
4346 4363          ASSERT(!INGLOBALZONE(curproc));
4347 4364  
4348 4365          /*
4349 4366           * For all purposes (ZONE_ATTR_INITPID and restart_init),
4350 4367           * storing just the pid of init is sufficient.
4351 4368           */
4352 4369          z->zone_proc_initpid = p->p_pid;
4353 4370  
4354 4371          if (z->zone_setup_app_contract == B_TRUE) {
4355 4372                  /*
4356 4373                   * Normally a process cannot modify its own contract, but we're
4357 4374                   * just starting the zone's init process and its contract is
4358 4375                   * always initialized from the sys_process_tmpl template, so
4359 4376                   * this is the simplest way to setup init's contract to kill
4360 4377                   * the process if any other process in the contract exits.
4361 4378                   */
4362 4379                  p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
4363 4380          }
4364 4381  
4365 4382          /*
4366 4383           * We maintain zone_boot_err so that we can return the cause of the
4367 4384           * failure back to the caller of the zone_boot syscall.
4368 4385           */
4369 4386          p->p_zone->zone_boot_err = start_init_common();
4370 4387  
4371 4388          /*
4372 4389           * We will prevent booting zones from becoming running zones if the
4373 4390           * global zone is shutting down.
4374 4391           */
4375 4392          mutex_enter(&zone_status_lock);
4376 4393          if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
4377 4394              ZONE_IS_SHUTTING_DOWN) {
4378 4395                  /*
4379 4396                   * Make sure we are still in the booting state-- we could have
4380 4397                   * raced and already be shutting down, or even further along.
4381 4398                   */
4382 4399                  if (zone_status_get(z) == ZONE_IS_BOOTING) {
4383 4400                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
4384 4401                  }
4385 4402                  mutex_exit(&zone_status_lock);
4386 4403                  /* It's gone bad, dispose of the process */
4387 4404                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
4388 4405                          mutex_enter(&p->p_lock);
4389 4406                          ASSERT(p->p_flag & SEXITLWPS);
4390 4407                          lwp_exit();
4391 4408                  }
4392 4409          } else {
4393 4410                  id_t cid = curthread->t_cid;
4394 4411  
4395 4412                  if (zone_status_get(z) == ZONE_IS_BOOTING)
4396 4413                          zone_status_set(z, ZONE_IS_RUNNING);
4397 4414                  mutex_exit(&zone_status_lock);
4398 4415  
4399 4416                  mutex_enter(&class_lock);
4400 4417                  ASSERT(cid < loaded_classes);
4401 4418                  if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
4402 4419                      z->zone_fixed_hipri) {
4403 4420                          /*
4404 4421                           * If the zone is using FX then by default all
4405 4422                           * processes start at the lowest priority and stay
4406 4423                           * there. We provide a mechanism for the zone to
4407 4424                           * indicate that it should run at "high priority". In
4408 4425                           * this case we setup init to run at the highest FX
4409 4426                           * priority (which is one level higher than the
4410 4427                           * non-fixed scheduling classes can use).
4411 4428                           */
4412 4429                          pcparms_t pcparms;
4413 4430  
4414 4431                          pcparms.pc_cid = cid;
4415 4432                          ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
4416 4433                          ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
4417 4434                              FXMAXUPRI;
4418 4435                          ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
4419 4436                              FX_DOUPRILIM | FX_DOUPRI;
4420 4437  
4421 4438                          mutex_enter(&pidlock);
4422 4439                          mutex_enter(&curproc->p_lock);
4423 4440  
4424 4441                          (void) parmsset(&pcparms, curthread);
4425 4442  
4426 4443                          mutex_exit(&curproc->p_lock);
4427 4444                          mutex_exit(&pidlock);
4428 4445                  } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
4429 4446                          /*
4430 4447                           * zsched always starts the init lwp at priority
4431 4448                           * minclsyspri - 1. This priority gets set in t_pri and
4432 4449                           * is invalid for RT, but RT never uses t_pri. However
4433 4450                           * t_pri is used by procfs, so we always see processes
4434 4451                           * within an RT zone with an invalid priority value.
4435 4452                           * We fix that up now.
4436 4453                           */
4437 4454                          curthread->t_pri = RTGPPRIO0;
4438 4455                  }
4439 4456                  mutex_exit(&class_lock);
4440 4457  
4441 4458                  /* cause the process to return to userland. */
4442 4459                  lwp_rtt();
4443 4460          }
4444 4461  }
4445 4462  
4446 4463  struct zsched_arg {
                   /* Zone that zsched() attaches itself to as its zsched process. */
4447 4464          zone_t *zone;
                   /* List of rctl settings that zsched() applies to the zone. */
4448 4465          nvlist_t *nvlist;
4449 4466  };
4450 4467  
4451 4468  /*
4452 4469   * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
4453 4470   * anything to do with scheduling, but rather with the fact that
4454 4471   * per-zone kernel threads are parented to zsched, just like regular
4455 4472   * kernel threads are parented to sched (p0).
4456 4473   *
4457 4474   * zsched is also responsible for launching init for the zone.
            *
            * 'arg' points to a struct zsched_arg naming the zone to adopt and the
            * nvlist of rctl values to apply to it.  In order, this function moves
            * the calling process into the zone (parentage, task/project accounting,
            * credentials, cwd and root), builds the zone's rctl set, sets the zone
            * status to INITIALIZED and then READY, waits for BOOTING (starting init
            * with newproc() if that state is observed), waits for DYING, and
            * finally calls exit().
4458 4475   */
4459 4476  static void
4460 4477  zsched(void *arg)
4461 4478  {
4462 4479          struct zsched_arg *za = arg;
4463 4480          proc_t *pp = curproc;
4464 4481          proc_t *initp = proc_init;
4465 4482          zone_t *zone = za->zone;
4466 4483          cred_t *cr, *oldcred;
4467 4484          rctl_set_t *set;
4468 4485          rctl_alloc_gp_t *gp;
4469 4486          contract_t *ct = NULL;
4470 4487          task_t *tk, *oldtk;
4471 4488          rctl_entity_p_t e;
4472 4489          kproject_t *pj;
4473 4490  
4474 4491          nvlist_t *nvl = za->nvlist;
4475 4492          nvpair_t *nvp = NULL;
4476 4493  
                   /*
                    * Name this process "zsched", clear the argument, environment and
                    * comm page pointers it inherited, and call closeall() on its file
                    * info.
                    */
4477 4494          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4478 4495          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4479 4496          PTOU(pp)->u_argc = 0;
4480 4497          PTOU(pp)->u_argv = 0;
4481 4498          PTOU(pp)->u_envp = 0;
4482 4499          PTOU(pp)->u_commpagep = 0;
4483 4500          closeall(P_FINFO(pp));
4484 4501  
4485 4502          /*
4486 4503           * We are this zone's "zsched" process.  As the zone isn't generally
4487 4504           * visible yet we don't need to grab any locks before initializing its
4488 4505           * zone_proc pointer.
4489 4506           */
4490 4507          zone_hold(zone);  /* this hold is released by zone_destroy() */
4491 4508          zone->zone_zsched = pp;
4492 4509          mutex_enter(&pp->p_lock);
4493 4510          pp->p_zone = zone;
4494 4511          mutex_exit(&pp->p_lock);
4495 4512  
4496 4513          /*
4497 4514           * Disassociate process from its 'parent'; parent ourselves to init
4498 4515           * (pid 1) and change other values as needed.
4499 4516           */
4500 4517          sess_create();
4501 4518  
4502 4519          mutex_enter(&pidlock);
4503 4520          proc_detach(pp);
4504 4521          pp->p_ppid = 1;
4505 4522          pp->p_flag |= SZONETOP;
4506 4523          pp->p_ancpid = 1;
4507 4524          pp->p_parent = initp;
                   /* Link ourselves in at the head of init's list of children. */
4508 4525          pp->p_psibling = NULL;
4509 4526          if (initp->p_child)
4510 4527                  initp->p_child->p_psibling = pp;
4511 4528          pp->p_sibling = initp->p_child;
4512 4529          initp->p_child = pp;
4513 4530  
4514 4531          /* Decrement what newproc() incremented. */
4515 4532          upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
4516 4533          /*
4517 4534           * Our credentials are about to become kcred-like, so we don't care
4518 4535           * about the caller's ruid.
4519 4536           */
4520 4537          upcount_inc(crgetruid(kcred), zone->zone_id);
4521 4538          mutex_exit(&pidlock);
4522 4539  
4523 4540          /*
4524 4541           * getting out of global zone, so decrement lwp and process counts
4525 4542           */
4526 4543          pj = pp->p_task->tk_proj;
4527 4544          mutex_enter(&global_zone->zone_nlwps_lock);
4528 4545          pj->kpj_nlwps -= pp->p_lwpcnt;
4529 4546          global_zone->zone_nlwps -= pp->p_lwpcnt;
4530 4547          pj->kpj_nprocs--;
4531 4548          global_zone->zone_nprocs--;
4532 4549          mutex_exit(&global_zone->zone_nlwps_lock);
4533 4550  
4534 4551          /*
4535 4552           * Decrement locked memory counts on old zone and project.
4536 4553           */
4537 4554          mutex_enter(&global_zone->zone_mem_lock);
4538 4555          global_zone->zone_locked_mem -= pp->p_locked_mem;
4539 4556          pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4540 4557          mutex_exit(&global_zone->zone_mem_lock);
4541 4558  
4542 4559          /*
4543 4560           * Create and join a new task in project '0' of this zone.
4544 4561           *
4545 4562           * We don't need to call holdlwps() since we know we're the only lwp in
4546 4563           * this process.
4547 4564           *
4548 4565           * task_join() returns with p_lock held.
4549 4566           */
4550 4567          tk = task_create(0, zone);
4551 4568          mutex_enter(&cpu_lock);
4552 4569          oldtk = task_join(tk, 0);
4553 4570  
                   /* Re-read the project: p_task now refers to the new task. */
4554 4571          pj = pp->p_task->tk_proj;
4555 4572  
4556 4573          mutex_enter(&zone->zone_mem_lock);
4557 4574          zone->zone_locked_mem += pp->p_locked_mem;
4558 4575          pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4559 4576          mutex_exit(&zone->zone_mem_lock);
4560 4577  
4561 4578          /*
4562 4579           * add lwp and process counts to zsched's zone, and increment
4563 4580           * project's task and process count due to the task created in
4564 4581           * the above task_create.
4565 4582           */
4566 4583          mutex_enter(&zone->zone_nlwps_lock);
4567 4584          pj->kpj_nlwps += pp->p_lwpcnt;
4568 4585          pj->kpj_ntasks += 1;
4569 4586          zone->zone_nlwps += pp->p_lwpcnt;
4570 4587          pj->kpj_nprocs++;
4571 4588          zone->zone_nprocs++;
4572 4589          mutex_exit(&zone->zone_nlwps_lock);
4573 4590  
                   /* Drop the p_lock that task_join() returned holding, then cpu_lock. */
4574 4591          mutex_exit(&curproc->p_lock);
4575 4592          mutex_exit(&cpu_lock);
4576 4593          task_rele(oldtk);
4577 4594  
4578 4595          /*
4579 4596           * The process was created by a process in the global zone, hence the
4580 4597           * credentials are wrong.  We might as well have kcred-ish credentials.
4581 4598           */
4582 4599          cr = zone->zone_kcred;
4583 4600          crhold(cr);
4584 4601          mutex_enter(&pp->p_crlock);
4585 4602          oldcred = pp->p_cred;
4586 4603          pp->p_cred = cr;
4587 4604          mutex_exit(&pp->p_crlock);
4588 4605          crfree(oldcred);
4589 4606  
4590 4607          /*
4591 4608           * Hold credentials again (for thread)
4592 4609           */
4593 4610          crhold(cr);
4594 4611  
4595 4612          /*
4596 4613           * p_lwpcnt can't change since this is a kernel process.
4597 4614           */
4598 4615          crset(pp, cr);
4599 4616  
4600 4617          /*
4601 4618           * Chroot
                    *
                    * Both the current directory and the root directory of this
                    * process are pointed at the zone's root vnode.
4602 4619           */
4603 4620          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
4604 4621          zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
4605 4622  
4606 4623          /*
4607 4624           * Initialize zone's rctl set.
4608 4625           */
4609 4626          set = rctl_set_create();
4610 4627          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
4611 4628          mutex_enter(&pp->p_lock);
4612 4629          e.rcep_p.zone = zone;
4613 4630          e.rcep_t = RCENTITY_ZONE;
4614 4631          zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
4615 4632          mutex_exit(&pp->p_lock);
4616 4633          rctl_prealloc_destroy(gp);
4617 4634  
4618 4635          /*
4619 4636           * Apply the rctls passed in to zone_create().  This is basically a list
4620 4637           * assignment: all of the old values are removed and the new ones
4621 4638           * inserted.  That is, if an empty list is passed in, all values are
4622 4639           * removed.
4623 4640           */
4624 4641          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4625 4642                  rctl_dict_entry_t *rde;
4626 4643                  rctl_hndl_t hndl;
4627 4644                  char *name;
4628 4645                  nvlist_t **nvlarray;
4629 4646                  uint_t i, nelem;
4630 4647                  int error;      /* For ASSERT()s */
4631 4648  
4632 4649                  name = nvpair_name(nvp);
4633 4650                  hndl = rctl_hndl_lookup(name);
4634 4651                  ASSERT(hndl != -1);
4635 4652                  rde = rctl_dict_lookup_hndl(hndl);
4636 4653                  ASSERT(rde != NULL);
4637 4654  
                           /*
                            * Repeatedly fetch the first local value of this rctl and
                            * delete it, stopping once the value fetched has system
                            * privilege.
                            */
4638 4655                  for (; /* ever */; ) {
4639 4656                          rctl_val_t oval;
4640 4657  
4641 4658                          mutex_enter(&pp->p_lock);
4642 4659                          error = rctl_local_get(hndl, NULL, &oval, pp);
4643 4660                          mutex_exit(&pp->p_lock);
4644 4661                          ASSERT(error == 0);     /* Can't fail for RCTL_FIRST */
4645 4662                          ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
4646 4663                          if (oval.rcv_privilege == RCPRIV_SYSTEM)
4647 4664                                  break;
4648 4665                          mutex_enter(&pp->p_lock);
4649 4666                          error = rctl_local_delete(hndl, &oval, pp);
4650 4667                          mutex_exit(&pp->p_lock);
4651 4668                          ASSERT(error == 0);
4652 4669                  }
4653 4670                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4654 4671                  ASSERT(error == 0);
                           /*
                            * Convert each supplied nvlist to an rctl_val_t and insert
                            * it.  Conversion results are only checked by ASSERT here.
                            */
4655 4672                  for (i = 0; i < nelem; i++) {
4656 4673                          rctl_val_t *nvalp;
4657 4674  
4658 4675                          nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
4659 4676                          error = nvlist2rctlval(nvlarray[i], nvalp);
4660 4677                          ASSERT(error == 0);
4661 4678                          /*
4662 4679                           * rctl_local_insert can fail if the value being
4663 4680                           * inserted is a duplicate; this is OK.
4664 4681                           */
4665 4682                          mutex_enter(&pp->p_lock);
4666 4683                          if (rctl_local_insert(hndl, nvalp, pp) != 0)
4667 4684                                  kmem_cache_free(rctl_val_cache, nvalp);
4668 4685                          mutex_exit(&pp->p_lock);
4669 4686                  }
4670 4687          }
4671 4688  
4672 4689          /*
4673 4690           * Tell the world that we're done setting up.
4674 4691           *
4675 4692           * At this point we want to set the zone status to ZONE_IS_INITIALIZED
4676 4693           * and atomically set the zone's processor set visibility.  Once
4677 4694           * we drop pool_lock() this zone will automatically get updated
4678 4695           * to reflect any future changes to the pools configuration.
4679 4696           *
4680 4697           * Note that after we drop the locks below (zonehash_lock in
4681 4698           * particular) other operations such as a zone_getattr call can
4682 4699           * now proceed and observe the zone. That is the reason for doing a
4683 4700           * state transition to the INITIALIZED state.
4684 4701           */
4685 4702          pool_lock();
4686 4703          mutex_enter(&cpu_lock);
4687 4704          mutex_enter(&zonehash_lock);
4688 4705          zone_uniqid(zone);
4689 4706          zone_zsd_configure(zone);
4690 4707          if (pool_state == POOL_ENABLED)
4691 4708                  zone_pset_set(zone, pool_default->pool_pset->pset_id);
4692 4709          mutex_enter(&zone_status_lock);
4693 4710          ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
4694 4711          zone_status_set(zone, ZONE_IS_INITIALIZED);
4695 4712          mutex_exit(&zone_status_lock);
4696 4713          mutex_exit(&zonehash_lock);
4697 4714          mutex_exit(&cpu_lock);
4698 4715          pool_unlock();
4699 4716  
4700 4717          /* Now call the create callback for this key */
4701 4718          zsd_apply_all_keys(zsd_apply_create, zone);
4702 4719  
4703 4720          /* The callbacks are complete. Mark ZONE_IS_READY */
4704 4721          mutex_enter(&zone_status_lock);
4705 4722          ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
4706 4723          zone_status_set(zone, ZONE_IS_READY);
4707 4724          mutex_exit(&zone_status_lock);
4708 4725  
4709 4726          /*
4710 4727           * Once we see the zone transition to the ZONE_IS_BOOTING state,
4711 4728           * we launch init, and set the state to running.
4712 4729           */
4713 4730          zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
4714 4731  
                   /*
                    * The status is re-checked after the wait; init is only started
                    * if the zone is exactly in the BOOTING state at this point.
                    */
4715 4732          if (zone_status_get(zone) == ZONE_IS_BOOTING) {
4716 4733                  id_t cid;
4717 4734  
4718 4735                  /*
4719 4736                   * Ok, this is a little complicated.  We need to grab the
4720 4737                   * zone's pool's scheduling class ID; note that by now, we
4721 4738                   * are already bound to a pool if we need to be (zoneadmd
4722 4739                   * will have done that to us while we're in the READY
4723 4740                   * state).  *But* the scheduling class for the zone's 'init'
4724 4741                   * must be explicitly passed to newproc, which doesn't
4725 4742                   * respect pool bindings.
4726 4743                   *
4727 4744                   * We hold the pool_lock across the call to newproc() to
4728 4745                   * close the obvious race: the pool's scheduling class
4729 4746                   * could change before we manage to create the LWP with
4730 4747                   * classid 'cid'.
4731 4748                   */
4732 4749                  pool_lock();
                           /*
                            * Class selection order: the zone's own default class if
                            * one is set, otherwise the pool's class, otherwise the
                            * system-wide defaultcid.
                            */
4733 4750                  if (zone->zone_defaultcid > 0)
4734 4751                          cid = zone->zone_defaultcid;
4735 4752                  else
4736 4753                          cid = pool_get_class(zone->zone_pool);
4737 4754                  if (cid == -1)
4738 4755                          cid = defaultcid;
4739 4756  
4740 4757                  /*
4741 4758                   * If this fails, zone_boot will ultimately fail.  The
4742 4759                   * state of the zone will be set to SHUTTING_DOWN-- userland
4743 4760                   * will have to tear down the zone, and fail, or try again.
4744 4761                   */
                           /*
                            * newproc() is given &ct; if ct is non-NULL after the
                            * DYING wait below it is passed to contract_abandon().
                            */
4745 4762                  if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
4746 4763                      minclsyspri - 1, &ct, 0)) != 0) {
4747 4764                          mutex_enter(&zone_status_lock);
4748 4765                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4749 4766                          mutex_exit(&zone_status_lock);
4750 4767                  } else {
4751 4768                          zone->zone_boot_time = gethrestime_sec();
4752 4769                  }
4753 4770  
4754 4771                  pool_unlock();
4755 4772          }
4756 4773  
4757 4774          /*
4758 4775           * Wait for zone_destroy() to be called.  This is what we spend
4759 4776           * most of our life doing.
4760 4777           */
4761 4778          zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
4762 4779  
4763 4780          if (ct)
4764 4781                  /*
4765 4782                   * At this point the process contract should be empty.
4766 4783                   * (Though if it isn't, it's not the end of the world.)
4767 4784                   */
4768 4785                  VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
4769 4786  
4770 4787          /*
4771 4788           * Allow kcred to be freed when all referring processes
4772 4789           * (including this one) go away.  We can't just do this in
4773 4790           * zone_free because we need to wait for the zone_cred_ref to
4774 4791           * drop to 0 before calling zone_free, and the existence of
4775 4792           * zone_kcred will prevent that.  Thus, we call crfree here to
4776 4793           * balance the crdup in zone_create.  The crhold calls earlier
4777 4794           * in zsched will be dropped when the thread and process exit.
4778 4795           */
4779 4796          crfree(zone->zone_kcred);
4780 4797          zone->zone_kcred = NULL;
4781 4798  
4782 4799          exit(CLD_EXITED, 0);
4783 4800  }
4784 4801  
4785 4802  /*
4786 4803   * Helper function to determine if there are any submounts of the
4787 4804   * provided path.  Used to make sure the zone doesn't "inherit" any
4788 4805   * mounts from before it is created.
4789 4806   */
4790 4807  static uint_t
4791 4808  zone_mount_count(const char *rootpath)
4792 4809  {
4793 4810          vfs_t *vfsp;
4794 4811          uint_t count = 0;
4795 4812          size_t rootpathlen = strlen(rootpath);
4796 4813  
4797 4814          /*
4798 4815           * Holding zonehash_lock prevents race conditions with
4799 4816           * vfs_list_add()/vfs_list_remove() since we serialize with
4800 4817           * zone_find_by_path().
4801 4818           */
4802 4819          ASSERT(MUTEX_HELD(&zonehash_lock));
4803 4820          /*
4804 4821           * The rootpath must end with a '/'
4805 4822           */
4806 4823          ASSERT(rootpath[rootpathlen - 1] == '/');
4807 4824  
4808 4825          /*
4809 4826           * This intentionally does not count the rootpath itself if that
4810 4827           * happens to be a mount point.
4811 4828           */
4812 4829          vfs_list_read_lock();
4813 4830          vfsp = rootvfs;
4814 4831          do {
4815 4832                  if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
4816 4833                      rootpathlen) == 0)
4817 4834                          count++;
4818 4835                  vfsp = vfsp->vfs_next;
4819 4836          } while (vfsp != rootvfs);
4820 4837          vfs_list_unlock();
4821 4838          return (count);
4822 4839  }
4823 4840  
4824 4841  /*
4825 4842   * Helper function to make sure that a zone created on 'rootpath'
4826 4843   * wouldn't end up containing other zones' rootpaths.
4827 4844   */
4828 4845  static boolean_t
4829 4846  zone_is_nested(const char *rootpath)
4830 4847  {
4831 4848          zone_t *zone;
4832 4849          size_t rootpathlen = strlen(rootpath);
4833 4850          size_t len;
4834 4851  
4835 4852          ASSERT(MUTEX_HELD(&zonehash_lock));
4836 4853  
4837 4854          /*
4838 4855           * zone_set_root() appended '/' and '\0' at the end of rootpath
4839 4856           */
4840 4857          if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
4841 4858              (rootpath[1] == '/') && (rootpath[2] == '\0'))
4842 4859                  return (B_TRUE);
4843 4860  
4844 4861          for (zone = list_head(&zone_active); zone != NULL;
4845 4862              zone = list_next(&zone_active, zone)) {
4846 4863                  if (zone == global_zone)
4847 4864                          continue;
4848 4865                  len = strlen(zone->zone_rootpath);
4849 4866                  if (strncmp(rootpath, zone->zone_rootpath,
4850 4867                      MIN(rootpathlen, len)) == 0)
4851 4868                          return (B_TRUE);
4852 4869          }
4853 4870          return (B_FALSE);
4854 4871  }
4855 4872  
4856 4873  static int
4857 4874  zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
4858 4875      size_t zone_privssz)
4859 4876  {
4860 4877          priv_set_t *privs;
4861 4878  
4862 4879          if (zone_privssz < sizeof (priv_set_t))
4863 4880                  return (ENOMEM);
4864 4881  
4865 4882          privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
4866 4883  
4867 4884          if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
4868 4885                  kmem_free(privs, sizeof (priv_set_t));
4869 4886                  return (EFAULT);
4870 4887          }
4871 4888  
4872 4889          zone->zone_privset = privs;
4873 4890          return (0);
4874 4891  }
4875 4892  
4876 4893  /*
4877 4894   * We make creative use of nvlists to pass in rctls from userland.  The list is
4878 4895   * a list of the following structures:
4879 4896   *
4880 4897   * (name = rctl_name, value = nvpair_list_array)
4881 4898   *
4882 4899   * Where each element of the nvpair_list_array is of the form:
4883 4900   *
4884 4901   * [(name = "privilege", value = RCPRIV_PRIVILEGED),
4885 4902   *      (name = "limit", value = uint64_t),
4886 4903   *      (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
4887 4904   */
4888 4905  static int
4889 4906  parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
4890 4907  {
4891 4908          nvpair_t *nvp = NULL;
4892 4909          nvlist_t *nvl = NULL;
4893 4910          char *kbuf;
4894 4911          int error;
4895 4912          rctl_val_t rv;
4896 4913  
4897 4914          *nvlp = NULL;
4898 4915  
4899 4916          if (buflen == 0)
4900 4917                  return (0);
4901 4918  
4902 4919          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4903 4920                  return (ENOMEM);
4904 4921          if (copyin(ubuf, kbuf, buflen)) {
4905 4922                  error = EFAULT;
4906 4923                  goto out;
4907 4924          }
4908 4925          if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4909 4926                  /*
4910 4927                   * nvl may have been allocated/free'd, but the value set to
4911 4928                   * non-NULL, so we reset it here.
4912 4929                   */
4913 4930                  nvl = NULL;
4914 4931                  error = EINVAL;
4915 4932                  goto out;
4916 4933          }
4917 4934          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4918 4935                  rctl_dict_entry_t *rde;
4919 4936                  rctl_hndl_t hndl;
4920 4937                  nvlist_t **nvlarray;
4921 4938                  uint_t i, nelem;
4922 4939                  char *name;
4923 4940  
4924 4941                  error = EINVAL;
4925 4942                  name = nvpair_name(nvp);
4926 4943                  if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4927 4944                      strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4928 4945                      nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4929 4946                          goto out;
4930 4947                  }
4931 4948                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4932 4949                          goto out;
4933 4950                  }
4934 4951                  rde = rctl_dict_lookup_hndl(hndl);
4935 4952                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4936 4953                  ASSERT(error == 0);
4937 4954                  for (i = 0; i < nelem; i++) {
4938 4955                          if (error = nvlist2rctlval(nvlarray[i], &rv))
4939 4956                                  goto out;
4940 4957                  }
4941 4958                  if (rctl_invalid_value(rde, &rv)) {
4942 4959                          error = EINVAL;
4943 4960                          goto out;
4944 4961                  }
4945 4962          }
4946 4963          error = 0;
4947 4964          *nvlp = nvl;
4948 4965  out:
4949 4966          kmem_free(kbuf, buflen);
4950 4967          if (error && nvl != NULL)
4951 4968                  nvlist_free(nvl);
4952 4969          return (error);
4953 4970  }
4954 4971  
4955 4972  int
4956 4973  zone_create_error(int er_error, int er_ext, int *er_out)
4957 4974  {
4958 4975          if (er_out != NULL) {
4959 4976                  if (copyout(&er_ext, er_out, sizeof (int))) {
4960 4977                          return (set_errno(EFAULT));
4961 4978                  }
4962 4979          }
4963 4980          return (set_errno(er_error));
4964 4981  }
4965 4982  
4966 4983  static int
4967 4984  zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4968 4985  {
4969 4986          ts_label_t *tsl;
4970 4987          bslabel_t blab;
4971 4988  
4972 4989          /* Get label from user */
4973 4990          if (copyin(lab, &blab, sizeof (blab)) != 0)
4974 4991                  return (EFAULT);
4975 4992          tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4976 4993          if (tsl == NULL)
4977 4994                  return (ENOMEM);
4978 4995  
4979 4996          zone->zone_slabel = tsl;
4980 4997          return (0);
4981 4998  }
4982 4999  
4983 5000  /*
4984 5001   * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
4985 5002   */
4986 5003  static int
4987 5004  parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
4988 5005  {
4989 5006          char *kbuf;
4990 5007          char *dataset, *next;
4991 5008          zone_dataset_t *zd;
4992 5009          size_t len;
4993 5010  
4994 5011          if (ubuf == NULL || buflen == 0)
4995 5012                  return (0);
4996 5013  
4997 5014          if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
4998 5015                  return (ENOMEM);
4999 5016  
5000 5017          if (copyin(ubuf, kbuf, buflen) != 0) {
5001 5018                  kmem_free(kbuf, buflen);
5002 5019                  return (EFAULT);
5003 5020          }
5004 5021  
5005 5022          dataset = next = kbuf;
5006 5023          for (;;) {
5007 5024                  zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
5008 5025  
5009 5026                  next = strchr(dataset, ',');
5010 5027  
5011 5028                  if (next == NULL)
5012 5029                          len = strlen(dataset);
5013 5030                  else
5014 5031                          len = next - dataset;
5015 5032  
5016 5033                  zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
5017 5034                  bcopy(dataset, zd->zd_dataset, len);
5018 5035                  zd->zd_dataset[len] = '\0';
5019 5036  
5020 5037                  list_insert_head(&zone->zone_datasets, zd);
5021 5038  
5022 5039                  if (next == NULL)
5023 5040                          break;
5024 5041  
5025 5042                  dataset = next + 1;
5026 5043          }
5027 5044  
5028 5045          kmem_free(kbuf, buflen);
5029 5046          return (0);
5030 5047  }
5031 5048  
5032 5049  /*
5033 5050   * System call to create/initialize a new zone named 'zone_name', rooted
5034 5051   * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
5035 5052   * and initialized with the zone-wide rctls described in 'rctlbuf', and
5036 5053   * with labeling set by 'match', 'doi', and 'label'.
5037 5054   *
5038 5055   * If extended error is non-null, we may use it to return more detailed
5039 5056   * error information.
5040 5057   */
5041 5058  static zoneid_t
5042 5059  zone_create(const char *zone_name, const char *zone_root,
5043 5060      const priv_set_t *zone_privs, size_t zone_privssz,
5044 5061      caddr_t rctlbuf, size_t rctlbufsz,
5045 5062      caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
5046 5063      int match, uint32_t doi, const bslabel_t *label,
5047 5064      int flags, zoneid_t zone_did)
5048 5065  {
5049 5066          struct zsched_arg zarg;
5050 5067          nvlist_t *rctls = NULL;
5051 5068          proc_t *pp = curproc;
5052 5069          zone_t *zone, *ztmp;
5053 5070          zoneid_t zoneid, start = GLOBAL_ZONEID;
5054 5071          int error;
5055 5072          int error2 = 0;
5056 5073          char *str;
5057 5074          cred_t *zkcr;
5058 5075          boolean_t insert_label_hash;
5059 5076  
5060 5077          if (secpolicy_zone_config(CRED()) != 0)
5061 5078                  return (set_errno(EPERM));
5062 5079  
5063 5080          /* can't boot zone from within chroot environment */
5064 5081          if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
5065 5082                  return (zone_create_error(ENOTSUP, ZE_CHROOTED,
5066 5083                      extended_error));
5067 5084          /*
5068 5085           * As the first step of zone creation, we want to allocate a zoneid.
5069 5086           * This allocation is complicated by the fact that netstacks use the
5070 5087           * zoneid to determine their stackid, but netstacks themselves are
5071 5088           * freed asynchronously with respect to zone destruction.  This means
5072 5089           * that a netstack reference leak (or in principle, an extraordinarily
5073 5090           * long netstack reference hold) could result in a zoneid being
5074 5091           * allocated that in fact corresponds to a stackid from an active
5075 5092           * (referenced) netstack -- unleashing all sorts of havoc when that
5076 5093           * netstack is actually (re)used.  (In the abstract, we might wish a
5077 5094           * zoneid to not be deallocated until its last referencing netstack
5078 5095           * has been released, but netstacks lack a backpointer into their
5079 5096           * referencing zone -- and changing them to have such a pointer would
5080 5097           * be substantial, to put it euphemistically.)  To avoid this, we
5081 5098           * detect this condition on allocation: if we have allocated a zoneid
5082 5099           * that corresponds to a netstack that's still in use, we warn about
5083 5100           * it (as it is much more likely to be a reference leak than an actual
5084 5101           * netstack reference), free it, and allocate another.  That these
5085 5102           * identifers are allocated out of an ID space assures that we won't
5086 5103           * see the identifier we just allocated.
5087 5104           */
5088 5105          for (;;) {
5089 5106                  zoneid = id_alloc(zoneid_space);
5090 5107  
5091 5108                  if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
5092 5109                          break;
5093 5110  
5094 5111                  id_free(zoneid_space, zoneid);
5095 5112  
5096 5113                  if (start == GLOBAL_ZONEID) {
5097 5114                          start = zoneid;
5098 5115                  } else if (zoneid == start) {
5099 5116                          /*
5100 5117                           * We have managed to iterate over the entire available
5101 5118                           * zoneid space -- there are no identifiers available,
5102 5119                           * presumably due to some number of leaked netstack
5103 5120                           * references.  While it's in principle possible for us
5104 5121                           * to continue to try, it seems wiser to give up at
5105 5122                           * this point to warn and fail explicitly with a
5106 5123                           * distinctive error.
5107 5124                           */
5108 5125                          cmn_err(CE_WARN, "zone_create() failed: all available "
5109 5126                              "zone IDs have netstacks still in use");
5110 5127                          return (set_errno(ENFILE));
5111 5128                  }
5112 5129  
5113 5130                  cmn_err(CE_WARN, "unable to reuse zone ID %d; "
5114 5131                      "netstack still in use", zoneid);
5115 5132          }
5116 5133  
5117 5134          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
5118 5135          zone->zone_id = zoneid;
5119 5136          zone->zone_did = zone_did;
5120 5137          zone->zone_status = ZONE_IS_UNINITIALIZED;
5121 5138          zone->zone_pool = pool_default;
5122 5139          zone->zone_pool_mod = gethrtime();
5123 5140          zone->zone_psetid = ZONE_PS_INVAL;
5124 5141          zone->zone_ncpus = 0;
5125 5142          zone->zone_ncpus_online = 0;
5126 5143          zone->zone_restart_init = B_TRUE;
5127 5144          zone->zone_reboot_on_init_exit = B_FALSE;
5128 5145          zone->zone_restart_init_0 = B_FALSE;
5129 5146          zone->zone_init_status = -1;
5130 5147          zone->zone_brand = &native_brand;
5131 5148          zone->zone_initname = NULL;
5132 5149          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
5133 5150          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
5134 5151          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
5135 5152          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
5136 5153          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
5137 5154              offsetof(zone_ref_t, zref_linkage));
5138 5155          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
5139 5156              offsetof(struct zsd_entry, zsd_linkage));
5140 5157          list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
5141 5158              offsetof(zone_dataset_t, zd_linkage));
5142 5159          list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
5143 5160              offsetof(zone_dl_t, zdl_linkage));
5144 5161          rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
5145 5162          rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
5146 5163  
5147 5164          if (flags & ZCF_NET_EXCL) {
5148 5165                  zone->zone_flags |= ZF_NET_EXCL;
5149 5166          }
5150 5167  
5151 5168          if ((error = zone_set_name(zone, zone_name)) != 0) {
5152 5169                  zone_free(zone);
5153 5170                  return (zone_create_error(error, 0, extended_error));
5154 5171          }
5155 5172  
5156 5173          if ((error = zone_set_root(zone, zone_root)) != 0) {
5157 5174                  zone_free(zone);
5158 5175                  return (zone_create_error(error, 0, extended_error));
5159 5176          }
5160 5177          if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
5161 5178                  zone_free(zone);
5162 5179                  return (zone_create_error(error, 0, extended_error));
5163 5180          }
5164 5181  
5165 5182          /* initialize node name to be the same as zone name */
5166 5183          zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5167 5184          (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
5168 5185          zone->zone_nodename[_SYS_NMLN - 1] = '\0';
5169 5186  
5170 5187          zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
5171 5188          zone->zone_domain[0] = '\0';
5172 5189          zone->zone_hostid = HW_INVALID_HOSTID;
5173 5190          zone->zone_shares = 1;
5174 5191          zone->zone_shmmax = 0;
5175 5192          zone->zone_ipc.ipcq_shmmni = 0;
5176 5193          zone->zone_ipc.ipcq_semmni = 0;
5177 5194          zone->zone_ipc.ipcq_msgmni = 0;
5178 5195          zone->zone_bootargs = NULL;
5179 5196          zone->zone_fs_allowed = NULL;
5180 5197  
5181 5198          psecflags_default(&zone->zone_secflags);
5182 5199  
5183 5200          zone->zone_initname =
5184 5201              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
5185 5202          (void) strcpy(zone->zone_initname, zone_default_initname);
5186 5203          zone->zone_nlwps = 0;
5187 5204          zone->zone_nlwps_ctl = INT_MAX;
5188 5205          zone->zone_nprocs = 0;
5189 5206          zone->zone_nprocs_ctl = INT_MAX;
5190 5207          zone->zone_locked_mem = 0;
5191 5208          zone->zone_locked_mem_ctl = UINT64_MAX;
5192 5209          zone->zone_max_swap = 0;
5193 5210          zone->zone_max_swap_ctl = UINT64_MAX;
5194 5211          zone->zone_max_lofi = 0;
5195 5212          zone->zone_max_lofi_ctl = UINT64_MAX;
5196 5213          zone->zone_lockedmem_kstat = NULL;
5197 5214          zone->zone_swapresv_kstat = NULL;
5198 5215          zone->zone_physmem_kstat = NULL;
5199 5216  
5200 5217          zone_pdata[zoneid].zpers_zfsp =
5201 5218              kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP);
5202 5219          zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1;
5203 5220  
5204 5221          zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
5205 5222  
5206 5223          /*
5207 5224           * Zsched initializes the rctls.
5208 5225           */
5209 5226          zone->zone_rctls = NULL;
5210 5227  
5211 5228          /*
5212 5229           * Ensure page count is 0 (in case zoneid has wrapped).
5213 5230           * Initialize physical memory cap as unlimited.
5214 5231           */
5215 5232          zone_pdata[zoneid].zpers_pg_cnt = 0;
5216 5233          zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX;
5217 5234  
5218 5235          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
5219 5236                  zone_free(zone);
5220 5237                  return (zone_create_error(error, 0, extended_error));
5221 5238          }
5222 5239  
5223 5240          if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
5224 5241                  zone_free(zone);
5225 5242                  return (set_errno(error));
5226 5243          }
5227 5244  
5228 5245          /*
5229 5246           * Read in the trusted system parameters:
5230 5247           * match flag and sensitivity label.
5231 5248           */
5232 5249          zone->zone_match = match;
5233 5250          if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5234 5251                  /* Fail if requested to set doi to anything but system's doi */
5235 5252                  if (doi != 0 && doi != default_doi) {
5236 5253                          zone_free(zone);
5237 5254                          return (set_errno(EINVAL));
5238 5255                  }
5239 5256                  /* Always apply system's doi to the zone */
5240 5257                  error = zone_set_label(zone, label, default_doi);
5241 5258                  if (error != 0) {
5242 5259                          zone_free(zone);
5243 5260                          return (set_errno(error));
5244 5261                  }
5245 5262                  insert_label_hash = B_TRUE;
5246 5263          } else {
5247 5264                  /* all zones get an admin_low label if system is not labeled */
5248 5265                  zone->zone_slabel = l_admin_low;
5249 5266                  label_hold(l_admin_low);
5250 5267                  insert_label_hash = B_FALSE;
5251 5268          }
5252 5269  
5253 5270          /*
5254 5271           * Stop all lwps since that's what normally happens as part of fork().
5255 5272           * This needs to happen before we grab any locks to avoid deadlock
5256 5273           * (another lwp in the process could be waiting for the held lock).
5257 5274           */
5258 5275          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
5259 5276                  zone_free(zone);
5260 5277                  nvlist_free(rctls);
5261 5278                  return (zone_create_error(error, 0, extended_error));
5262 5279          }
5263 5280  
5264 5281          if (block_mounts(zone) == 0) {
5265 5282                  mutex_enter(&pp->p_lock);
5266 5283                  if (curthread != pp->p_agenttp)
5267 5284                          continuelwps(pp);
5268 5285                  mutex_exit(&pp->p_lock);
5269 5286                  zone_free(zone);
5270 5287                  nvlist_free(rctls);
5271 5288                  return (zone_create_error(error, 0, extended_error));
5272 5289          }
5273 5290  
5274 5291          /*
5275 5292           * Set up credential for kernel access.  After this, any errors
5276 5293           * should go through the dance in errout rather than calling
5277 5294           * zone_free directly.
5278 5295           */
5279 5296          zone->zone_kcred = crdup(kcred);
5280 5297          crsetzone(zone->zone_kcred, zone);
5281 5298          priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
5282 5299          priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
5283 5300          priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
5284 5301          priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
5285 5302  
5286 5303          mutex_enter(&zonehash_lock);
5287 5304          /*
5288 5305           * Make sure zone doesn't already exist.
5289 5306           *
5290 5307           * If the system and zone are labeled,
5291 5308           * make sure no other zone exists that has the same label.
5292 5309           */
5293 5310          if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
5294 5311              (insert_label_hash &&
5295 5312              (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
5296 5313                  zone_status_t status;
5297 5314  
5298 5315                  status = zone_status_get(ztmp);
5299 5316                  if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
5300 5317                          error = EEXIST;
5301 5318                  else
5302 5319                          error = EBUSY;
5303 5320  
5304 5321                  if (insert_label_hash)
5305 5322                          error2 = ZE_LABELINUSE;
5306 5323  
5307 5324                  goto errout;
5308 5325          }
5309 5326  
5310 5327          /*
5311 5328           * Don't allow zone creations which would cause one zone's rootpath to
5312 5329           * be accessible from that of another (non-global) zone.
5313 5330           */
5314 5331          if (zone_is_nested(zone->zone_rootpath)) {
5315 5332                  error = EBUSY;
5316 5333                  goto errout;
5317 5334          }
5318 5335  
5319 5336          ASSERT(zonecount != 0);         /* check for leaks */
5320 5337          if (zonecount + 1 > maxzones) {
5321 5338                  error = ENOMEM;
5322 5339                  goto errout;
5323 5340          }
5324 5341  
5325 5342          if (zone_mount_count(zone->zone_rootpath) != 0) {
5326 5343                  error = EBUSY;
5327 5344                  error2 = ZE_AREMOUNTS;
5328 5345                  goto errout;
5329 5346          }
5330 5347  
5331 5348          /*
5332 5349           * Zone is still incomplete, but we need to drop all locks while
5333 5350           * zsched() initializes this zone's kernel process.  We
5334 5351           * optimistically add the zone to the hashtable and associated
5335 5352           * lists so a parallel zone_create() doesn't try to create the
5336 5353           * same zone.
5337 5354           */
5338 5355          zonecount++;
5339 5356          (void) mod_hash_insert(zonehashbyid,
5340 5357              (mod_hash_key_t)(uintptr_t)zone->zone_id,
5341 5358              (mod_hash_val_t)(uintptr_t)zone);
5342 5359          str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
5343 5360          (void) strcpy(str, zone->zone_name);
5344 5361          (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
5345 5362              (mod_hash_val_t)(uintptr_t)zone);
5346 5363          if (insert_label_hash) {
5347 5364                  (void) mod_hash_insert(zonehashbylabel,
5348 5365                      (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5349 5366                  zone->zone_flags |= ZF_HASHED_LABEL;
5350 5367          }
5351 5368  
5352 5369          /*
5353 5370           * Insert into active list.  At this point there are no 'hold's
5354 5371           * on the zone, but everyone else knows not to use it, so we can
5355 5372           * continue to use it.  zsched() will do a zone_hold() if the
5356 5373           * newproc() is successful.
5357 5374           */
5358 5375          list_insert_tail(&zone_active, zone);
5359 5376          mutex_exit(&zonehash_lock);
5360 5377  
5361 5378          zarg.zone = zone;
5362 5379          zarg.nvlist = rctls;
5363 5380          /*
5364 5381           * The process, task, and project rctls are probably wrong;
5365 5382           * we need an interface to get the default values of all rctls,
5366 5383           * and initialize zsched appropriately. However, we allow zoneadmd
5367 5384           * to pass down both zone and project rctls for the zone's init.
5368 5385           */
5369 5386          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5370 5387          if (error != 0) {
5371 5388                  /*
5372 5389                   * We need to undo all globally visible state.
5373 5390                   */
5374 5391                  mutex_enter(&zonehash_lock);
5375 5392                  list_remove(&zone_active, zone);
5376 5393                  if (zone->zone_flags & ZF_HASHED_LABEL) {
5377 5394                          ASSERT(zone->zone_slabel != NULL);
5378 5395                          (void) mod_hash_destroy(zonehashbylabel,
5379 5396                              (mod_hash_key_t)zone->zone_slabel);
5380 5397                  }
5381 5398                  (void) mod_hash_destroy(zonehashbyname,
5382 5399                      (mod_hash_key_t)(uintptr_t)zone->zone_name);
5383 5400                  (void) mod_hash_destroy(zonehashbyid,
5384 5401                      (mod_hash_key_t)(uintptr_t)zone->zone_id);
5385 5402                  ASSERT(zonecount > 1);
5386 5403                  zonecount--;
5387 5404                  goto errout;
5388 5405          }
5389 5406  
5390 5407          /*
5391 5408           * Zone creation can't fail from now on.
5392 5409           */
5393 5410  
5394 5411          /*
5395 5412           * Create zone kstats
5396 5413           */
5397 5414          zone_kstat_create(zone);
5398 5415  
5399 5416          /*
5400 5417           * Let the other lwps continue.
5401 5418           */
5402 5419          mutex_enter(&pp->p_lock);
5403 5420          if (curthread != pp->p_agenttp)
5404 5421                  continuelwps(pp);
5405 5422          mutex_exit(&pp->p_lock);
5406 5423  
5407 5424          /*
5408 5425           * Wait for zsched to finish initializing the zone.
5409 5426           */
5410 5427          zone_status_wait(zone, ZONE_IS_READY);
5411 5428          /*
5412 5429           * The zone is fully visible, so we can let mounts progress.
5413 5430           */
5414 5431          resume_mounts(zone);
5415 5432          nvlist_free(rctls);
5416 5433  
5417 5434          return (zoneid);
5418 5435  
5419 5436  errout:
5420 5437          mutex_exit(&zonehash_lock);
5421 5438          /*
5422 5439           * Let the other lwps continue.
5423 5440           */
5424 5441          mutex_enter(&pp->p_lock);
5425 5442          if (curthread != pp->p_agenttp)
5426 5443                  continuelwps(pp);
5427 5444          mutex_exit(&pp->p_lock);
5428 5445  
5429 5446          resume_mounts(zone);
5430 5447          nvlist_free(rctls);
5431 5448          /*
5432 5449           * There is currently one reference to the zone, a cred_ref from
5433 5450           * zone_kcred.  To free the zone, we call crfree, which will call
5434 5451           * zone_cred_rele, which will call zone_free.
5435 5452           */
5436 5453          ASSERT(zone->zone_cred_ref == 1);
5437 5454          ASSERT(zone->zone_kcred->cr_ref == 1);
5438 5455          ASSERT(zone->zone_ref == 0);
5439 5456          zkcr = zone->zone_kcred;
5440 5457          zone->zone_kcred = NULL;
5441 5458          crfree(zkcr);                           /* triggers call to zone_free */
5442 5459          return (zone_create_error(error, error2, extended_error));
5443 5460  }
5444 5461  
5445 5462  /*
5446 5463   * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
5447 5464   * the heavy lifting.  initname is the path to the program to launch
5448 5465   * at the "top" of the zone; if this is NULL, we use the system default,
5449 5466   * which is stored at zone_default_initname.
5450 5467   */
5451 5468  static int
5452 5469  zone_boot(zoneid_t zoneid)
5453 5470  {
5454 5471          int err;
5455 5472          zone_t *zone;
5456 5473  
5457 5474          if (secpolicy_zone_config(CRED()) != 0)
5458 5475                  return (set_errno(EPERM));
5459 5476          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5460 5477                  return (set_errno(EINVAL));
5461 5478  
5462 5479          mutex_enter(&zonehash_lock);
5463 5480          /*
5464 5481           * Look for zone under hash lock to prevent races with calls to
5465 5482           * zone_shutdown, zone_destroy, etc.
5466 5483           */
5467 5484          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5468 5485                  mutex_exit(&zonehash_lock);
5469 5486                  return (set_errno(EINVAL));
5470 5487          }
5471 5488  
5472 5489          mutex_enter(&zone_status_lock);
5473 5490          if (zone_status_get(zone) != ZONE_IS_READY) {
5474 5491                  mutex_exit(&zone_status_lock);
5475 5492                  mutex_exit(&zonehash_lock);
5476 5493                  return (set_errno(EINVAL));
5477 5494          }
5478 5495          zone_status_set(zone, ZONE_IS_BOOTING);
5479 5496          mutex_exit(&zone_status_lock);
5480 5497  
5481 5498          zone_hold(zone);        /* so we can use the zone_t later */
5482 5499          mutex_exit(&zonehash_lock);
5483 5500  
5484 5501          if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
5485 5502                  zone_rele(zone);
5486 5503                  return (set_errno(EINTR));
5487 5504          }
5488 5505  
5489 5506          /*
5490 5507           * Boot (starting init) might have failed, in which case the zone
5491 5508           * will go to the SHUTTING_DOWN state; an appropriate errno will
5492 5509           * be placed in zone->zone_boot_err, and so we return that.
5493 5510           */
5494 5511          err = zone->zone_boot_err;
5495 5512          zone_rele(zone);
5496 5513          return (err ? set_errno(err) : 0);
5497 5514  }
5498 5515  
5499 5516  /*
5500 5517   * Kills all user processes in the zone, waiting for them all to exit
5501 5518   * before returning.
5502 5519   */
5503 5520  static int
5504 5521  zone_empty(zone_t *zone)
5505 5522  {
5506 5523          int cnt = 0;
5507 5524          int waitstatus;
5508 5525  
5509 5526          /*
5510 5527           * We need to drop zonehash_lock before killing all
5511 5528           * processes, otherwise we'll deadlock with zone_find_*
5512 5529           * which can be called from the exit path.
5513 5530           */
5514 5531          ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5515 5532          while ((waitstatus = zone_status_timedwait_sig(zone,
5516 5533              ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5517 5534                  boolean_t force = B_FALSE;
5518 5535  
5519 5536                  /* Every 30 seconds, try harder */
5520 5537                  if (cnt++ >= 30) {
5521 5538                          cmn_err(CE_WARN, "attempt to force kill zone %d\n",
5522 5539                              zone->zone_id);
5523 5540                          force = B_TRUE;
5524 5541                          cnt = 0;
5525 5542                  }
5526 5543                  killall(zone->zone_id, force);
5527 5544          }
5528 5545          /*
5529 5546           * return EINTR if we were signaled
5530 5547           */
5531 5548          if (waitstatus == 0)
5532 5549                  return (EINTR);
5533 5550          return (0);
5534 5551  }
5535 5552  
5536 5553  /*
5537 5554   * This function implements the policy for zone visibility.
5538 5555   *
5539 5556   * In standard Solaris, a non-global zone can only see itself.
5540 5557   *
5541 5558   * In Trusted Extensions, a labeled zone can lookup any zone whose label
5542 5559   * it dominates. For this test, the label of the global zone is treated as
5543 5560   * admin_high so it is special-cased instead of being checked for dominance.
5544 5561   *
5545 5562   * Returns true if zone attributes are viewable, false otherwise.
5546 5563   */
5547 5564  static boolean_t
5548 5565  zone_list_access(zone_t *zone)
5549 5566  {
5550 5567  
5551 5568          if (curproc->p_zone == global_zone ||
5552 5569              curproc->p_zone == zone) {
5553 5570                  return (B_TRUE);
5554 5571          } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
5555 5572                  bslabel_t *curproc_label;
5556 5573                  bslabel_t *zone_label;
5557 5574  
5558 5575                  curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
5559 5576                  zone_label = label2bslabel(zone->zone_slabel);
5560 5577  
5561 5578                  if (zone->zone_id != GLOBAL_ZONEID &&
5562 5579                      bldominates(curproc_label, zone_label)) {
5563 5580                          return (B_TRUE);
5564 5581                  } else {
5565 5582                          return (B_FALSE);
5566 5583                  }
5567 5584          } else {
5568 5585                  return (B_FALSE);
5569 5586          }
5570 5587  }
5571 5588  
5572 5589  /*
5573 5590   * Systemcall to start the zone's halt sequence.  By the time this
5574 5591   * function successfully returns, all user processes and kernel threads
5575 5592   * executing in it will have exited, ZSD shutdown callbacks executed,
5576 5593   * and the zone status set to ZONE_IS_DOWN.
5577 5594   *
5578 5595   * It is possible that the call will interrupt itself if the caller is the
5579 5596   * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
5580 5597   */
5581 5598  static int
5582 5599  zone_shutdown(zoneid_t zoneid)
5583 5600  {
5584 5601          int error;
5585 5602          zone_t *zone;
5586 5603          zone_status_t status;
5587 5604  
5588 5605          if (secpolicy_zone_config(CRED()) != 0)
5589 5606                  return (set_errno(EPERM));
5590 5607          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5591 5608                  return (set_errno(EINVAL));
5592 5609  
5593 5610          mutex_enter(&zonehash_lock);
5594 5611          /*
5595 5612           * Look for zone under hash lock to prevent races with other
5596 5613           * calls to zone_shutdown and zone_destroy.
5597 5614           */
5598 5615          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5599 5616                  mutex_exit(&zonehash_lock);
5600 5617                  return (set_errno(EINVAL));
5601 5618          }
5602 5619  
5603 5620          /*
5604 5621           * We have to drop zonehash_lock before calling block_mounts.
5605 5622           * Hold the zone so we can continue to use the zone_t.
5606 5623           */
5607 5624          zone_hold(zone);
5608 5625          mutex_exit(&zonehash_lock);
5609 5626  
5610 5627          /*
5611 5628           * Block mounts so that VFS_MOUNT() can get an accurate view of
5612 5629           * the zone's status with regards to ZONE_IS_SHUTTING down.
5613 5630           *
5614 5631           * e.g. NFS can fail the mount if it determines that the zone
5615 5632           * has already begun the shutdown sequence.
5616 5633           *
5617 5634           */
5618 5635          if (block_mounts(zone) == 0) {
5619 5636                  zone_rele(zone);
5620 5637                  return (set_errno(EINTR));
5621 5638          }
5622 5639  
5623 5640          mutex_enter(&zonehash_lock);
5624 5641          mutex_enter(&zone_status_lock);
5625 5642          status = zone_status_get(zone);
5626 5643          /*
5627 5644           * Fail if the zone isn't fully initialized yet.
5628 5645           */
5629 5646          if (status < ZONE_IS_READY) {
5630 5647                  mutex_exit(&zone_status_lock);
5631 5648                  mutex_exit(&zonehash_lock);
5632 5649                  resume_mounts(zone);
5633 5650                  zone_rele(zone);
5634 5651                  return (set_errno(EINVAL));
5635 5652          }
5636 5653          /*
5637 5654           * If conditions required for zone_shutdown() to return have been met,
5638 5655           * return success.
5639 5656           */
5640 5657          if (status >= ZONE_IS_DOWN) {
5641 5658                  mutex_exit(&zone_status_lock);
5642 5659                  mutex_exit(&zonehash_lock);
5643 5660                  resume_mounts(zone);
5644 5661                  zone_rele(zone);
5645 5662                  return (0);
5646 5663          }
5647 5664          /*
5648 5665           * If zone_shutdown() hasn't been called before, go through the motions.
5649 5666           * If it has, there's nothing to do but wait for the kernel threads to
5650 5667           * drain.
5651 5668           */
5652 5669          if (status < ZONE_IS_EMPTY) {
5653 5670                  uint_t ntasks;
5654 5671  
5655 5672                  mutex_enter(&zone->zone_lock);
5656 5673                  if ((ntasks = zone->zone_ntasks) != 1) {
5657 5674                          /*
5658 5675                           * There's still stuff running.
5659 5676                           */
5660 5677                          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5661 5678                  }
5662 5679                  mutex_exit(&zone->zone_lock);
5663 5680                  if (ntasks == 1) {
5664 5681                          /*
5665 5682                           * The only way to create another task is through
5666 5683                           * zone_enter(), which will block until we drop
5667 5684                           * zonehash_lock.  The zone is empty.
5668 5685                           */
5669 5686                          if (zone->zone_kthreads == NULL) {
5670 5687                                  /*
5671 5688                                   * Skip ahead to ZONE_IS_DOWN
5672 5689                                   */
5673 5690                                  zone_status_set(zone, ZONE_IS_DOWN);
5674 5691                          } else {
5675 5692                                  zone_status_set(zone, ZONE_IS_EMPTY);
5676 5693                          }
5677 5694                  }
5678 5695          }
5679 5696          mutex_exit(&zone_status_lock);
5680 5697          mutex_exit(&zonehash_lock);
5681 5698          resume_mounts(zone);
5682 5699  
5683 5700          if (error = zone_empty(zone)) {
5684 5701                  zone_rele(zone);
5685 5702                  return (set_errno(error));
5686 5703          }
5687 5704          /*
5688 5705           * After the zone status goes to ZONE_IS_DOWN this zone will no
5689 5706           * longer be notified of changes to the pools configuration, so
5690 5707           * in order to not end up with a stale pool pointer, we point
5691 5708           * ourselves at the default pool and remove all resource
5692 5709           * visibility.  This is especially important as the zone_t may
5693 5710           * languish on the deathrow for a very long time waiting for
5694 5711           * cred's to drain out.
5695 5712           *
5696 5713           * This rebinding of the zone can happen multiple times
5697 5714           * (presumably due to interrupted or parallel systemcalls)
5698 5715           * without any adverse effects.
5699 5716           */
5700 5717          if (pool_lock_intr() != 0) {
5701 5718                  zone_rele(zone);
5702 5719                  return (set_errno(EINTR));
5703 5720          }
5704 5721          if (pool_state == POOL_ENABLED) {
5705 5722                  mutex_enter(&cpu_lock);
5706 5723                  zone_pool_set(zone, pool_default);
5707 5724                  /*
5708 5725                   * The zone no longer needs to be able to see any cpus.
5709 5726                   */
5710 5727                  zone_pset_set(zone, ZONE_PS_INVAL);
5711 5728                  mutex_exit(&cpu_lock);
5712 5729          }
5713 5730          pool_unlock();
5714 5731  
5715 5732          /*
5716 5733           * ZSD shutdown callbacks can be executed multiple times, hence
5717 5734           * it is safe to not be holding any locks across this call.
5718 5735           */
5719 5736          zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
5720 5737  
5721 5738          mutex_enter(&zone_status_lock);
5722 5739          if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
5723 5740                  zone_status_set(zone, ZONE_IS_DOWN);
5724 5741          mutex_exit(&zone_status_lock);
5725 5742  
5726 5743          /*
5727 5744           * Wait for kernel threads to drain.
5728 5745           */
5729 5746          if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
5730 5747                  zone_rele(zone);
5731 5748                  return (set_errno(EINTR));
5732 5749          }
5733 5750  
5734 5751          /*
5735 5752           * Zone can become down/destroyable even if the above wait
5736 5753           * returns EINTR, so any code added here may never execute.
5737 5754           * (i.e. don't add code here)
5738 5755           */
5739 5756  
5740 5757          zone_rele(zone);
5741 5758          return (0);
5742 5759  }
5743 5760  
5744 5761  /*
5745 5762   * Log the specified zone's reference counts.  The caller should not be
5746 5763   * holding the zone's zone_lock.
5747 5764   */
5748 5765  static void
5749 5766  zone_log_refcounts(zone_t *zone)
5750 5767  {
5751 5768          char *buffer;
5752 5769          char *buffer_position;
5753 5770          uint32_t buffer_size;
5754 5771          uint32_t index;
5755 5772          uint_t ref;
5756 5773          uint_t cred_ref;
5757 5774  
5758 5775          /*
5759 5776           * Construct a string representing the subsystem-specific reference
5760 5777           * counts.  The counts are printed in ascending order by index into the
5761 5778           * zone_t::zone_subsys_ref array.  The list will be surrounded by
5762 5779           * square brackets [] and will only contain nonzero reference counts.
5763 5780           *
5764 5781           * The buffer will hold two square bracket characters plus ten digits,
5765 5782           * one colon, one space, one comma, and some characters for a
5766 5783           * subsystem name per subsystem-specific reference count.  (Unsigned 32-
5767 5784           * bit integers have at most ten decimal digits.)  The last
5768 5785           * reference count's comma is replaced by the closing square
5769 5786           * bracket and a NULL character to terminate the string.
5770 5787           *
5771 5788           * NOTE: We have to grab the zone's zone_lock to create a consistent
5772 5789           * snapshot of the zone's reference counters.
5773 5790           *
5774 5791           * First, figure out how much space the string buffer will need.
5775 5792           * The buffer's size is stored in buffer_size.
5776 5793           */
5777 5794          buffer_size = 2;                        /* for the square brackets */
5778 5795          mutex_enter(&zone->zone_lock);
5779 5796          zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
5780 5797          ref = zone->zone_ref;
5781 5798          cred_ref = zone->zone_cred_ref;
5782 5799          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
5783 5800                  if (zone->zone_subsys_ref[index] != 0)
5784 5801                          buffer_size += strlen(zone_ref_subsys_names[index]) +
5785 5802                              13;
5786 5803          if (buffer_size == 2) {
5787 5804                  /*
5788 5805                   * No subsystems had nonzero reference counts.  Don't bother
5789 5806                   * with allocating a buffer; just log the general-purpose and
5790 5807                   * credential reference counts.
5791 5808                   */
5792 5809                  mutex_exit(&zone->zone_lock);
5793 5810                  (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5794 5811                      "Zone '%s' (ID: %d) is shutting down, but %u zone "
5795 5812                      "references and %u credential references are still extant",
5796 5813                      zone->zone_name, zone->zone_id, ref, cred_ref);
5797 5814                  return;
5798 5815          }
5799 5816  
5800 5817          /*
5801 5818           * buffer_size contains the exact number of characters that the
5802 5819           * buffer will need.  Allocate the buffer and fill it with nonzero
5803 5820           * subsystem-specific reference counts.  Surround the results with
5804 5821           * square brackets afterwards.
5805 5822           */
5806 5823          buffer = kmem_alloc(buffer_size, KM_SLEEP);
5807 5824          buffer_position = &buffer[1];
5808 5825          for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
5809 5826                  /*
5810 5827                   * NOTE: The DDI's version of sprintf() returns a pointer to
5811 5828                   * the modified buffer rather than the number of bytes written
5812 5829                   * (as in snprintf(3C)).  This is unfortunate and annoying.
5813 5830                   * Therefore, we'll use snprintf() with INT_MAX to get the
5814 5831                   * number of bytes written.  Using INT_MAX is safe because
5815 5832                   * the buffer is perfectly sized for the data: we'll never
5816 5833                   * overrun the buffer.
5817 5834                   */
5818 5835                  if (zone->zone_subsys_ref[index] != 0)
5819 5836                          buffer_position += snprintf(buffer_position, INT_MAX,
5820 5837                              "%s: %u,", zone_ref_subsys_names[index],
5821 5838                              zone->zone_subsys_ref[index]);
5822 5839          }
5823 5840          mutex_exit(&zone->zone_lock);
5824 5841          buffer[0] = '[';
5825 5842          ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
5826 5843          ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
5827 5844          buffer_position[-1] = ']';
5828 5845  
5829 5846          /*
5830 5847           * Log the reference counts and free the message buffer.
5831 5848           */
5832 5849          (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
5833 5850              "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
5834 5851              "%u credential references are still extant %s", zone->zone_name,
5835 5852              zone->zone_id, ref, cred_ref, buffer);
5836 5853          kmem_free(buffer, buffer_size);
5837 5854  }
5838 5855  
5839 5856  /*
5840 5857   * Systemcall entry point to finalize the zone halt process.  The caller
5841 5858   * must have already successfully called zone_shutdown().
5842 5859   *
5843 5860   * Upon successful completion, the zone will have been fully destroyed:
5844 5861   * zsched will have exited, destructor callbacks executed, and the zone
5845 5862   * removed from the list of active zones.
5846 5863   */
5847 5864  static int
5848 5865  zone_destroy(zoneid_t zoneid)
5849 5866  {
5850 5867          uint64_t uniqid;
5851 5868          zone_t *zone;
5852 5869          zone_status_t status;
5853 5870          clock_t wait_time;
5854 5871          boolean_t log_refcounts;
5855 5872          zone_persist_t *zp;
5856 5873  
5857 5874          if (secpolicy_zone_config(CRED()) != 0)
5858 5875                  return (set_errno(EPERM));
5859 5876          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5860 5877                  return (set_errno(EINVAL));
5861 5878  
5862 5879          mutex_enter(&zonehash_lock);
5863 5880          /*
5864 5881           * Look for zone under hash lock to prevent races with other
5865 5882           * calls to zone_destroy.
5866 5883           */
5867 5884          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5868 5885                  mutex_exit(&zonehash_lock);
5869 5886                  return (set_errno(EINVAL));
5870 5887          }
5871 5888  
5872 5889          if (zone_mount_count(zone->zone_rootpath) != 0) {
5873 5890                  mutex_exit(&zonehash_lock);
5874 5891                  return (set_errno(EBUSY));
5875 5892          }
5876 5893          mutex_enter(&zone_status_lock);
5877 5894          status = zone_status_get(zone);
5878 5895          if (status < ZONE_IS_DOWN) {
5879 5896                  mutex_exit(&zone_status_lock);
5880 5897                  mutex_exit(&zonehash_lock);
5881 5898                  return (set_errno(EBUSY));
5882 5899          } else if (status == ZONE_IS_DOWN) {
5883 5900                  zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
5884 5901          }
5885 5902          mutex_exit(&zone_status_lock);
5886 5903          zone_hold(zone);
5887 5904          mutex_exit(&zonehash_lock);
5888 5905  
5889 5906          zp = &zone_pdata[zoneid];
5890 5907          mutex_enter(&zp->zpers_zfs_lock);
5891 5908          kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t));
5892 5909          zp->zpers_zfsp = NULL;
5893 5910          mutex_exit(&zp->zpers_zfs_lock);
5894 5911  
5895 5912          /*
5896 5913           * wait for zsched to exit
5897 5914           */
5898 5915          zone_status_wait(zone, ZONE_IS_DEAD);
5899 5916          zone_zsd_callbacks(zone, ZSD_DESTROY);
5900 5917          zone->zone_netstack = NULL;
5901 5918          uniqid = zone->zone_uniqid;
5902 5919          zone_rele(zone);
5903 5920          zone = NULL;    /* potentially free'd */
5904 5921  
5905 5922          log_refcounts = B_FALSE;
5906 5923          wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
5907 5924          mutex_enter(&zonehash_lock);
5908 5925          for (; /* ever */; ) {
5909 5926                  boolean_t unref;
5910 5927                  boolean_t refs_have_been_logged;
5911 5928  
5912 5929                  if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
5913 5930                      zone->zone_uniqid != uniqid) {
5914 5931                          /*
5915 5932                           * The zone has gone away.  Necessary conditions
5916 5933                           * are met, so we return success.
5917 5934                           */
5918 5935                          mutex_exit(&zonehash_lock);
5919 5936                          return (0);
5920 5937                  }
5921 5938                  mutex_enter(&zone->zone_lock);
5922 5939                  unref = ZONE_IS_UNREF(zone);
5923 5940                  refs_have_been_logged = (zone->zone_flags &
5924 5941                      ZF_REFCOUNTS_LOGGED);
5925 5942                  mutex_exit(&zone->zone_lock);
5926 5943                  if (unref) {
5927 5944                          /*
5928 5945                           * There is only one reference to the zone -- that
5929 5946                           * added when the zone was added to the hashtables --
5930 5947                           * and things will remain this way until we drop
5931 5948                           * zonehash_lock... we can go ahead and cleanup the
5932 5949                           * zone.
5933 5950                           */
5934 5951                          break;
5935 5952                  }
5936 5953  
5937 5954                  /*
5938 5955                   * Wait for zone_rele_common() or zone_cred_rele() to signal
5939 5956                   * zone_destroy_cv.  zone_destroy_cv is signaled only when
5940 5957                   * some zone's general-purpose reference count reaches one.
5941 5958                   * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
5942 5959                   * on zone_destroy_cv, then log the zone's reference counts and
5943 5960                   * continue to wait for zone_rele() and zone_cred_rele().
5944 5961                   */
5945 5962                  if (!refs_have_been_logged) {
5946 5963                          if (!log_refcounts) {
5947 5964                                  /*
5948 5965                                   * This thread hasn't timed out waiting on
5949 5966                                   * zone_destroy_cv yet.  Wait wait_time clock
5950 5967                                   * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
5951 5968                                   * seconds) for the zone's references to clear.
5952 5969                                   */
5953 5970                                  ASSERT(wait_time > 0);
5954 5971                                  wait_time = cv_reltimedwait_sig(
5955 5972                                      &zone_destroy_cv, &zonehash_lock, wait_time,
5956 5973                                      TR_SEC);
5957 5974                                  if (wait_time > 0) {
5958 5975                                          /*
5959 5976                                           * A thread in zone_rele() or
5960 5977                                           * zone_cred_rele() signaled
5961 5978                                           * zone_destroy_cv before this thread's
5962 5979                                           * wait timed out.  The zone might have
5963 5980                                           * only one reference left; find out!
5964 5981                                           */
5965 5982                                          continue;
5966 5983                                  } else if (wait_time == 0) {
5967 5984                                          /* The thread's process was signaled. */
5968 5985                                          mutex_exit(&zonehash_lock);
5969 5986                                          return (set_errno(EINTR));
5970 5987                                  }
5971 5988  
5972 5989                                  /*
5973 5990                                   * The thread timed out while waiting on
5974 5991                                   * zone_destroy_cv.  Even though the thread
5975 5992                                   * timed out, it has to check whether another
5976 5993                                   * thread woke up from zone_destroy_cv and
5977 5994                                   * destroyed the zone.
5978 5995                                   *
5979 5996                                   * If the zone still exists and has more than
5980 5997                                   * one unreleased general-purpose reference,
5981 5998                                   * then log the zone's reference counts.
5982 5999                                   */
5983 6000                                  log_refcounts = B_TRUE;
5984 6001                                  continue;
5985 6002                          }
5986 6003  
5987 6004                          /*
5988 6005                           * The thread already timed out on zone_destroy_cv while
5989 6006                           * waiting for subsystems to release the zone's last
5990 6007                           * general-purpose references.  Log the zone's reference
5991 6008                           * counts and wait indefinitely on zone_destroy_cv.
5992 6009                           */
5993 6010                          zone_log_refcounts(zone);
5994 6011                  }
5995 6012                  if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
5996 6013                          /* The thread's process was signaled. */
5997 6014                          mutex_exit(&zonehash_lock);
5998 6015                          return (set_errno(EINTR));
5999 6016                  }
6000 6017          }
6001 6018  
6002 6019          /*
6003 6020           * Remove CPU cap for this zone now since we're not going to
6004 6021           * fail below this point.
6005 6022           */
6006 6023          cpucaps_zone_remove(zone);
6007 6024  
6008 6025          /* Get rid of the zone's kstats */
6009 6026          zone_kstat_delete(zone);
6010 6027  
6011 6028          /* remove the pfexecd doors */
6012 6029          if (zone->zone_pfexecd != NULL) {
6013 6030                  klpd_freelist(&zone->zone_pfexecd);
6014 6031                  zone->zone_pfexecd = NULL;
6015 6032          }
6016 6033  
6017 6034          /* free brand specific data */
6018 6035          if (ZONE_IS_BRANDED(zone))
6019 6036                  ZBROP(zone)->b_free_brand_data(zone);
6020 6037  
6021 6038          /* Say goodbye to brand framework. */
6022 6039          brand_unregister_zone(zone->zone_brand);
6023 6040  
6024 6041          /*
6025 6042           * It is now safe to let the zone be recreated; remove it from the
6026 6043           * lists.  The memory will not be freed until the last cred
6027 6044           * reference goes away.
6028 6045           */
6029 6046          ASSERT(zonecount > 1);  /* must be > 1; can't destroy global zone */
6030 6047          zonecount--;
6031 6048          /* remove from active list and hash tables */
6032 6049          list_remove(&zone_active, zone);
6033 6050          (void) mod_hash_destroy(zonehashbyname,
6034 6051              (mod_hash_key_t)zone->zone_name);
6035 6052          (void) mod_hash_destroy(zonehashbyid,
6036 6053              (mod_hash_key_t)(uintptr_t)zone->zone_id);

↓ open down ↓

1961 lines elided

↑ open up ↑

6037 6054          if (zone->zone_flags & ZF_HASHED_LABEL)
6038 6055                  (void) mod_hash_destroy(zonehashbylabel,
6039 6056                      (mod_hash_key_t)zone->zone_slabel);
6040 6057          mutex_exit(&zonehash_lock);
6041 6058  
6042 6059          /*
6043 6060           * Release the root vnode; we're not using it anymore.  Nor should any
6044 6061           * other thread that might access it exist.
6045 6062           */
6046 6063          if (zone->zone_rootvp != NULL) {
6047      -                VN_RELE(zone->zone_rootvp);
     6064 +                vnode_t *vp = zone->zone_rootvp;
     6065 +
     6066 +                mutex_enter(&vp->v_lock);
     6067 +                vp->v_flag &= ~VZONEROOT;
     6068 +                mutex_exit(&vp->v_lock);
     6069 +                VN_RELE(vp);
6048 6070                  zone->zone_rootvp = NULL;
6049 6071          }
6050 6072  
6051 6073          /* add to deathrow list */
6052 6074          mutex_enter(&zone_deathrow_lock);
6053 6075          list_insert_tail(&zone_deathrow, zone);
6054 6076          mutex_exit(&zone_deathrow_lock);
6055 6077  
6056 6078          /*
6057 6079           * Drop last reference (which was added by zsched()), this will

6058 6080           * free the zone unless there are outstanding cred references.
6059 6081           */
6060 6082          zone_rele(zone);
6061 6083          return (0);
6062 6084  }
6063 6085  
6064 6086  /*
6065 6087   * Systemcall entry point for zone_getattr(2).
6066 6088   */
6067 6089  static ssize_t
6068 6090  zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6069 6091  {
6070 6092          size_t size;
6071 6093          int error = 0, err;
6072 6094          zone_t *zone;
6073 6095          char *zonepath;
6074 6096          char *outstr;
6075 6097          zone_status_t zone_status;
6076 6098          pid_t initpid;
6077 6099          boolean_t global = (curzone == global_zone);
6078 6100          boolean_t inzone = (curzone->zone_id == zoneid);
6079 6101          ushort_t flags;
6080 6102          zone_net_data_t *zbuf;
6081 6103  
6082 6104          mutex_enter(&zonehash_lock);
6083 6105          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6084 6106                  mutex_exit(&zonehash_lock);
6085 6107                  return (set_errno(EINVAL));
6086 6108          }
6087 6109          zone_status = zone_status_get(zone);
6088 6110          if (zone_status < ZONE_IS_INITIALIZED) {
6089 6111                  mutex_exit(&zonehash_lock);
6090 6112                  return (set_errno(EINVAL));
6091 6113          }
6092 6114          zone_hold(zone);
6093 6115          mutex_exit(&zonehash_lock);
6094 6116  
6095 6117          /*
6096 6118           * If not in the global zone, don't show information about other zones,
6097 6119           * unless the system is labeled and the local zone's label dominates
6098 6120           * the other zone.
6099 6121           */
6100 6122          if (!zone_list_access(zone)) {
6101 6123                  zone_rele(zone);
6102 6124                  return (set_errno(EINVAL));
6103 6125          }
6104 6126  
6105 6127          switch (attr) {
6106 6128          case ZONE_ATTR_ROOT:
6107 6129                  if (global) {
6108 6130                          /*
6109 6131                           * Copy the path to trim the trailing "/" (except for
6110 6132                           * the global zone).
6111 6133                           */
6112 6134                          if (zone != global_zone)
6113 6135                                  size = zone->zone_rootpathlen - 1;
6114 6136                          else
6115 6137                                  size = zone->zone_rootpathlen;
6116 6138                          zonepath = kmem_alloc(size, KM_SLEEP);
6117 6139                          bcopy(zone->zone_rootpath, zonepath, size);
6118 6140                          zonepath[size - 1] = '\0';
6119 6141                  } else {
6120 6142                          if (inzone || !is_system_labeled()) {
6121 6143                                  /*
6122 6144                                   * Caller is not in the global zone.
6123 6145                                   * if the query is on the current zone
6124 6146                                   * or the system is not labeled,
6125 6147                                   * just return faked-up path for current zone.
6126 6148                                   */
6127 6149                                  zonepath = "/";
6128 6150                                  size = 2;
6129 6151                          } else {
6130 6152                                  /*
6131 6153                                   * Return related path for current zone.
6132 6154                                   */
6133 6155                                  int prefix_len = strlen(zone_prefix);
6134 6156                                  int zname_len = strlen(zone->zone_name);
6135 6157  
6136 6158                                  size = prefix_len + zname_len + 1;
6137 6159                                  zonepath = kmem_alloc(size, KM_SLEEP);
6138 6160                                  bcopy(zone_prefix, zonepath, prefix_len);
6139 6161                                  bcopy(zone->zone_name, zonepath +
6140 6162                                      prefix_len, zname_len);
6141 6163                                  zonepath[size - 1] = '\0';
6142 6164                          }
6143 6165                  }
6144 6166                  if (bufsize > size)
6145 6167                          bufsize = size;
6146 6168                  if (buf != NULL) {
6147 6169                          err = copyoutstr(zonepath, buf, bufsize, NULL);
6148 6170                          if (err != 0 && err != ENAMETOOLONG)
6149 6171                                  error = EFAULT;
6150 6172                  }
6151 6173                  if (global || (is_system_labeled() && !inzone))
6152 6174                          kmem_free(zonepath, size);
6153 6175                  break;
6154 6176  
6155 6177          case ZONE_ATTR_NAME:
6156 6178                  size = strlen(zone->zone_name) + 1;
6157 6179                  if (bufsize > size)
6158 6180                          bufsize = size;
6159 6181                  if (buf != NULL) {
6160 6182                          err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
6161 6183                          if (err != 0 && err != ENAMETOOLONG)
6162 6184                                  error = EFAULT;
6163 6185                  }
6164 6186                  break;
6165 6187  
6166 6188          case ZONE_ATTR_STATUS:
6167 6189                  /*
6168 6190                   * Since we're not holding zonehash_lock, the zone status
6169 6191                   * may be anything; leave it up to userland to sort it out.
6170 6192                   */
6171 6193                  size = sizeof (zone_status);
6172 6194                  if (bufsize > size)
6173 6195                          bufsize = size;
6174 6196                  zone_status = zone_status_get(zone);
6175 6197                  if (buf != NULL &&
6176 6198                      copyout(&zone_status, buf, bufsize) != 0)
6177 6199                          error = EFAULT;
6178 6200                  break;
6179 6201          case ZONE_ATTR_FLAGS:
6180 6202                  size = sizeof (zone->zone_flags);
6181 6203                  if (bufsize > size)
6182 6204                          bufsize = size;
6183 6205                  flags = zone->zone_flags;
6184 6206                  if (buf != NULL &&
6185 6207                      copyout(&flags, buf, bufsize) != 0)
6186 6208                          error = EFAULT;
6187 6209                  break;
6188 6210          case ZONE_ATTR_PRIVSET:
6189 6211                  size = sizeof (priv_set_t);
6190 6212                  if (bufsize > size)
6191 6213                          bufsize = size;
6192 6214                  if (buf != NULL &&
6193 6215                      copyout(zone->zone_privset, buf, bufsize) != 0)
6194 6216                          error = EFAULT;
6195 6217                  break;
6196 6218          case ZONE_ATTR_UNIQID:
6197 6219                  size = sizeof (zone->zone_uniqid);
6198 6220                  if (bufsize > size)
6199 6221                          bufsize = size;
6200 6222                  if (buf != NULL &&
6201 6223                      copyout(&zone->zone_uniqid, buf, bufsize) != 0)
6202 6224                          error = EFAULT;
6203 6225                  break;
6204 6226          case ZONE_ATTR_POOLID:
6205 6227                  {
6206 6228                          pool_t *pool;
6207 6229                          poolid_t poolid;
6208 6230  
6209 6231                          if (pool_lock_intr() != 0) {
6210 6232                                  error = EINTR;
6211 6233                                  break;
6212 6234                          }
6213 6235                          pool = zone_pool_get(zone);
6214 6236                          poolid = pool->pool_id;
6215 6237                          pool_unlock();
6216 6238                          size = sizeof (poolid);
6217 6239                          if (bufsize > size)
6218 6240                                  bufsize = size;
6219 6241                          if (buf != NULL && copyout(&poolid, buf, size) != 0)
6220 6242                                  error = EFAULT;
6221 6243                  }
6222 6244                  break;
6223 6245          case ZONE_ATTR_SLBL:
6224 6246                  size = sizeof (bslabel_t);
6225 6247                  if (bufsize > size)
6226 6248                          bufsize = size;
6227 6249                  if (zone->zone_slabel == NULL)
6228 6250                          error = EINVAL;
6229 6251                  else if (buf != NULL &&
6230 6252                      copyout(label2bslabel(zone->zone_slabel), buf,
6231 6253                      bufsize) != 0)
6232 6254                          error = EFAULT;
6233 6255                  break;
6234 6256          case ZONE_ATTR_INITPID:
6235 6257                  size = sizeof (initpid);
6236 6258                  if (bufsize > size)
6237 6259                          bufsize = size;
6238 6260                  initpid = zone->zone_proc_initpid;
6239 6261                  if (initpid == -1) {
6240 6262                          error = ESRCH;
6241 6263                          break;
6242 6264                  }
6243 6265                  if (buf != NULL &&
6244 6266                      copyout(&initpid, buf, bufsize) != 0)
6245 6267                          error = EFAULT;
6246 6268                  break;
6247 6269          case ZONE_ATTR_BRAND:
6248 6270                  size = strlen(zone->zone_brand->b_name) + 1;
6249 6271  
6250 6272                  if (bufsize > size)
6251 6273                          bufsize = size;
6252 6274                  if (buf != NULL) {
6253 6275                          err = copyoutstr(zone->zone_brand->b_name, buf,
6254 6276                              bufsize, NULL);
6255 6277                          if (err != 0 && err != ENAMETOOLONG)
6256 6278                                  error = EFAULT;
6257 6279                  }
6258 6280                  break;
6259 6281          case ZONE_ATTR_INITNAME:
6260 6282                  size = strlen(zone->zone_initname) + 1;
6261 6283                  if (bufsize > size)
6262 6284                          bufsize = size;
6263 6285                  if (buf != NULL) {
6264 6286                          err = copyoutstr(zone->zone_initname, buf, bufsize,
6265 6287                              NULL);
6266 6288                          if (err != 0 && err != ENAMETOOLONG)
6267 6289                                  error = EFAULT;
6268 6290                  }
6269 6291                  break;
6270 6292          case ZONE_ATTR_BOOTARGS:
6271 6293                  if (zone->zone_bootargs == NULL)
6272 6294                          outstr = "";
6273 6295                  else
6274 6296                          outstr = zone->zone_bootargs;
6275 6297                  size = strlen(outstr) + 1;
6276 6298                  if (bufsize > size)
6277 6299                          bufsize = size;
6278 6300                  if (buf != NULL) {
6279 6301                          err = copyoutstr(outstr, buf, bufsize, NULL);
6280 6302                          if (err != 0 && err != ENAMETOOLONG)
6281 6303                                  error = EFAULT;
6282 6304                  }
6283 6305                  break;
6284 6306          case ZONE_ATTR_SCHED_CLASS:
6285 6307                  mutex_enter(&class_lock);
6286 6308  
6287 6309                  if (zone->zone_defaultcid >= loaded_classes)
6288 6310                          outstr = "";
6289 6311                  else
6290 6312                          outstr = sclass[zone->zone_defaultcid].cl_name;
6291 6313                  size = strlen(outstr) + 1;
6292 6314                  if (bufsize > size)
6293 6315                          bufsize = size;
6294 6316                  if (buf != NULL) {
6295 6317                          err = copyoutstr(outstr, buf, bufsize, NULL);
6296 6318                          if (err != 0 && err != ENAMETOOLONG)
6297 6319                                  error = EFAULT;
6298 6320                  }
6299 6321  
6300 6322                  mutex_exit(&class_lock);
6301 6323                  break;
6302 6324          case ZONE_ATTR_HOSTID:
6303 6325                  if (zone->zone_hostid != HW_INVALID_HOSTID &&
6304 6326                      bufsize == sizeof (zone->zone_hostid)) {
6305 6327                          size = sizeof (zone->zone_hostid);
6306 6328                          if (buf != NULL && copyout(&zone->zone_hostid, buf,
6307 6329                              bufsize) != 0)
6308 6330                                  error = EFAULT;
6309 6331                  } else {
6310 6332                          error = EINVAL;
6311 6333                  }
6312 6334                  break;
6313 6335          case ZONE_ATTR_FS_ALLOWED:
6314 6336                  if (zone->zone_fs_allowed == NULL)
6315 6337                          outstr = "";
6316 6338                  else
6317 6339                          outstr = zone->zone_fs_allowed;
6318 6340                  size = strlen(outstr) + 1;
6319 6341                  if (bufsize > size)
6320 6342                          bufsize = size;
6321 6343                  if (buf != NULL) {
6322 6344                          err = copyoutstr(outstr, buf, bufsize, NULL);
6323 6345                          if (err != 0 && err != ENAMETOOLONG)
6324 6346                                  error = EFAULT;
6325 6347                  }
6326 6348                  break;
6327 6349          case ZONE_ATTR_SECFLAGS:
6328 6350                  size = sizeof (zone->zone_secflags);
6329 6351                  if (bufsize > size)
6330 6352                          bufsize = size;
6331 6353                  if ((err = copyout(&zone->zone_secflags, buf, bufsize)) != 0)
6332 6354                          error = EFAULT;
6333 6355                  break;
6334 6356          case ZONE_ATTR_NETWORK:
6335 6357                  bufsize = MIN(bufsize, PIPE_BUF + sizeof (zone_net_data_t));
6336 6358                  size = bufsize;
6337 6359                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
6338 6360                  if (copyin(buf, zbuf, bufsize) != 0) {
6339 6361                          error = EFAULT;
6340 6362                  } else {
6341 6363                          error = zone_get_network(zoneid, zbuf);
6342 6364                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
6343 6365                                  error = EFAULT;
6344 6366                  }
6345 6367                  kmem_free(zbuf, bufsize);
6346 6368                  break;
6347 6369          case ZONE_ATTR_DID:
6348 6370                  size = sizeof (zoneid_t);
6349 6371                  if (bufsize > size)
6350 6372                          bufsize = size;
6351 6373  
6352 6374                  if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
6353 6375                          error = EFAULT;
6354 6376                  break;
6355 6377          case ZONE_ATTR_SCHED_FIXEDHI:
6356 6378                  size = sizeof (boolean_t);
6357 6379                  if (bufsize > size)
6358 6380                          bufsize = size;
6359 6381  
6360 6382                  if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6361 6383                      bufsize) != 0)
6362 6384                          error = EFAULT;
6363 6385                  break;
6364 6386          default:
6365 6387                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6366 6388                          size = bufsize;
6367 6389                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6368 6390                  } else {
6369 6391                          error = EINVAL;
6370 6392                  }
6371 6393          }
6372 6394          zone_rele(zone);
6373 6395  
6374 6396          if (error)
6375 6397                  return (set_errno(error));
6376 6398          return ((ssize_t)size);
6377 6399  }
6378 6400  
6379 6401  /*
6380 6402   * Systemcall entry point for zone_setattr(2).
6381 6403   */
6382 6404  /*ARGSUSED*/
6383 6405  static int
6384 6406  zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6385 6407  {
6386 6408          zone_t *zone;
6387 6409          zone_status_t zone_status;
6388 6410          int err = -1;
6389 6411          zone_net_data_t *zbuf;
6390 6412  
6391 6413          if (secpolicy_zone_config(CRED()) != 0)
6392 6414                  return (set_errno(EPERM));
6393 6415  
6394 6416          /*
6395 6417           * No attributes can be set on the global zone.
6396 6418           */
6397 6419          if (zoneid == GLOBAL_ZONEID) {
6398 6420                  return (set_errno(EINVAL));
6399 6421          }
6400 6422  
6401 6423          mutex_enter(&zonehash_lock);
6402 6424          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6403 6425                  mutex_exit(&zonehash_lock);
6404 6426                  return (set_errno(EINVAL));
6405 6427          }
6406 6428          zone_hold(zone);
6407 6429          mutex_exit(&zonehash_lock);
6408 6430  
6409 6431          /*
6410 6432           * At present attributes can only be set on non-running,
6411 6433           * non-global zones.
6412 6434           */
6413 6435          zone_status = zone_status_get(zone);
6414 6436          if (zone_status > ZONE_IS_READY) {
6415 6437                  err = EINVAL;
6416 6438                  goto done;
6417 6439          }
6418 6440  
6419 6441          switch (attr) {
6420 6442          case ZONE_ATTR_INITNAME:
6421 6443                  err = zone_set_initname(zone, (const char *)buf);
6422 6444                  break;
6423 6445          case ZONE_ATTR_INITNORESTART:
6424 6446                  zone->zone_restart_init = B_FALSE;
6425 6447                  err = 0;
6426 6448                  break;
6427 6449          case ZONE_ATTR_INITRESTART0:
6428 6450                  zone->zone_restart_init_0 = B_TRUE;
6429 6451                  err = 0;
6430 6452                  break;
6431 6453          case ZONE_ATTR_INITREBOOT:
6432 6454                  zone->zone_reboot_on_init_exit = B_TRUE;
6433 6455                  err = 0;
6434 6456                  break;
6435 6457          case ZONE_ATTR_BOOTARGS:
6436 6458                  err = zone_set_bootargs(zone, (const char *)buf);
6437 6459                  break;
6438 6460          case ZONE_ATTR_BRAND:
6439 6461                  err = zone_set_brand(zone, (const char *)buf);
6440 6462                  break;
6441 6463          case ZONE_ATTR_FS_ALLOWED:
6442 6464                  err = zone_set_fs_allowed(zone, (const char *)buf);
6443 6465                  break;
6444 6466          case ZONE_ATTR_SECFLAGS:
6445 6467                  err = zone_set_secflags(zone, (psecflags_t *)buf);
6446 6468                  break;
6447 6469          case ZONE_ATTR_SCHED_CLASS:
6448 6470                  err = zone_set_sched_class(zone, (const char *)buf);
6449 6471                  break;
6450 6472          case ZONE_ATTR_HOSTID:
6451 6473                  if (bufsize == sizeof (zone->zone_hostid)) {
6452 6474                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
6453 6475                                  err = 0;
6454 6476                          else
6455 6477                                  err = EFAULT;
6456 6478                  } else {
6457 6479                          err = EINVAL;
6458 6480                  }
6459 6481                  break;
6460 6482          case ZONE_ATTR_NETWORK:
6461 6483                  if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
6462 6484                          err = EINVAL;
6463 6485                          break;
6464 6486                  }
6465 6487                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
6466 6488                  if (copyin(buf, zbuf, bufsize) != 0) {
6467 6489                          kmem_free(zbuf, bufsize);
6468 6490                          err = EFAULT;
6469 6491                          break;
6470 6492                  }
6471 6493                  err = zone_set_network(zoneid, zbuf);
6472 6494                  kmem_free(zbuf, bufsize);
6473 6495                  break;
6474 6496          case ZONE_ATTR_APP_SVC_CT:
6475 6497                  if (bufsize != sizeof (boolean_t)) {
6476 6498                          err = EINVAL;
6477 6499                  } else {
6478 6500                          zone->zone_setup_app_contract = (boolean_t)buf;
6479 6501                          err = 0;
6480 6502                  }
6481 6503                  break;
6482 6504          case ZONE_ATTR_SCHED_FIXEDHI:
6483 6505                  if (bufsize != sizeof (boolean_t)) {
6484 6506                          err = EINVAL;
6485 6507                  } else {
6486 6508                          zone->zone_fixed_hipri = (boolean_t)buf;
6487 6509                          err = 0;
6488 6510                  }
6489 6511                  break;
6490 6512          default:
6491 6513                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
6492 6514                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
6493 6515                  else
6494 6516                          err = EINVAL;
6495 6517          }
6496 6518  
6497 6519  done:
6498 6520          zone_rele(zone);
6499 6521          ASSERT(err != -1);
6500 6522          return (err != 0 ? set_errno(err) : 0);
6501 6523  }
6502 6524  
6503 6525  /*
6504 6526   * Return zero if the process has at least one vnode mapped in to its
6505 6527   * address space which shouldn't be allowed to change zones.
6506 6528   *
6507 6529   * Also return zero if the process has any shared mappings which reserve
6508 6530   * swap.  This is because the counting for zone.max-swap does not allow swap
6509 6531   * reservation to be shared between zones.  zone swap reservation is counted
6510 6532   * on zone->zone_max_swap.
6511 6533   */
6512 6534  static int
6513 6535  as_can_change_zones(void)
6514 6536  {
6515 6537          proc_t *pp = curproc;
6516 6538          struct seg *seg;
6517 6539          struct as *as = pp->p_as;
6518 6540          vnode_t *vp;
6519 6541          int allow = 1;
6520 6542  
6521 6543          ASSERT(pp->p_as != &kas);
6522 6544          AS_LOCK_ENTER(as, RW_READER);
6523 6545          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
6524 6546  
6525 6547                  /*
6526 6548                   * Cannot enter zone with shared anon memory which
6527 6549                   * reserves swap.  See comment above.
6528 6550                   */
6529 6551                  if (seg_can_change_zones(seg) == B_FALSE) {
6530 6552                          allow = 0;
6531 6553                          break;
6532 6554                  }
6533 6555                  /*
6534 6556                   * if we can't get a backing vnode for this segment then skip
6535 6557                   * it.
6536 6558                   */
6537 6559                  vp = NULL;
6538 6560                  if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
6539 6561                          continue;
6540 6562                  if (!vn_can_change_zones(vp)) { /* bail on first match */
6541 6563                          allow = 0;
6542 6564                          break;
6543 6565                  }
6544 6566          }
6545 6567          AS_LOCK_EXIT(as);
6546 6568          return (allow);
6547 6569  }
6548 6570  
6549 6571  /*
6550 6572   * Count swap reserved by curproc's address space
6551 6573   */
6552 6574  static size_t
6553 6575  as_swresv(void)
6554 6576  {
6555 6577          proc_t *pp = curproc;
6556 6578          struct seg *seg;
6557 6579          struct as *as = pp->p_as;
6558 6580          size_t swap = 0;
6559 6581  
6560 6582          ASSERT(pp->p_as != &kas);
6561 6583          ASSERT(AS_WRITE_HELD(as));
6562 6584          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
6563 6585                  swap += seg_swresv(seg);
6564 6586  
6565 6587          return (swap);
6566 6588  }
6567 6589  
6568 6590  /*
6569 6591   * Systemcall entry point for zone_enter().
6570 6592   *
6571 6593   * The current process is injected into said zone.  In the process
6572 6594   * it will change its project membership, privileges, rootdir/cwd,
6573 6595   * zone-wide rctls, and pool association to match those of the zone.
6574 6596   *
6575 6597   * The first zone_enter() called while the zone is in the ZONE_IS_READY
6576 6598   * state will transition it to ZONE_IS_RUNNING.  Processes may only
6577 6599   * enter a zone that is "ready" or "running".
6578 6600   */
6579 6601  static int
6580 6602  zone_enter(zoneid_t zoneid)
6581 6603  {
6582 6604          zone_t *zone;
6583 6605          vnode_t *vp;
6584 6606          proc_t *pp = curproc;
6585 6607          contract_t *ct;
6586 6608          cont_process_t *ctp;
6587 6609          task_t *tk, *oldtk;
6588 6610          kproject_t *zone_proj0;
6589 6611          cred_t *cr, *newcr;
6590 6612          pool_t *oldpool, *newpool;
6591 6613          sess_t *sp;
6592 6614          uid_t uid;
6593 6615          zone_status_t status;
6594 6616          int err = 0;
6595 6617          rctl_entity_p_t e;
6596 6618          size_t swap;
6597 6619          kthread_id_t t;
6598 6620  
6599 6621          if (secpolicy_zone_config(CRED()) != 0)
6600 6622                  return (set_errno(EPERM));
6601 6623          if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
6602 6624                  return (set_errno(EINVAL));
6603 6625  
6604 6626          /*
6605 6627           * Stop all lwps so we don't need to hold a lock to look at
6606 6628           * curproc->p_zone.  This needs to happen before we grab any
6607 6629           * locks to avoid deadlock (another lwp in the process could
6608 6630           * be waiting for the held lock).
6609 6631           */
6610 6632          if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
6611 6633                  return (set_errno(EINTR));
6612 6634  
6613 6635          /*
6614 6636           * Make sure we're not changing zones with files open or mapped in
6615 6637           * to our address space which shouldn't be changing zones.
6616 6638           */
6617 6639          if (!files_can_change_zones()) {
6618 6640                  err = EBADF;
6619 6641                  goto out;
6620 6642          }
6621 6643          if (!as_can_change_zones()) {
6622 6644                  err = EFAULT;
6623 6645                  goto out;
6624 6646          }
6625 6647  
6626 6648          mutex_enter(&zonehash_lock);
6627 6649          if (pp->p_zone != global_zone) {
6628 6650                  mutex_exit(&zonehash_lock);
6629 6651                  err = EINVAL;
6630 6652                  goto out;
6631 6653          }
6632 6654  
6633 6655          zone = zone_find_all_by_id(zoneid);
6634 6656          if (zone == NULL) {
6635 6657                  mutex_exit(&zonehash_lock);
6636 6658                  err = EINVAL;
6637 6659                  goto out;
6638 6660          }
6639 6661  
6640 6662          /*
6641 6663           * To prevent processes in a zone from holding contracts on
6642 6664           * extrazonal resources, and to avoid process contract
6643 6665           * memberships which span zones, contract holders and processes
6644 6666           * which aren't the sole members of their encapsulating process
6645 6667           * contracts are not allowed to zone_enter.
6646 6668           */
6647 6669          ctp = pp->p_ct_process;
6648 6670          ct = &ctp->conp_contract;
6649 6671          mutex_enter(&ct->ct_lock);
6650 6672          mutex_enter(&pp->p_lock);
6651 6673          if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
6652 6674                  mutex_exit(&pp->p_lock);
6653 6675                  mutex_exit(&ct->ct_lock);
6654 6676                  mutex_exit(&zonehash_lock);
6655 6677                  err = EINVAL;
6656 6678                  goto out;
6657 6679          }
6658 6680  
6659 6681          /*
6660 6682           * Moreover, we don't allow processes whose encapsulating
6661 6683           * process contracts have inherited extrazonal contracts.
6662 6684           * While it would be easier to eliminate all process contracts
6663 6685           * with inherited contracts, we need to be able to give a
6664 6686           * restarted init (or other zone-penetrating process) its
6665 6687           * predecessor's contracts.
6666 6688           */
6667 6689          if (ctp->conp_ninherited != 0) {
6668 6690                  contract_t *next;
6669 6691                  for (next = list_head(&ctp->conp_inherited); next;
6670 6692                      next = list_next(&ctp->conp_inherited, next)) {
6671 6693                          if (contract_getzuniqid(next) != zone->zone_uniqid) {
6672 6694                                  mutex_exit(&pp->p_lock);
6673 6695                                  mutex_exit(&ct->ct_lock);
6674 6696                                  mutex_exit(&zonehash_lock);
6675 6697                                  err = EINVAL;
6676 6698                                  goto out;
6677 6699                          }
6678 6700                  }
6679 6701          }
6680 6702  
6681 6703          mutex_exit(&pp->p_lock);
6682 6704          mutex_exit(&ct->ct_lock);
6683 6705  
6684 6706          status = zone_status_get(zone);
6685 6707          if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
6686 6708                  /*
6687 6709                   * Can't join
6688 6710                   */
6689 6711                  mutex_exit(&zonehash_lock);
6690 6712                  err = EINVAL;
6691 6713                  goto out;
6692 6714          }
6693 6715  
6694 6716          /*
6695 6717           * Make sure new priv set is within the permitted set for caller
6696 6718           */
6697 6719          if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
6698 6720                  mutex_exit(&zonehash_lock);
6699 6721                  err = EPERM;
6700 6722                  goto out;
6701 6723          }
6702 6724          /*
6703 6725           * We want to momentarily drop zonehash_lock while we optimistically
6704 6726           * bind curproc to the pool it should be running in.  This is safe
6705 6727           * since the zone can't disappear (we have a hold on it).
6706 6728           */
6707 6729          zone_hold(zone);
6708 6730          mutex_exit(&zonehash_lock);
6709 6731  
6710 6732          /*
6711 6733           * Grab pool_lock to keep the pools configuration from changing
6712 6734           * and to stop ourselves from getting rebound to another pool
6713 6735           * until we join the zone.
6714 6736           */
6715 6737          if (pool_lock_intr() != 0) {
6716 6738                  zone_rele(zone);
6717 6739                  err = EINTR;
6718 6740                  goto out;
6719 6741          }
6720 6742          ASSERT(secpolicy_pool(CRED()) == 0);
6721 6743          /*
6722 6744           * Bind ourselves to the pool currently associated with the zone.
6723 6745           */
6724 6746          oldpool = curproc->p_pool;
6725 6747          newpool = zone_pool_get(zone);
6726 6748          if (pool_state == POOL_ENABLED && newpool != oldpool &&
6727 6749              (err = pool_do_bind(newpool, P_PID, P_MYID,
6728 6750              POOL_BIND_ALL)) != 0) {
6729 6751                  pool_unlock();
6730 6752                  zone_rele(zone);
6731 6753                  goto out;
6732 6754          }
6733 6755  
6734 6756          /*
6735 6757           * Grab cpu_lock now; we'll need it later when we call
6736 6758           * task_join().
6737 6759           */
6738 6760          mutex_enter(&cpu_lock);
6739 6761          mutex_enter(&zonehash_lock);
6740 6762          /*
6741 6763           * Make sure the zone hasn't moved on since we dropped zonehash_lock.
6742 6764           */
6743 6765          if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
6744 6766                  /*
6745 6767                   * Can't join anymore.
6746 6768                   */
6747 6769                  mutex_exit(&zonehash_lock);
6748 6770                  mutex_exit(&cpu_lock);
6749 6771                  if (pool_state == POOL_ENABLED &&
6750 6772                      newpool != oldpool)
6751 6773                          (void) pool_do_bind(oldpool, P_PID, P_MYID,
6752 6774                              POOL_BIND_ALL);
6753 6775                  pool_unlock();
6754 6776                  zone_rele(zone);
6755 6777                  err = EINVAL;
6756 6778                  goto out;
6757 6779          }
6758 6780  
6759 6781          /*
6760 6782           * a_lock must be held while transferring locked memory and swap
6761 6783           * reservation from the global zone to the non global zone because
6762 6784           * asynchronous faults on the processes' address space can lock
6763 6785           * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
6764 6786           * segments respectively.
6765 6787           */
6766 6788          AS_LOCK_ENTER(pp->p_as, RW_WRITER);
6767 6789          swap = as_swresv();
6768 6790          mutex_enter(&pp->p_lock);
6769 6791          zone_proj0 = zone->zone_zsched->p_task->tk_proj;
6770 6792          /* verify that we do not exceed any task or lwp limits */
6771 6793          mutex_enter(&zone->zone_nlwps_lock);
6772 6794          /* add new lwps to zone and zone's proj0 */
6773 6795          zone_proj0->kpj_nlwps += pp->p_lwpcnt;
6774 6796          zone->zone_nlwps += pp->p_lwpcnt;
6775 6797          /* add 1 task to zone's proj0 */
6776 6798          zone_proj0->kpj_ntasks += 1;
6777 6799  
6778 6800          zone_proj0->kpj_nprocs++;
6779 6801          zone->zone_nprocs++;
6780 6802          mutex_exit(&zone->zone_nlwps_lock);
6781 6803  
6782 6804          mutex_enter(&zone->zone_mem_lock);
6783 6805          zone->zone_locked_mem += pp->p_locked_mem;
6784 6806          zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
6785 6807          zone->zone_max_swap += swap;
6786 6808          mutex_exit(&zone->zone_mem_lock);
6787 6809  
6788 6810          mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
6789 6811          zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
6790 6812          mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
6791 6813  
6792 6814          /* remove lwps and process from proc's old zone and old project */
6793 6815          mutex_enter(&pp->p_zone->zone_nlwps_lock);
6794 6816          pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
6795 6817          pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
6796 6818          pp->p_task->tk_proj->kpj_nprocs--;
6797 6819          pp->p_zone->zone_nprocs--;
6798 6820          mutex_exit(&pp->p_zone->zone_nlwps_lock);
6799 6821  
6800 6822          mutex_enter(&pp->p_zone->zone_mem_lock);
6801 6823          pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
6802 6824          pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
6803 6825          pp->p_zone->zone_max_swap -= swap;
6804 6826          mutex_exit(&pp->p_zone->zone_mem_lock);
6805 6827  
6806 6828          mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6807 6829          pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
6808 6830          mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
6809 6831  
6810 6832          pp->p_flag |= SZONETOP;
6811 6833          pp->p_zone = zone;
6812 6834          mutex_exit(&pp->p_lock);
6813 6835          AS_LOCK_EXIT(pp->p_as);
6814 6836  
6815 6837          /*
6816 6838           * Joining the zone cannot fail from now on.
6817 6839           *
6818 6840           * This means that a lot of the following code can be commonized and
6819 6841           * shared with zsched().
6820 6842           */
6821 6843  
6822 6844          /*
6823 6845           * If the process contract fmri was inherited, we need to
6824 6846           * flag this so that any contract status will not leak
6825 6847           * extra zone information, svc_fmri in this case
6826 6848           */
6827 6849          if (ctp->conp_svc_ctid != ct->ct_id) {
6828 6850                  mutex_enter(&ct->ct_lock);
6829 6851                  ctp->conp_svc_zone_enter = ct->ct_id;
6830 6852                  mutex_exit(&ct->ct_lock);
6831 6853          }
6832 6854  
6833 6855          /*
6834 6856           * Reset the encapsulating process contract's zone.
6835 6857           */
6836 6858          ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
6837 6859          contract_setzuniqid(ct, zone->zone_uniqid);
6838 6860  
6839 6861          /*
6840 6862           * Create a new task and associate the process with the project keyed
6841 6863           * by (projid,zoneid).
6842 6864           *
6843 6865           * We might as well be in project 0; the global zone's projid doesn't
6844 6866           * make much sense in a zone anyhow.
6845 6867           *
6846 6868           * This also increments zone_ntasks, and returns with p_lock held.
6847 6869           */
6848 6870          tk = task_create(0, zone);
6849 6871          oldtk = task_join(tk, 0);
6850 6872          mutex_exit(&cpu_lock);
6851 6873  
6852 6874          /*
6853 6875           * call RCTLOP_SET functions on this proc
6854 6876           */
6855 6877          e.rcep_p.zone = zone;
6856 6878          e.rcep_t = RCENTITY_ZONE;
6857 6879          (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
6858 6880              RCD_CALLBACK);
6859 6881          mutex_exit(&pp->p_lock);
6860 6882  
6861 6883          /*
6862 6884           * We don't need to hold any of zsched's locks here; not only do we know
6863 6885           * the process and zone aren't going away, we know its session isn't
6864 6886           * changing either.
6865 6887           *
6866 6888           * By joining zsched's session here, we mimic the behavior in the
6867 6889           * global zone of init's sid being the pid of sched.  We extend this
6868 6890           * to all zlogin-like zone_enter()'ing processes as well.
6869 6891           */
6870 6892          mutex_enter(&pidlock);
6871 6893          sp = zone->zone_zsched->p_sessp;
6872 6894          sess_hold(zone->zone_zsched);
6873 6895          mutex_enter(&pp->p_lock);
6874 6896          pgexit(pp);
6875 6897          sess_rele(pp->p_sessp, B_TRUE);
6876 6898          pp->p_sessp = sp;
6877 6899          pgjoin(pp, zone->zone_zsched->p_pidp);
6878 6900  
6879 6901          /*
6880 6902           * If any threads are scheduled to be placed on zone wait queue they
6881 6903           * should abandon the idea since the wait queue is changing.
6882 6904           * We need to be holding pidlock & p_lock to do this.
6883 6905           */
6884 6906          if ((t = pp->p_tlist) != NULL) {
6885 6907                  do {
6886 6908                          thread_lock(t);
6887 6909                          /*
6888 6910                           * Kick this thread so that it doesn't sit
6889 6911                           * on a wrong wait queue.
6890 6912                           */
6891 6913                          if (ISWAITING(t))
6892 6914                                  setrun_locked(t);
6893 6915  
6894 6916                          if (t->t_schedflag & TS_ANYWAITQ)
6895 6917                                  t->t_schedflag &= ~ TS_ANYWAITQ;
6896 6918  
6897 6919                          thread_unlock(t);
6898 6920                  } while ((t = t->t_forw) != pp->p_tlist);
6899 6921          }
6900 6922  
6901 6923          /*
6902 6924           * If there is a default scheduling class for the zone and it is not
6903 6925           * the class we are currently in, change all of the threads in the
6904 6926           * process to the new class.  We need to be holding pidlock & p_lock
6905 6927           * when we call parmsset so this is a good place to do it.
6906 6928           */
6907 6929          if (zone->zone_defaultcid > 0 &&
6908 6930              zone->zone_defaultcid != curthread->t_cid) {
6909 6931                  pcparms_t pcparms;
6910 6932  
6911 6933                  pcparms.pc_cid = zone->zone_defaultcid;
6912 6934                  pcparms.pc_clparms[0] = 0;
6913 6935  
6914 6936                  /*
6915 6937                   * If setting the class fails, we still want to enter the zone.
6916 6938                   */
6917 6939                  if ((t = pp->p_tlist) != NULL) {
6918 6940                          do {
6919 6941                                  (void) parmsset(&pcparms, t);
6920 6942                          } while ((t = t->t_forw) != pp->p_tlist);
6921 6943                  }
6922 6944          }
6923 6945  
6924 6946          mutex_exit(&pp->p_lock);
6925 6947          mutex_exit(&pidlock);
6926 6948  
6927 6949          mutex_exit(&zonehash_lock);
6928 6950          /*
6929 6951           * We're firmly in the zone; let pools progress.
6930 6952           */
6931 6953          pool_unlock();
6932 6954          task_rele(oldtk);
6933 6955          /*
6934 6956           * We don't need to retain a hold on the zone since we already
6935 6957           * incremented zone_ntasks, so the zone isn't going anywhere.
6936 6958           */
6937 6959          zone_rele(zone);
6938 6960  
6939 6961          /*
6940 6962           * Chroot
6941 6963           */
6942 6964          vp = zone->zone_rootvp;
6943 6965          zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
6944 6966          zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
6945 6967  
6946 6968          /*
6947 6969           * Change process security flags.  Note that the _effective_ flags
6948 6970           * cannot change
6949 6971           */
6950 6972          secflags_copy(&pp->p_secflags.psf_lower,
6951 6973              &zone->zone_secflags.psf_lower);
6952 6974          secflags_copy(&pp->p_secflags.psf_upper,
6953 6975              &zone->zone_secflags.psf_upper);
6954 6976          secflags_copy(&pp->p_secflags.psf_inherit,
6955 6977              &zone->zone_secflags.psf_inherit);
6956 6978  
6957 6979          /*
6958 6980           * Change process credentials
6959 6981           */
6960 6982          newcr = cralloc();
6961 6983          mutex_enter(&pp->p_crlock);
6962 6984          cr = pp->p_cred;
6963 6985          crcopy_to(cr, newcr);
6964 6986          crsetzone(newcr, zone);
6965 6987          pp->p_cred = newcr;
6966 6988  
6967 6989          /*
6968 6990           * Restrict all process privilege sets to zone limit
6969 6991           */
6970 6992          priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
6971 6993          priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
6972 6994          priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
6973 6995          priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
6974 6996          mutex_exit(&pp->p_crlock);
6975 6997          crset(pp, newcr);
6976 6998  
6977 6999          /*
6978 7000           * Adjust upcount to reflect zone entry.
6979 7001           */
6980 7002          uid = crgetruid(newcr);
6981 7003          mutex_enter(&pidlock);
6982 7004          upcount_dec(uid, GLOBAL_ZONEID);
6983 7005          upcount_inc(uid, zoneid);
6984 7006          mutex_exit(&pidlock);
6985 7007  
6986 7008          /*
6987 7009           * Set up core file path and content.
6988 7010           */
6989 7011          set_core_defaults();
6990 7012  
6991 7013  out:
6992 7014          /*
6993 7015           * Let the other lwps continue.
6994 7016           */
6995 7017          mutex_enter(&pp->p_lock);
6996 7018          if (curthread != pp->p_agenttp)
6997 7019                  continuelwps(pp);
6998 7020          mutex_exit(&pp->p_lock);
6999 7021  
7000 7022          return (err != 0 ? set_errno(err) : 0);
7001 7023  }
7002 7024  
7003 7025  /*
7004 7026   * Systemcall entry point for zone_list(2).
7005 7027   *
7006 7028   * Processes running in a (non-global) zone only see themselves.
7007 7029   * On labeled systems, they see all zones whose label they dominate.
            *
            * zoneidlist: user buffer that receives the zone IDs.  May be NULL, in
            *     which case only the count is returned.
            * numzones: user in/out count.  On entry it is the capacity of
            *     zoneidlist in entries; on return it holds the number of zones
            *     found, which may be larger than the number of IDs copied out.
            *
            * Returns 0 on success, or set_errno(EFAULT) if a copyin/copyout fails.
7008 7030   */
7009 7031  static int
7010 7032  zone_list(zoneid_t *zoneidlist, uint_t *numzones)
7011 7033  {
7012 7034          zoneid_t *zoneids;
7013 7035          zone_t *zone, *myzone;
7014 7036          uint_t user_nzones, real_nzones;
7015 7037          uint_t domi_nzones;
7016 7038          int error;
7017 7039  
                   /* Fetch the caller's buffer capacity (in entries). */
7018 7040          if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
7019 7041                  return (set_errno(EFAULT));
7020 7042  
                   /*
                    * Build a kernel-side array of visible zone IDs.  real_nzones is
                    * the number of entries allocated (and later freed); domi_nzones
                    * is the number of entries actually filled in.
                    */
7021 7043          myzone = curproc->p_zone;
7022 7044          if (myzone != global_zone) {
7023 7045                  bslabel_t *mybslab;
7024 7046  
7025 7047                  if (!is_system_labeled()) {
7026 7048                          /* just return current zone */
7027 7049                          real_nzones = domi_nzones = 1;
7028 7050                          zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
7029 7051                          zoneids[0] = myzone->zone_id;
7030 7052                  } else {
7031 7053                          /* return all zones that are dominated */
7032 7054                          mutex_enter(&zonehash_lock);
7033 7055                          real_nzones = zonecount;
7034 7056                          domi_nzones = 0;
7035 7057                          if (real_nzones > 0) {
7036 7058                                  zoneids = kmem_alloc(real_nzones *
7037 7059                                      sizeof (zoneid_t), KM_SLEEP);
7038 7060                                  mybslab = label2bslabel(myzone->zone_slabel);
7039 7061                                  for (zone = list_head(&zone_active);
7040 7062                                      zone != NULL;
7041 7063                                      zone = list_next(&zone_active, zone)) {
                                                   /* Never report the global zone. */
7042 7064                                          if (zone->zone_id == GLOBAL_ZONEID)
7043 7065                                                  continue;
                                                   /* Skip scratch zones not our own. */
7044 7066                                          if (zone != myzone &&
7045 7067                                              (zone->zone_flags & ZF_IS_SCRATCH))
7046 7068                                                  continue;
7047 7069                                          /*
7048 7070                                           * Note that a label always dominates
7049 7071                                           * itself, so myzone is always included
7050 7072                                           * in the list.
7051 7073                                           */
7052 7074                                          if (bldominates(mybslab,
7053 7075                                              label2bslabel(zone->zone_slabel))) {
7054 7076                                                  zoneids[domi_nzones++] =
7055 7077                                                      zone->zone_id;
7056 7078                                          }
7057 7079                                  }
7058 7080                          }
7059 7081                          mutex_exit(&zonehash_lock);
7060 7082                  }
7061 7083          } else {
                           /* Global-zone caller: every zone on zone_active is listed. */
7062 7084                  mutex_enter(&zonehash_lock);
7063 7085                  real_nzones = zonecount;
7064 7086                  domi_nzones = 0;
7065 7087                  if (real_nzones > 0) {
7066 7088                          zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
7067 7089                              KM_SLEEP);
7068 7090                          for (zone = list_head(&zone_active); zone != NULL;
7069 7091                              zone = list_next(&zone_active, zone))
7070 7092                                  zoneids[domi_nzones++] = zone->zone_id;
7071 7093                          ASSERT(domi_nzones == real_nzones);
7072 7094                  }
7073 7095                  mutex_exit(&zonehash_lock);
7074 7096          }
7075 7097  
7076 7098          /*
7077 7099           * If user has allocated space for fewer entries than we found, then
7078 7100           * return only up to their limit.  Either way, tell them exactly how
7079 7101           * many we found.
7080 7102           */
7081 7103          if (domi_nzones < user_nzones)
7082 7104                  user_nzones = domi_nzones;
7083 7105          error = 0;
7084 7106          if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
7085 7107                  error = EFAULT;
7086 7108          } else if (zoneidlist != NULL && user_nzones != 0) {
7087 7109                  if (copyout(zoneids, zoneidlist,
7088 7110                      user_nzones * sizeof (zoneid_t)) != 0)
7089 7111                          error = EFAULT;
7090 7112          }
7091 7113  
                   /*
                    * The array was sized by real_nzones, not domi_nzones, so it must
                    * be freed with that size.  Nothing was allocated when
                    * real_nzones is 0.
                    */
7092 7114          if (real_nzones > 0)
7093 7115                  kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
7094 7116  
7095 7117          if (error != 0)
7096 7118                  return (set_errno(error));
7097 7119          else
7098 7120                  return (0);
7099 7121  }
7100 7122  
7101 7123  /*
7102 7124   * Systemcall entry point for zone_lookup(2).
7103 7125   *
7104 7126   * Non-global zones are only able to see themselves and (on labeled systems)
7105 7127   * the zones they dominate.
            *
            * zone_name: user pointer to a NUL-terminated zone name of at most
            *     ZONENAME_MAX bytes, or NULL to ask for the caller's own zone.
            *
            * Returns the zone ID on success.  On failure returns set_errno() of the
            * copyinstr() error, or of EINVAL when the name does not match a zone,
            * the zone has not reached ZONE_IS_READY, or zone_list_access() denies it.
7106 7128   */
7107 7129  static zoneid_t
7108 7130  zone_lookup(const char *zone_name)
7109 7131  {
7110 7132          char *kname;
7111 7133          zone_t *zone;
7112 7134          zoneid_t zoneid;
7113 7135          int err;
7114 7136  
7115 7137          if (zone_name == NULL) {
7116 7138                  /* return caller's zone id */
7117 7139                  return (getzoneid());
7118 7140          }
7119 7141  
                   /* Copy the name into a kernel buffer before looking it up. */
7120 7142          kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
7121 7143          if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
7122 7144                  kmem_free(kname, ZONENAME_MAX);
7123 7145                  return (set_errno(err));
7124 7146          }
7125 7147  
7126 7148          mutex_enter(&zonehash_lock);
7127 7149          zone = zone_find_all_by_name(kname);
                   /* The name is not needed once the lookup is done. */
7128 7150          kmem_free(kname, ZONENAME_MAX);
7129 7151          /*
7130 7152           * In a non-global zone, can only lookup global and own name.
7131 7153           * In Trusted Extensions zone label dominance rules apply.
7132 7154           */
7133 7155          if (zone == NULL ||
7134 7156              zone_status_get(zone) < ZONE_IS_READY ||
7135 7157              !zone_list_access(zone)) {
7136 7158                  mutex_exit(&zonehash_lock);
7137 7159                  return (set_errno(EINVAL));
7138 7160          } else {
                           /* Read the id while zonehash_lock is still held. */
7139 7161                  zoneid = zone->zone_id;
7140 7162                  mutex_exit(&zonehash_lock);
7141 7163                  return (zoneid);
7142 7164          }
7143 7165  }
7144 7166  
           /*
            * Handler for the ZONE_VERSION subcommand: copies
            * ZONE_SYSCALL_API_VERSION out to the user int that version_arg points to.
            *
            * Returns 0 on success, or set_errno(EFAULT) if the copyout fails.
            */
7145 7167  static int
7146 7168  zone_version(int *version_arg)
7147 7169  {
7148 7170          int version = ZONE_SYSCALL_API_VERSION;
7149 7171  
7150 7172          if (copyout(&version, version_arg, sizeof (int)) != 0)
7151 7173                  return (set_errno(EFAULT));
7152 7174          return (0);
7153 7175  }
7154 7176  
           /*
            * zone(2) system call dispatcher.
            *
            * cmd selects the ZONE_* subcommand.  arg1 through arg4 are
            * subcommand-specific user arguments; each case casts them to the types
            * its handler expects.  ZONE_CREATE first copies in a zone_def, converting
            * from zone_def32 when the caller is not using the native data model.
            *
            * Returns the selected handler's result.  Failures detected in this
            * function itself are always reported through set_errno(): EFAULT for a
            * failed copyin/copyout and EINVAL for an unrecognized cmd.
            */
7155 7177  /* ARGSUSED */
7156 7178  long
7157 7179  zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
7158 7180  {
7159 7181          zone_def zs;
7160 7182          int err;
7161 7183  
7162 7184          switch (cmd) {
7163 7185          case ZONE_CREATE:
7164 7186                  if (get_udatamodel() == DATAMODEL_NATIVE) {
7165 7187                          if (copyin(arg1, &zs, sizeof (zone_def))) {
7166 7188                                  return (set_errno(EFAULT));
7167 7189                          }
7168 7190                  } else {
7169 7191  #ifdef _SYSCALL32_IMPL
7170 7192                          zone_def32 zs32;
7171 7193  
7172 7194                          if (copyin(arg1, &zs32, sizeof (zone_def32))) {
7173 7195                                  return (set_errno(EFAULT));
7174 7196                          }
                                   /*
                                    * Widen each 32-bit field (pointers included)
                                    * into the native zone_def.
                                    */
7175 7197                          zs.zone_name =
7176 7198                              (const char *)(unsigned long)zs32.zone_name;
7177 7199                          zs.zone_root =
7178 7200                              (const char *)(unsigned long)zs32.zone_root;
7179 7201                          zs.zone_privs =
7180 7202                              (const struct priv_set *)
7181 7203                              (unsigned long)zs32.zone_privs;
7182 7204                          zs.zone_privssz = zs32.zone_privssz;
7183 7205                          zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
7184 7206                          zs.rctlbufsz = zs32.rctlbufsz;
7185 7207                          zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
7186 7208                          zs.zfsbufsz = zs32.zfsbufsz;
7187 7209                          zs.extended_error =
7188 7210                              (int *)(unsigned long)zs32.extended_error;
7189 7211                          zs.match = zs32.match;
7190 7212                          zs.doi = zs32.doi;
7191 7213                          zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
7192 7214                          zs.flags = zs32.flags;
7193 7215                          zs.zoneid = zs32.zoneid;
7194 7216  #else
7195 7217                          panic("get_udatamodel() returned bogus result\n");
7196 7218  #endif
7197 7219                  }
7198 7220  
7199 7221                  return (zone_create(zs.zone_name, zs.zone_root,
7200 7222                      zs.zone_privs, zs.zone_privssz,
7201 7223                      (caddr_t)zs.rctlbuf, zs.rctlbufsz,
7202 7224                      (caddr_t)zs.zfsbuf, zs.zfsbufsz,
7203 7225                      zs.extended_error, zs.match, zs.doi,
7204 7226                      zs.label, zs.flags, zs.zoneid));
7205 7227          case ZONE_BOOT:
7206 7228                  return (zone_boot((zoneid_t)(uintptr_t)arg1));
7207 7229          case ZONE_DESTROY:
7208 7230                  return (zone_destroy((zoneid_t)(uintptr_t)arg1));
7209 7231          case ZONE_GETATTR:
7210 7232                  return (zone_getattr((zoneid_t)(uintptr_t)arg1,
7211 7233                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7212 7234          case ZONE_SETATTR:
7213 7235                  return (zone_setattr((zoneid_t)(uintptr_t)arg1,
7214 7236                      (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7215 7237          case ZONE_ENTER:
7216 7238                  return (zone_enter((zoneid_t)(uintptr_t)arg1));
7217 7239          case ZONE_LIST:
7218 7240                  return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
7219 7241          case ZONE_SHUTDOWN:
7220 7242                  return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
7221 7243          case ZONE_LOOKUP:
7222 7244                  return (zone_lookup((const char *)arg1));
7223 7245          case ZONE_VERSION:
7224 7246                  return (zone_version((int *)arg1));
7225 7247          case ZONE_ADD_DATALINK:
7226 7248                  return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
7227 7249                      (datalink_id_t)(uintptr_t)arg2));
7228 7250          case ZONE_DEL_DATALINK:
7229 7251                  return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
7230 7252                      (datalink_id_t)(uintptr_t)arg2));
7231 7253          case ZONE_CHECK_DATALINK: {
7232 7254                  zoneid_t        zoneid;
7233 7255                  boolean_t       need_copyout;
7234 7256  
                           /*
                            * arg1 is a user pointer to a zoneid here.  A failed
                            * copyin must go through set_errno() like every other
                            * failure in this function; returning the bare errno
                            * value would hand it back as the syscall's result.
                            */
7235 7257                  if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
7236 7258                          return (set_errno(EFAULT));
                           /*
                            * The zoneid is only written back when the caller passed
                            * ALL_ZONES; presumably zone_check_datalink() then stores
                            * the matching zone in it -- confirm against that function.
                            */
7237 7259                  need_copyout = (zoneid == ALL_ZONES);
7238 7260                  err = zone_check_datalink(&zoneid,
7239 7261                      (datalink_id_t)(uintptr_t)arg2);
7240 7262                  if (err == 0 && need_copyout) {
7241 7263                          if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
7242 7264                                  err = EFAULT;
7243 7265                  }
7244 7266                  return (err == 0 ? 0 : set_errno(err));
7245 7267          }
7246 7268          case ZONE_LIST_DATALINK:
7247 7269                  return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
7248 7270                      (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
7249 7271          default:
7250 7272                  return (set_errno(EINVAL));
7251 7273          }
7252 7274  }
7253 7275  
           /*
            * Argument bundle passed to the zone_ki_call_zoneadmd() thread.  It is
            * allocated by zone_kadmin() and freed by the thread once it has copied
            * the contents out.
            */
7254 7276  struct zarg {
7255 7277          zone_t *zone;           /* zone to shut down; held by zone_kadmin() */
7256 7278          zone_cmd_arg_t arg;     /* request sent to zoneadmd over its door */
7257 7279  };
7258 7280  
           /*
            * Open the door for the named zone.  The path is produced by formatting
            * ZONE_DOOR_PATH with zone_name, and the handle from door_ki_open() is
            * stored in *doorp.
            *
            * Returns whatever door_ki_open() returns.
            */
7259 7281  static int
7260 7282  zone_lookup_door(const char *zone_name, door_handle_t *doorp)
7261 7283  {
7262 7284          char *buf;
7263 7285          size_t buflen;
7264 7286          int error;
7265 7287  
                   /*
                    * sizeof counts the format string's terminating NUL.
                    * NOTE(review): assumes ZONE_DOOR_PATH has a single "%s", so the
                    * formatted path always fits in buflen bytes -- confirm.
                    */
7266 7288          buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
7267 7289          buf = kmem_alloc(buflen, KM_SLEEP);
7268 7290          (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
7269 7291          error = door_ki_open(buf, doorp);
7270 7292          kmem_free(buf, buflen);
7271 7293          return (error);
7272 7294  }
7273 7295  
           /*
            * Release the door handle in *doorp with door_ki_rele() and reset *doorp
            * to NULL so the caller can tell that it no longer has a door open.
            */
7274 7296  static void
7275 7297  zone_release_door(door_handle_t *doorp)
7276 7298  {
7277 7299          door_ki_rele(*doorp);
7278 7300          *doorp = NULL;
7279 7301  }
7280 7302  
           /*
            * Body of the kernel thread that zone_kadmin() creates to hand a
            * halt/reboot request to zoneadmd.
            *
            * zargp carries the zone (with a hold) and the request.  This function
            * frees zargp, empties the zone, drops the hold, and then makes a door
            * upcall to zoneadmd, retrying once per second until the call succeeds,
            * fails with an unexpected error, or the zone is gone or its id has been
            * recycled.  It never returns; it ends in thread_exit().
            */
7281 7303  static void
7282 7304  zone_ki_call_zoneadmd(struct zarg *zargp)
7283 7305  {
7284 7306          door_handle_t door = NULL;
7285 7307          door_arg_t darg, save_arg;
7286 7308          char *zone_name;
7287 7309          size_t zone_namelen;
7288 7310          zoneid_t zoneid;
7289 7311          zone_t *zone;
7290 7312          zone_cmd_arg_t arg;
7291 7313          uint64_t uniqid;
7292 7314          size_t size;
7293 7315          int error;
7294 7316          int retry;
7295 7317  
                   /* Copy the contents out; zargp is ours to free. */
7296 7318          zone = zargp->zone;
7297 7319          arg = zargp->arg;
7298 7320          kmem_free(zargp, sizeof (*zargp));
7299 7321  
                   /*
                    * Take private copies of the name, id and uniqid.  The hold on
                    * the zone is dropped below, after which the zone_t must not be
                    * dereferenced.
                    */
7300 7322          zone_namelen = strlen(zone->zone_name) + 1;
7301 7323          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
7302 7324          bcopy(zone->zone_name, zone_name, zone_namelen);
7303 7325          zoneid = zone->zone_id;
7304 7326          uniqid = zone->zone_uniqid;
7305 7327          arg.status = zone->zone_init_status;
7306 7328          /*
7307 7329           * zoneadmd may be down, but at least we can empty out the zone.
7308 7330           * We can ignore the return value of zone_empty() since we're called
7309 7331           * from a kernel thread and know we won't be delivered any signals.
7310 7332           */
7311 7333          ASSERT(curproc == &p0);
7312 7334          (void) zone_empty(zone);
7313 7335          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
7314 7336          zone_rele(zone);
7315 7337  
                   /* The same buffer is used for both the request and the reply. */
7316 7338          size = sizeof (arg);
7317 7339          darg.rbuf = (char *)&arg;
7318 7340          darg.data_ptr = (char *)&arg;
7319 7341          darg.rsize = size;
7320 7342          darg.data_size = size;
7321 7343          darg.desc_ptr = NULL;
7322 7344          darg.desc_num = 0;
7323 7345  
                   /* Pristine copy, used to reset darg before each retry. */
7324 7346          save_arg = darg;
7325 7347          /*
7326 7348           * Since we're not holding a reference to the zone, any number of
7327 7349           * things can go wrong, including the zone disappearing before we get a
7328 7350           * chance to talk to zoneadmd.
7329 7351           */
7330 7352          for (retry = 0; /* forever */; retry++) {
                           /* Open the door if we don't currently have one. */
7331 7353                  if (door == NULL &&
7332 7354                      (error = zone_lookup_door(zone_name, &door)) != 0) {
7333 7355                          goto next;
7334 7356                  }
7335 7357                  ASSERT(door != NULL);
7336 7358  
                           /* Success ends the loop. */
7337 7359                  if ((error = door_ki_upcall_limited(door, &darg, NULL,
7338 7360                      SIZE_MAX, 0)) == 0) {
7339 7361                          break;
7340 7362                  }
                           /*
                            * Every break inside this switch only leaves the switch;
                            * control then falls to the "next" label and retries.
                            */
7341 7363                  switch (error) {
7342 7364                  case EINTR:
7343 7365                          /* FALLTHROUGH */
7344 7366                  case EAGAIN:    /* process may be forking */
7345 7367                          /*
7346 7368                           * Back off for a bit
7347 7369                           */
7348 7370                          break;
7349 7371                  case EBADF:
                                   /*
                                    * Drop the stale handle and try to reopen.  If that
                                    * fails, door stays NULL and the next pass looks it
                                    * up again.
                                    */
7350 7372                          zone_release_door(&door);
7351 7373                          if (zone_lookup_door(zone_name, &door) != 0) {
7352 7374                                  /*
7353 7375                                   * zoneadmd may be dead, but it may come back to
7354 7376                                   * life later.
7355 7377                                   */
7356 7378                                  break;
7357 7379                          }
7358 7380                          break;
7359 7381                  default:
7360 7382                          cmn_err(CE_WARN,
7361 7383                              "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
7362 7384                              error);
7363 7385                          goto out;
7364 7386                  }
7365 7387  next:
7366 7388                  /*
7367 7389                   * If this isn't the same zone_t that we originally had in mind,
7368 7390                   * then this is the same as if two kadmin requests come in at
7369 7391                   * the same time: the first one wins.  This means we lose, so we
7370 7392                   * bail.
7371 7393                   */
7372 7394                  if ((zone = zone_find_by_id(zoneid)) == NULL) {
7373 7395                          /*
7374 7396                           * Problem is solved.
7375 7397                           */
7376 7398                          break;
7377 7399                  }
7378 7400                  if (zone->zone_uniqid != uniqid) {
7379 7401                          /*
7380 7402                           * zoneid recycled
7381 7403                           */
7382 7404                          zone_rele(zone);
7383 7405                          break;
7384 7406                  }
7385 7407                  /*
7386 7408                   * We could zone_status_timedwait(), but there doesn't seem to
7387 7409                   * be much point in doing that (plus, it would mean that
7388 7410                   * zone_free() isn't called until this thread exits).
7389 7411                   */
7390 7412                  zone_rele(zone);
                           /* Wait hz ticks, then retry with a fresh darg. */
7391 7413                  delay(hz);
7392 7414                  darg = save_arg;
7393 7415          }
7394 7416  out:
7395 7417          if (door != NULL) {
7396 7418                  zone_release_door(&door);
7397 7419          }
7398 7420          kmem_free(zone_name, zone_namelen);
7399 7421          thread_exit();
7400 7422  }
7401 7423  
7402 7424  /*
7403 7425   * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
7404 7426   * kadmin().  The caller is a process in the zone.
7405 7427   *
7406 7428   * In order to shutdown the zone, we will hand off control to zoneadmd
7407 7429   * (running in the global zone) via a door.  We do a half-hearted job at
7408 7430   * killing all processes in the zone, create a kernel thread to contact
7409 7431   * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
7410 7432   * a form of generation number used to let zoneadmd (as well as
7411 7433   * zone_destroy()) know exactly which zone they're talking about.
            *
            * cmd, fcn: the uadmin() command and function.  A_SHUTDOWN with AD_HALT or
            *     AD_POWEROFF becomes Z_HALT; A_SHUTDOWN with AD_BOOT, and A_REBOOT,
            *     become Z_REBOOT.
            * mdep: optional boot arguments, already copied into the kernel; copied
            *     into the request's bootbuf when not NULL.
            * credp: credentials checked with secpolicy_zone_admin().
            *
            * Returns a plain errno value (not set_errno()): ENOTSUP or EINVAL for
            * commands that are not handled, EPERM if the policy check fails, and 0
            * if the zone is already past ZONE_IS_RUNNING.  Otherwise the calling
            * process exits via exit(CLD_EXITED, 0).
7412 7434   */
7413 7435  int
7414 7436  zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
7415 7437  {
7416 7438          struct zarg *zargp;
7417 7439          zone_cmd_t zcmd;
7418 7440          zone_t *zone;
7419 7441  
7420 7442          zone = curproc->p_zone;
7421 7443          ASSERT(getzoneid() != GLOBAL_ZONEID);
7422 7444  
                   /* Translate the uadmin() request into a zoneadmd command. */
7423 7445          switch (cmd) {
7424 7446          case A_SHUTDOWN:
7425 7447                  switch (fcn) {
7426 7448                  case AD_HALT:
7427 7449                  case AD_POWEROFF:
7428 7450                          zcmd = Z_HALT;
7429 7451                          break;
7430 7452                  case AD_BOOT:
7431 7453                          zcmd = Z_REBOOT;
7432 7454                          break;
7433 7455                  case AD_IBOOT:
7434 7456                  case AD_SBOOT:
7435 7457                  case AD_SIBOOT:
7436 7458                  case AD_NOSYNC:
7437 7459                          return (ENOTSUP);
7438 7460                  default:
7439 7461                          return (EINVAL);
7440 7462                  }
7441 7463                  break;
7442 7464          case A_REBOOT:
7443 7465                  zcmd = Z_REBOOT;
7444 7466                  break;
7445 7467          case A_FTRACE:
7446 7468          case A_REMOUNT:
7447 7469          case A_FREEZE:
7448 7470          case A_DUMP:
7449 7471          case A_CONFIG:
7450 7472                  return (ENOTSUP);
7451 7473          default:
7452 7474                  ASSERT(cmd != A_SWAPCTL);       /* handled by uadmin() */
7453 7475                  return (EINVAL);
7454 7476          }
7455 7477  
7456 7478          if (secpolicy_zone_admin(credp, B_FALSE))
7457 7479                  return (EPERM);
7458 7480          mutex_enter(&zone_status_lock);
7459 7481  
7460 7482          /*
7461 7483           * zone_status can't be ZONE_IS_EMPTY or higher since curproc
7462 7484           * is in the zone.
7463 7485           */
7464 7486          ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7465 7487          if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7466 7488                  /*
7467 7489                   * This zone is already on its way down.
7468 7490                   */
7469 7491                  mutex_exit(&zone_status_lock);
7470 7492                  return (0);
7471 7493          }
7472 7494          /*
7473 7495           * Prevent future zone_enter()s
7474 7496           */
7475 7497          zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7476 7498          mutex_exit(&zone_status_lock);
7477 7499  
7478 7500          /*
7479 7501           * Kill everyone now and call zoneadmd later.
7480 7502           * zone_ki_call_zoneadmd() will do a more thorough job of this
7481 7503           * later.
7482 7504           */
7483 7505          killall(zone->zone_id, B_FALSE);
7484 7506          /*
7485 7507           * Now, create the thread to contact zoneadmd and do the rest of the
7486 7508           * work.  This thread can't be created in our zone otherwise
7487 7509           * zone_destroy() would deadlock.
7488 7510           */
7489 7511          zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7490 7512          zargp->arg.cmd = zcmd;
7491 7513          zargp->arg.uniqid = zone->zone_uniqid;
7492 7514          zargp->zone = zone;
7493 7515          (void) strcpy(zargp->arg.locale, "C");
7494 7516          /* mdep was already copied in for us by uadmin */
7495 7517          if (mdep != NULL)
7496 7518                  (void) strlcpy(zargp->arg.bootbuf, mdep,
7497 7519                      sizeof (zargp->arg.bootbuf));
                   /*
                    * This hold is for the new thread; zone_ki_call_zoneadmd()
                    * releases it, and also frees zargp.
                    */
7498 7520          zone_hold(zone);
7499 7521  
                   /* The thread is created in p0, not in the caller's process. */
7500 7522          (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7501 7523              TS_RUN, minclsyspri);
7502 7524          exit(CLD_EXITED, 0);
7503 7525  
                   /* Presumably not reached: exit() is not expected to return. */
7504 7526          return (EINVAL);
7505 7527  }
7506 7528  
7507 7529  /*
7508 7530   * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
7509 7531   * status to ZONE_IS_SHUTTING_DOWN.
7510 7532   *
7511 7533   * This function also shuts down all running zones to ensure that they won't
7512 7534   * fork new processes.
            *
            * Must be called from the global zone, while the global zone is still
            * ZONE_IS_RUNNING (both are asserted).  Takes zonehash_lock and then
            * zone_status_lock, and drops both before returning.
7513 7535   */
7514 7536  void
7515 7537  zone_shutdown_global(void)
7516 7538  {
7517 7539          zone_t *current_zonep;
7518 7540  
7519 7541          ASSERT(INGLOBALZONE(curproc));
7520 7542          mutex_enter(&zonehash_lock);
7521 7543          mutex_enter(&zone_status_lock);
7522 7544  
7523 7545          /* Modify the global zone's status first. */
7524 7546          ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
7525 7547          zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
7526 7548  
7527 7549          /*
7528 7550           * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
7529 7551           * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
7530 7552           * could cause assertions to fail (e.g., assertions about a zone's
7531 7553           * state during initialization, readying, or booting) or produce races.
7532 7554           * We'll let threads continue to initialize and ready new zones: they'll
7533 7555           * fail to boot the new zones when they see that the global zone is
7534 7556           * shutting down.
7535 7557           */
7536 7558          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
7537 7559              current_zonep = list_next(&zone_active, current_zonep)) {
                           /* Zones in any state other than RUNNING are left alone. */
7538 7560                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
7539 7561                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
7540 7562          }
7541 7563          mutex_exit(&zone_status_lock);
7542 7564          mutex_exit(&zonehash_lock);
7543 7565  }
7544 7566  
7545 7567  /*
7546 7568   * Returns true if the named dataset is visible in the specified zone.
7547 7569   * The 'write' parameter is set to 1 if the dataset is also writable.
            *
            * zone: zone whose zone_datasets list and zone_vfslist are searched.
            * dataset: dataset name to test; an empty string is never visible.
            * write: optional out-parameter (may be NULL).  Set to 1 when the dataset
            *     is, or lies beneath, an entry on zone_datasets; set to 0 when it is
            *     visible only as a parent of such an entry or through a mounted zfs
            *     filesystem.  Left untouched when the function returns 0.
            *
            * Returns 1 if the dataset is visible, 0 otherwise.
7548 7570   */
7549 7571  int
7550 7572  zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
7551 7573  {
                   /* Filesystem type index of "zfs", looked up once and cached. */
7552 7574          static int zfstype = -1;
7553 7575          zone_dataset_t *zd;
7554 7576          size_t len;
7555 7577          const char *name = NULL;
7556 7578          vfs_t *vfsp = NULL;
7557 7579  
7558 7580          if (dataset[0] == '\0')
7559 7581                  return (0);
7560 7582  
7561 7583          /*
7562 7584           * Walk the list once, looking for datasets which match exactly, or
7563 7585           * specify a dataset underneath an exported dataset.  If found, return
7564 7586           * true and note that it is writable.
7565 7587           */
7566 7588          for (zd = list_head(&zone->zone_datasets); zd != NULL;
7567 7589              zd = list_next(&zone->zone_datasets, zd)) {
7568 7590  
                           /*
                            * Prefix match that must end on a name boundary: end of
                            * string, '/' or '@'.
                            */
7569 7591                  len = strlen(zd->zd_dataset);
7570 7592                  if (strlen(dataset) >= len &&
7571 7593                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
7572 7594                      (dataset[len] == '\0' || dataset[len] == '/' ||
7573 7595                      dataset[len] == '@')) {
7574 7596                          if (write)
7575 7597                                  *write = 1;
7576 7598                          return (1);
7577 7599                  }
7578 7600          }
7579 7601  
7580 7602          /*
7581 7603           * Walk the list a second time, searching for datasets which are parents
7582 7604           * of exported datasets.  These should be visible, but read-only.
7583 7605           *
7584 7606           * Note that we also have to support forms such as 'pool/dataset/', with
7585 7607           * a trailing slash.
7586 7608           */
7587 7609          for (zd = list_head(&zone->zone_datasets); zd != NULL;
7588 7610              zd = list_next(&zone->zone_datasets, zd)) {
7589 7611  
                           /* dataset is non-empty (checked above), so len >= 1. */
7590 7612                  len = strlen(dataset);
7591 7613                  if (dataset[len - 1] == '/')
7592 7614                          len--;  /* Ignore trailing slash */
7593 7615                  if (len < strlen(zd->zd_dataset) &&
7594 7616                      bcmp(dataset, zd->zd_dataset, len) == 0 &&
7595 7617                      zd->zd_dataset[len] == '/') {
7596 7618                          if (write)
7597 7619                                  *write = 0;
7598 7620                          return (1);
7599 7621                  }
7600 7622          }
7601 7623  
7602 7624          /*
7603 7625           * We reach here if the given dataset is not found in the zone_dataset
7604 7626           * list. Check if this dataset was added as a filesystem (ie. "add fs")
7605 7627           * instead of delegation. For this we search for the dataset in the
7606 7628           * zone_vfslist of this zone. If found, return true and note that it is
7607 7629           * not writable.
7608 7630           */
7609 7631  
7610 7632          /*
7611 7633           * Initialize zfstype if it is not initialized yet.
                    *
                    * NOTE(review): vswp is used without a NULL check -- confirm that
                    * vfs_getvfssw("zfs") cannot fail for callers of this function.
7612 7634           */
7613 7635          if (zfstype == -1) {
7614 7636                  struct vfssw *vswp = vfs_getvfssw("zfs");
7615 7637                  zfstype = vswp - vfssw;
7616 7638                  vfs_unrefvfssw(vswp);
7617 7639          }
7618 7640  
                   /*
                    * zone_vfslist is walked through vfs_zone_next until we are back at
                    * the head.  A NULL head means there is nothing to check.
                    */
7619 7641          vfs_list_read_lock();
7620 7642          vfsp = zone->zone_vfslist;
7621 7643          do {
7622 7644                  if (vfsp == NULL)
7623 7645                          break;
7624 7646                  if (vfsp->vfs_fstype == zfstype) {
7625 7647                          name = refstr_value(vfsp->vfs_resource);
7626 7648  
7627 7649                          /*
7628 7650                           * Check if we have an exact match.
7629 7651                           */
7630 7652                          if (strcmp(dataset, name) == 0) {
7631 7653                                  vfs_list_unlock();
7632 7654                                  if (write)
7633 7655                                          *write = 0;
7634 7656                                  return (1);
7635 7657                          }
7636 7658                          /*
7637 7659                           * We need to check if we are looking for parents of
7638 7660                           * a dataset. These should be visible, but read-only.
7639 7661                           */
7640 7662                          len = strlen(dataset);
7641 7663                          if (dataset[len - 1] == '/')
7642 7664                                  len--;
7643 7665  
7644 7666                          if (len < strlen(name) &&
7645 7667                              bcmp(dataset, name, len) == 0 && name[len] == '/') {
7646 7668                                  vfs_list_unlock();
7647 7669                                  if (write)
7648 7670                                          *write = 0;
7649 7671                                  return (1);
7650 7672                          }
7651 7673                  }
7652 7674                  vfsp = vfsp->vfs_zone_next;
7653 7675          } while (vfsp != zone->zone_vfslist);
7654 7676  
7655 7677          vfs_list_unlock();
7656 7678          return (0);
7657 7679  }
7658 7680  
7659 7681  /*
7660 7682   * Returns true if the named dataset is visible in the current zone.
7661 7683   * The 'write' parameter is set to 1 if the dataset is also writable.
7662 7684   */
7663 7685  int
7664 7686  zone_dataset_visible(const char *dataset, int *write)
7665 7687  {
7666 7688          zone_t *zone = curproc->p_zone;
7667 7689  
7668 7690          return (zone_dataset_visible_inzone(zone, dataset, write));
7669 7691  }
7670 7692  
7671 7693  /*
7672 7694   * zone_find_by_any_path() -
7673 7695   *
7674 7696   * kernel-private routine similar to zone_find_by_path(), but which
7675 7697   * effectively compares against zone paths rather than zonerootpath
7676 7698   * (i.e., the last component of zonerootpaths, which should be "root/",
7677 7699   * are not compared.)  This is done in order to accurately identify all
7678 7700   * paths, whether zone-visible or not, including those which are parallel
7679 7701   * to /root/, such as /dev/, /home/, etc...
7680 7702   *
7681 7703   * If the specified path does not fall under any zone path then global
7682 7704   * zone is returned.
7683 7705   *
7684 7706   * The treat_abs parameter indicates whether the path should be treated as
7685 7707   * an absolute path although it does not begin with "/".  (This supports
7686 7708   * nfs mount syntax such as host:any/path.)
7687 7709   *
7688 7710   * The caller is responsible for zone_rele of the returned zone.
7689 7711   */
7690 7712  zone_t *
7691 7713  zone_find_by_any_path(const char *path, boolean_t treat_abs)
7692 7714  {
7693 7715          zone_t *zone;
7694 7716          int path_offset = 0;
7695 7717  
7696 7718          if (path == NULL) {
7697 7719                  zone_hold(global_zone);
7698 7720                  return (global_zone);
7699 7721          }
7700 7722  
7701 7723          if (*path != '/') {
7702 7724                  ASSERT(treat_abs);
7703 7725                  path_offset = 1;
7704 7726          }
7705 7727  
7706 7728          mutex_enter(&zonehash_lock);
7707 7729          for (zone = list_head(&zone_active); zone != NULL;
7708 7730              zone = list_next(&zone_active, zone)) {
7709 7731                  char    *c;
7710 7732                  size_t  pathlen;
7711 7733                  char *rootpath_start;
7712 7734  
7713 7735                  if (zone == global_zone)        /* skip global zone */
7714 7736                          continue;
7715 7737  
7716 7738                  /* scan backwards to find start of last component */
7717 7739                  c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
7718 7740                  do {
7719 7741                          c--;
7720 7742                  } while (*c != '/');
7721 7743  
7722 7744                  pathlen = c - zone->zone_rootpath + 1 - path_offset;
7723 7745                  rootpath_start = (zone->zone_rootpath + path_offset);
7724 7746                  if (strncmp(path, rootpath_start, pathlen) == 0)
7725 7747                          break;
7726 7748          }
7727 7749          if (zone == NULL)
7728 7750                  zone = global_zone;
7729 7751          zone_hold(zone);
7730 7752          mutex_exit(&zonehash_lock);
7731 7753          return (zone);
7732 7754  }
7733 7755  
7734 7756  /*
7735 7757   * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
7736 7758   * zone_dl_t pointer if found, and NULL otherwise.
7737 7759   */
7738 7760  static zone_dl_t *
7739 7761  zone_find_dl(zone_t *zone, datalink_id_t linkid)
7740 7762  {
7741 7763          zone_dl_t *zdl;
7742 7764  
7743 7765          ASSERT(mutex_owned(&zone->zone_lock));
7744 7766          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7745 7767              zdl = list_next(&zone->zone_dl_list, zdl)) {
7746 7768                  if (zdl->zdl_id == linkid)
7747 7769                          break;
7748 7770          }
7749 7771          return (zdl);
7750 7772  }
7751 7773  
7752 7774  static boolean_t
7753 7775  zone_dl_exists(zone_t *zone, datalink_id_t linkid)
7754 7776  {
7755 7777          boolean_t exists;
7756 7778  
7757 7779          mutex_enter(&zone->zone_lock);
7758 7780          exists = (zone_find_dl(zone, linkid) != NULL);
7759 7781          mutex_exit(&zone->zone_lock);
7760 7782          return (exists);
7761 7783  }
7762 7784  
7763 7785  /*
7764 7786   * Add an data link name for the zone.
7765 7787   */
7766 7788  static int
7767 7789  zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
7768 7790  {
7769 7791          zone_dl_t *zdl;
7770 7792          zone_t *zone;
7771 7793          zone_t *thiszone;
7772 7794  
7773 7795          /*
7774 7796           * Only the GZ may add a datalink to a zone's list.
7775 7797           */
7776 7798          if (getzoneid() != GLOBAL_ZONEID)
7777 7799                  return (set_errno(EPERM));
7778 7800  
7779 7801          /*
7780 7802           * Only a process with the datalink config priv may add a
7781 7803           * datalink to a zone's list.
7782 7804           */
7783 7805          if (secpolicy_dl_config(CRED()) != 0)
7784 7806                  return (set_errno(EPERM));
7785 7807  
7786 7808          /*
7787 7809           * When links exist in the GZ, they aren't added to the GZ's
7788 7810           * zone_dl_list. We must enforce this because link_activate()
7789 7811           * depends on zone_check_datalink() returning only NGZs.
7790 7812           */
7791 7813          if (zoneid == GLOBAL_ZONEID)
7792 7814                  return (set_errno(EINVAL));
7793 7815  
7794 7816          if ((thiszone = zone_find_by_id(zoneid)) == NULL)
7795 7817                  return (set_errno(ENXIO));
7796 7818  
7797 7819          /* Verify that the datalink ID doesn't already belong to a zone. */
7798 7820          mutex_enter(&zonehash_lock);
7799 7821          for (zone = list_head(&zone_active); zone != NULL;
7800 7822              zone = list_next(&zone_active, zone)) {
7801 7823                  if (zone_dl_exists(zone, linkid)) {
7802 7824                          mutex_exit(&zonehash_lock);
7803 7825                          zone_rele(thiszone);
7804 7826                          return (set_errno((zone == thiszone) ? EEXIST : EPERM));
7805 7827                  }
7806 7828          }
7807 7829  
7808 7830          zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
7809 7831          zdl->zdl_id = linkid;
7810 7832          zdl->zdl_net = NULL;
7811 7833          mutex_enter(&thiszone->zone_lock);
7812 7834          list_insert_head(&thiszone->zone_dl_list, zdl);
7813 7835          mutex_exit(&thiszone->zone_lock);
7814 7836          mutex_exit(&zonehash_lock);
7815 7837          zone_rele(thiszone);
7816 7838          return (0);
7817 7839  }
7818 7840  
7819 7841  static int
7820 7842  zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
7821 7843  {
7822 7844          zone_dl_t *zdl;
7823 7845          zone_t *zone;
7824 7846          int err = 0;
7825 7847  
7826 7848          /*
7827 7849           * Only the GZ may remove a datalink from a zone's list.
7828 7850           */
7829 7851          if (getzoneid() != GLOBAL_ZONEID)
7830 7852                  return (set_errno(EPERM));
7831 7853  
7832 7854          /*
7833 7855           * Only a process with the datalink config priv may remove a
7834 7856           * datalink from a zone's list.
7835 7857           */
7836 7858          if (secpolicy_dl_config(CRED()) != 0)
7837 7859                  return (set_errno(EPERM));
7838 7860  
7839 7861          /*
7840 7862           * If we can't add a datalink to the GZ's zone_dl_list then we
7841 7863           * certainly can't remove them either.
7842 7864           */
7843 7865          if (zoneid == GLOBAL_ZONEID)
7844 7866                  return (set_errno(EINVAL));
7845 7867  
7846 7868          if ((zone = zone_find_by_id(zoneid)) == NULL)
7847 7869                  return (set_errno(EINVAL));
7848 7870  
7849 7871          mutex_enter(&zone->zone_lock);
7850 7872          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
7851 7873                  err = ENXIO;
7852 7874          } else {
7853 7875                  list_remove(&zone->zone_dl_list, zdl);
7854 7876                  nvlist_free(zdl->zdl_net);
7855 7877                  kmem_free(zdl, sizeof (zone_dl_t));
7856 7878          }
7857 7879          mutex_exit(&zone->zone_lock);
7858 7880          zone_rele(zone);
7859 7881          return (err == 0 ? 0 : set_errno(err));
7860 7882  }
7861 7883  
7862 7884  /*
7863 7885   *
7864 7886   * This function may be used in two ways:
7865 7887   *
7866 7888   * 1. to get the zoneid of the zone this link is under, or
7867 7889   *
7868 7890   * 2. to verify that the link is under a specific zone.
7869 7891   *
7870 7892   * The first use is achieved by passing a zoneid of ALL_ZONES. The
7871 7893   * function then iterates the datalink list of every zone on the
7872 7894   * system until it finds the linkid. If the linkid is found then the
7873 7895   * function returns 0 and zoneidp is updated. Otherwise, ENXIO is
7874 7896   * returned and zoneidp is not modified. The use of ALL_ZONES is
7875 7897   * limited to callers in the GZ to prevent leaking information to
7876 7898   * NGZs. If an NGZ passes ALL_ZONES it's query is implicitly changed
7877 7899   * to the second type in the list above.
7878 7900   *
7879 7901   * The second use is achieved by passing a specific zoneid. The GZ can
7880 7902   * use this to verify a link is under a particular zone. An NGZ can
7881 7903   * use this to verify a link is under itself. But an NGZ cannot use
7882 7904   * this to determine if a link is under some other zone as that would
7883 7905   * result in information leakage. If the link exists under the zone
7884 7906   * then 0 is returned. Otherwise, ENXIO is returned.
7885 7907   */
7886 7908  int
7887 7909  zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
7888 7910  {
7889 7911          zone_t *zone;
7890 7912          zoneid_t zoneid = *zoneidp;
7891 7913          zoneid_t caller = getzoneid();
7892 7914          int err = ENXIO;
7893 7915  
7894 7916          /*
7895 7917           * Only the GZ may enquire about all zones; an NGZ may only
7896 7918           * enuqire about itself.
7897 7919           */
7898 7920          if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID)
7899 7921                  zoneid = caller;
7900 7922  
7901 7923          if (zoneid != caller && caller != GLOBAL_ZONEID)
7902 7924                  return (err);
7903 7925  
7904 7926          if (zoneid != ALL_ZONES) {
7905 7927                  if ((zone = zone_find_by_id(zoneid)) != NULL) {
7906 7928                          if (zone_dl_exists(zone, linkid)) {
7907 7929                                  /*
7908 7930                                   * We need to set this in case an NGZ
7909 7931                                   * passes ALL_ZONES.
7910 7932                                   */
7911 7933                                  *zoneidp = zoneid;
7912 7934                                  err = 0;
7913 7935                          }
7914 7936                          zone_rele(zone);
7915 7937                  }
7916 7938                  return (err);
7917 7939          }
7918 7940  
7919 7941          ASSERT(caller == GLOBAL_ZONEID);
7920 7942          mutex_enter(&zonehash_lock);
7921 7943          for (zone = list_head(&zone_active); zone != NULL;
7922 7944              zone = list_next(&zone_active, zone)) {
7923 7945                  if (zone_dl_exists(zone, linkid)) {
7924 7946                          *zoneidp = zone->zone_id;
7925 7947                          err = 0;
7926 7948                          break;
7927 7949                  }
7928 7950          }
7929 7951          mutex_exit(&zonehash_lock);
7930 7952  
7931 7953          return (err);
7932 7954  }
7933 7955  
7934 7956  /*
7935 7957   * Get the list of datalink IDs assigned to a zone.
7936 7958   *
7937 7959   * On input, *nump is the number of datalink IDs that can fit in the supplied
7938 7960   * idarray.  Upon return, *nump is either set to the number of datalink IDs
7939 7961   * that were placed in the array if the array was large enough, or to the
7940 7962   * number of datalink IDs that the function needs to place in the array if the
7941 7963   * array is too small.
7942 7964   */
7943 7965  static int
7944 7966  zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
7945 7967  {
7946 7968          uint_t num, dlcount;
7947 7969          zone_t *zone;
7948 7970          zone_dl_t *zdl;
7949 7971          datalink_id_t *idptr = idarray;
7950 7972  
7951 7973          /*
7952 7974           * Only the GZ or the owning zone may look at the datalink list.
7953 7975           */
7954 7976          if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid))
7955 7977                  return (set_errno(EPERM));
7956 7978  
7957 7979          if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
7958 7980                  return (set_errno(EFAULT));
7959 7981          if ((zone = zone_find_by_id(zoneid)) == NULL)
7960 7982                  return (set_errno(ENXIO));
7961 7983  
7962 7984          num = 0;
7963 7985          mutex_enter(&zone->zone_lock);
7964 7986          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
7965 7987              zdl = list_next(&zone->zone_dl_list, zdl)) {
7966 7988                  /*
7967 7989                   * If the list is bigger than what the caller supplied, just
7968 7990                   * count, don't do copyout.
7969 7991                   */
7970 7992                  if (++num > dlcount)
7971 7993                          continue;
7972 7994                  if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
7973 7995                          mutex_exit(&zone->zone_lock);
7974 7996                          zone_rele(zone);
7975 7997                          return (set_errno(EFAULT));
7976 7998                  }
7977 7999                  idptr++;
7978 8000          }
7979 8001          mutex_exit(&zone->zone_lock);
7980 8002          zone_rele(zone);
7981 8003  
7982 8004          /*
7983 8005           * Prevent returning negative nump values -- we should never
7984 8006           * have this many links anyways.
7985 8007           */
7986 8008          if (num > INT_MAX)
7987 8009                  return (set_errno(EOVERFLOW));
7988 8010  
7989 8011          /* Increased or decreased, caller should be notified. */
7990 8012          if (num != dlcount) {
7991 8013                  if (copyout(&num, nump, sizeof (num)) != 0)
7992 8014                          return (set_errno(EFAULT));
7993 8015          }
7994 8016          return (0);
7995 8017  }
7996 8018  
7997 8019  /*
7998 8020   * Public interface for looking up a zone by zoneid. It's a customized version
7999 8021   * for netstack_zone_create(). It can only be called from the zsd create
8000 8022   * callbacks, since it doesn't have reference on the zone structure hence if
8001 8023   * it is called elsewhere the zone could disappear after the zonehash_lock
8002 8024   * is dropped.
8003 8025   *
8004 8026   * Furthermore it
8005 8027   * 1. Doesn't check the status of the zone.
8006 8028   * 2. It will be called even before zone_init is called, in that case the
8007 8029   *    address of zone0 is returned directly, and netstack_zone_create()
8008 8030   *    will only assign a value to zone0.zone_netstack, won't break anything.
8009 8031   * 3. Returns without the zone being held.
8010 8032   */
8011 8033  zone_t *
8012 8034  zone_find_by_id_nolock(zoneid_t zoneid)
8013 8035  {
8014 8036          zone_t *zone;
8015 8037  
8016 8038          mutex_enter(&zonehash_lock);
8017 8039          if (zonehashbyid == NULL)
8018 8040                  zone = &zone0;
8019 8041          else
8020 8042                  zone = zone_find_all_by_id(zoneid);
8021 8043          mutex_exit(&zonehash_lock);
8022 8044          return (zone);
8023 8045  }
8024 8046  
8025 8047  /*
8026 8048   * Walk the datalinks for a given zone
8027 8049   */
8028 8050  int
8029 8051  zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
8030 8052      void *data)
8031 8053  {
8032 8054          zone_t          *zone;
8033 8055          zone_dl_t       *zdl;
8034 8056          datalink_id_t   *idarray;
8035 8057          uint_t          idcount = 0;
8036 8058          int             i, ret = 0;
8037 8059  
8038 8060          if ((zone = zone_find_by_id(zoneid)) == NULL)
8039 8061                  return (ENOENT);
8040 8062  
8041 8063          /*
8042 8064           * We first build an array of linkid's so that we can walk these and
8043 8065           * execute the callback with the zone_lock dropped.
8044 8066           */
8045 8067          mutex_enter(&zone->zone_lock);
8046 8068          for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
8047 8069              zdl = list_next(&zone->zone_dl_list, zdl)) {
8048 8070                  idcount++;
8049 8071          }
8050 8072  
8051 8073          if (idcount == 0) {
8052 8074                  mutex_exit(&zone->zone_lock);
8053 8075                  zone_rele(zone);
8054 8076                  return (0);
8055 8077          }
8056 8078  
8057 8079          idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
8058 8080          if (idarray == NULL) {
8059 8081                  mutex_exit(&zone->zone_lock);
8060 8082                  zone_rele(zone);
8061 8083                  return (ENOMEM);
8062 8084          }
8063 8085  
8064 8086          for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
8065 8087              i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
8066 8088                  idarray[i] = zdl->zdl_id;
8067 8089          }
8068 8090  
8069 8091          mutex_exit(&zone->zone_lock);
8070 8092  
8071 8093          for (i = 0; i < idcount && ret == 0; i++) {
8072 8094                  if ((ret = (*cb)(idarray[i], data)) != 0)
8073 8095                          break;
8074 8096          }
8075 8097  
8076 8098          zone_rele(zone);
8077 8099          kmem_free(idarray, sizeof (datalink_id_t) * idcount);
8078 8100          return (ret);
8079 8101  }
8080 8102  
8081 8103  static char *
8082 8104  zone_net_type2name(int type)
8083 8105  {
8084 8106          switch (type) {
8085 8107          case ZONE_NETWORK_ADDRESS:
8086 8108                  return (ZONE_NET_ADDRNAME);
8087 8109          case ZONE_NETWORK_DEFROUTER:
8088 8110                  return (ZONE_NET_RTRNAME);
8089 8111          default:
8090 8112                  return (NULL);
8091 8113          }
8092 8114  }
8093 8115  
8094 8116  static int
8095 8117  zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
8096 8118  {
8097 8119          zone_t *zone;
8098 8120          zone_dl_t *zdl;
8099 8121          nvlist_t *nvl;
8100 8122          int err = 0;
8101 8123          uint8_t *new = NULL;
8102 8124          char *nvname;
8103 8125          int bufsize;
8104 8126          datalink_id_t linkid = znbuf->zn_linkid;
8105 8127  
8106 8128          if (secpolicy_zone_config(CRED()) != 0)
8107 8129                  return (set_errno(EPERM));
8108 8130  
8109 8131          if (zoneid == GLOBAL_ZONEID)
8110 8132                  return (set_errno(EINVAL));
8111 8133  
8112 8134          nvname = zone_net_type2name(znbuf->zn_type);
8113 8135          bufsize = znbuf->zn_len;
8114 8136          new = znbuf->zn_val;
8115 8137          if (nvname == NULL)
8116 8138                  return (set_errno(EINVAL));
8117 8139  
8118 8140          if ((zone = zone_find_by_id(zoneid)) == NULL) {
8119 8141                  return (set_errno(EINVAL));
8120 8142          }
8121 8143  
8122 8144          mutex_enter(&zone->zone_lock);
8123 8145          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
8124 8146                  err = ENXIO;
8125 8147                  goto done;
8126 8148          }
8127 8149          if ((nvl = zdl->zdl_net) == NULL) {
8128 8150                  if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
8129 8151                          err = ENOMEM;
8130 8152                          goto done;
8131 8153                  } else {
8132 8154                          zdl->zdl_net = nvl;
8133 8155                  }
8134 8156          }
8135 8157          if (nvlist_exists(nvl, nvname)) {
8136 8158                  err = EINVAL;
8137 8159                  goto done;
8138 8160          }
8139 8161          err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
8140 8162          ASSERT(err == 0);
8141 8163  done:
8142 8164          mutex_exit(&zone->zone_lock);
8143 8165          zone_rele(zone);
8144 8166          if (err != 0)
8145 8167                  return (set_errno(err));
8146 8168          else
8147 8169                  return (0);
8148 8170  }
8149 8171  
8150 8172  static int
8151 8173  zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
8152 8174  {
8153 8175          zone_t *zone;
8154 8176          zone_dl_t *zdl;
8155 8177          nvlist_t *nvl;
8156 8178          uint8_t *ptr;
8157 8179          uint_t psize;
8158 8180          int err = 0;
8159 8181          char *nvname;
8160 8182          int bufsize;
8161 8183          void *buf;
8162 8184          datalink_id_t linkid = znbuf->zn_linkid;
8163 8185  
8164 8186          if (zoneid == GLOBAL_ZONEID)
8165 8187                  return (set_errno(EINVAL));
8166 8188  
8167 8189          nvname = zone_net_type2name(znbuf->zn_type);
8168 8190          bufsize = znbuf->zn_len;
8169 8191          buf = znbuf->zn_val;
8170 8192  
8171 8193          if (nvname == NULL)
8172 8194                  return (set_errno(EINVAL));
8173 8195          if ((zone = zone_find_by_id(zoneid)) == NULL)
8174 8196                  return (set_errno(EINVAL));
8175 8197  
8176 8198          mutex_enter(&zone->zone_lock);
8177 8199          if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
8178 8200                  err = ENXIO;
8179 8201                  goto done;
8180 8202          }
8181 8203          if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
8182 8204                  err = ENOENT;
8183 8205                  goto done;
8184 8206          }
8185 8207          err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
8186 8208          ASSERT(err == 0);
8187 8209  
8188 8210          if (psize > bufsize) {
8189 8211                  err = ENOBUFS;
8190 8212                  goto done;
8191 8213          }
8192 8214          znbuf->zn_len = psize;
8193 8215          bcopy(ptr, buf, psize);
8194 8216  done:
8195 8217          mutex_exit(&zone->zone_lock);
8196 8218          zone_rele(zone);
8197 8219          if (err != 0)
8198 8220                  return (set_errno(err));
8199 8221          else
8200 8222                  return (0);
8201 8223  }
8202 8224  
8203 8225  static void
8204 8226  zone_incr_capped(zoneid_t zid)
8205 8227  {
8206 8228          zone_persist_t *zp = &zone_pdata[zid];
8207 8229  
8208 8230          /* See if over (unlimited is UINT32_MAX), or already marked that way. */
8209 8231          if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) {
8210 8232                  return;
8211 8233          }
8212 8234  
8213 8235          mutex_enter(&zone_physcap_lock);
8214 8236          /* Recheck setting under mutex */
8215 8237          if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) {
8216 8238                  zp->zpers_over = 1;
8217 8239                  zp->zpers_nover++;
8218 8240                  zone_num_over_cap++;
8219 8241                  DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid);
8220 8242          }
8221 8243          mutex_exit(&zone_physcap_lock);
8222 8244  }
8223 8245  
8224 8246  /*
8225 8247   * We want some hysteresis when the zone is going under its cap so that we're
8226 8248   * not continuously toggling page scanning back and forth by a single page
8227 8249   * around the cap. Using ~1% of the zone's page limit seems to be a good
8228 8250   * quantity. This table shows some various zone memory caps and the number of
8229 8251   * pages (assuming a 4k page size). Given this, we choose to shift the page
8230 8252   * limit by 7 places to get a hysteresis that is slightly less than 1%.
8231 8253   *
8232 8254   *   cap    pages     pages     1% shift7  shift7
8233 8255   *  128M    32768 0x0008000    327    256 0x00100
8234 8256   *  512M   131072 0x0020000   1310   1024 0x00400
8235 8257   *    1G   262144 0x0040000   2621   2048 0x00800
8236 8258   *    4G  1048576 0x0100000  10485   8192 0x02000
8237 8259   *    8G  2097152 0x0200000  20971  16384 0x04000
8238 8260   *   16G  4194304 0x0400000  41943  32768 0x08000
8239 8261   *   32G  8388608 0x0800000  83886  65536 0x10000
8240 8262   *   64G 16777216 0x1000000 167772 131072 0x20000
8241 8263   */
8242 8264  static void
8243 8265  zone_decr_capped(zoneid_t zid)
8244 8266  {
8245 8267          zone_persist_t *zp = &zone_pdata[zid];
8246 8268          uint32_t adjusted_limit;
8247 8269  
8248 8270          /*
8249 8271           * See if under, or already marked that way. There is no need to
8250 8272           * check for an unlimited cap (zpers_pg_limit == UINT32_MAX)
8251 8273           * since we'll never set zpers_over in zone_incr_capped().
8252 8274           */
8253 8275          if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) {
8254 8276                  return;
8255 8277          }
8256 8278  
8257 8279          adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7);
8258 8280  
8259 8281          /* Recheck, accounting for our hysteresis. */
8260 8282          if (zp->zpers_pg_cnt >= adjusted_limit) {
8261 8283                  return;
8262 8284          }
8263 8285  
8264 8286          mutex_enter(&zone_physcap_lock);
8265 8287          /* Recheck under mutex. */
8266 8288          if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) {
8267 8289                  zp->zpers_over = 0;
8268 8290                  ASSERT(zone_num_over_cap > 0);
8269 8291                  zone_num_over_cap--;
8270 8292                  DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid);
8271 8293          }
8272 8294          mutex_exit(&zone_physcap_lock);
8273 8295  }
8274 8296  
8275 8297  /*
8276 8298   * For zone_add_page() and zone_rm_page(), access to the page we're touching is
8277 8299   * controlled by our caller's locking.
8278 8300   * On x86 our callers already did: ASSERT(x86_hm_held(pp))
8279 8301   * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp))
8280 8302   */
8281 8303  void
8282 8304  zone_add_page(page_t *pp)
8283 8305  {
8284 8306          uint_t pcnt;
8285 8307          zone_persist_t *zp;
8286 8308          zoneid_t zid;
8287 8309  
8288 8310          /* Skip pages in segkmem, etc. (KV_KVP, ...) */
8289 8311          if (PP_ISKAS(pp))
8290 8312                  return;
8291 8313  
8292 8314          ASSERT(!PP_ISFREE(pp));
8293 8315  
8294 8316          zid = curzone->zone_id;
8295 8317          if (pp->p_zoneid == zid) {
8296 8318                  /* Another mapping to this page for this zone, do nothing */
8297 8319                  return;
8298 8320          }
8299 8321  
8300 8322          if (pp->p_szc == 0) {
8301 8323                  pcnt = 1;
8302 8324          } else {
8303 8325                  /* large page */
8304 8326                  pcnt = page_get_pagecnt(pp->p_szc);
8305 8327          }
8306 8328  
8307 8329          if (pp->p_share == 0) {
8308 8330                  /* First mapping to this page. */
8309 8331                  pp->p_zoneid = zid;
8310 8332                  zp = &zone_pdata[zid];
8311 8333                  ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX);
8312 8334                  atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt);
8313 8335                  zone_incr_capped(zid);
8314 8336                  return;
8315 8337          }
8316 8338  
8317 8339          if (pp->p_zoneid != ALL_ZONES) {
8318 8340                  /*
8319 8341                   * The page is now being shared across a different zone.
8320 8342                   * Decrement the original zone's usage.
8321 8343                   */
8322 8344                  zid = pp->p_zoneid;
8323 8345                  pp->p_zoneid = ALL_ZONES;
8324 8346                  ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8325 8347                  zp = &zone_pdata[zid];
8326 8348  
8327 8349                  if (zp->zpers_pg_cnt > 0) {
8328 8350                          atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
8329 8351                  }
8330 8352                  zone_decr_capped(zid);
8331 8353          }
8332 8354  }
8333 8355  
8334 8356  void
8335 8357  zone_rm_page(page_t *pp)
8336 8358  {
8337 8359          uint_t pcnt;
8338 8360          zone_persist_t *zp;
8339 8361          zoneid_t zid;
8340 8362  
8341 8363          /* Skip pages in segkmem, etc. (KV_KVP, ...) */
8342 8364          if (PP_ISKAS(pp))
8343 8365                  return;
8344 8366  
8345 8367          zid = pp->p_zoneid;
8346 8368          if (zid == ALL_ZONES || pp->p_share != 0)
8347 8369                  return;
8348 8370  
8349 8371          /* This is the last mapping to the page for a zone. */
8350 8372          if (pp->p_szc == 0) {
8351 8373                  pcnt = 1;
8352 8374          } else {
8353 8375                  /* large page */
8354 8376                  pcnt = (int64_t)page_get_pagecnt(pp->p_szc);
8355 8377          }
8356 8378  
8357 8379          ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8358 8380          zp = &zone_pdata[zid];
8359 8381          if (zp->zpers_pg_cnt > 0) {
8360 8382                  atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt);
8361 8383          }
8362 8384          zone_decr_capped(zid);
8363 8385          pp->p_zoneid = ALL_ZONES;
8364 8386  }
8365 8387  
8366 8388  void
8367 8389  zone_pageout_stat(int zid, zone_pageout_op_t op)
8368 8390  {
8369 8391          zone_persist_t *zp;
8370 8392  
8371 8393          if (zid == ALL_ZONES)
8372 8394                  return;
8373 8395  
8374 8396          ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8375 8397          zp = &zone_pdata[zid];
8376 8398  
8377 8399  #ifndef DEBUG
8378 8400          atomic_add_64(&zp->zpers_pg_out, 1);
8379 8401  #else
8380 8402          switch (op) {
8381 8403          case ZPO_DIRTY:
8382 8404                  atomic_add_64(&zp->zpers_pg_fsdirty, 1);
8383 8405                  break;
8384 8406          case ZPO_FS:
8385 8407                  atomic_add_64(&zp->zpers_pg_fs, 1);
8386 8408                  break;
8387 8409          case ZPO_ANON:
8388 8410                  atomic_add_64(&zp->zpers_pg_anon, 1);
8389 8411                  break;
8390 8412          case ZPO_ANONDIRTY:
8391 8413                  atomic_add_64(&zp->zpers_pg_anondirty, 1);
8392 8414                  break;
8393 8415          default:
8394 8416                  cmn_err(CE_PANIC, "Invalid pageout operator %d", op);
8395 8417                  break;
8396 8418          }
8397 8419  #endif
8398 8420  }
8399 8421  
8400 8422  /*
8401 8423   * Return the zone's physical memory cap and current free memory (in pages).
8402 8424   */
8403 8425  void
8404 8426  zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free)
8405 8427  {
8406 8428          zone_persist_t *zp;
8407 8429  
8408 8430          ASSERT(zid >= 0 && zid <= MAX_ZONEID);
8409 8431          zp = &zone_pdata[zid];
8410 8432  
8411 8433          /*
8412 8434           * If memory or swap limits are set on the zone, use those, otherwise
8413 8435           * use the system values. physmem and freemem are also in pages.
8414 8436           */
8415 8437          if (zp->zpers_pg_limit == UINT32_MAX) {
8416 8438                  *memcap = physmem;
8417 8439                  *free = freemem;
8418 8440          } else {
8419 8441                  int64_t freemem;
8420 8442  
8421 8443                  *memcap = (pgcnt_t)zp->zpers_pg_limit;
8422 8444                  freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt;
8423 8445                  if (freemem > 0) {
8424 8446                          *free = (pgcnt_t)freemem;
8425 8447                  } else {
8426 8448                          *free = (pgcnt_t)0;
8427 8449                  }
8428 8450          }
8429 8451  }

↓ open down ↓

2372 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX