Remove most KEBE comments and accompanying unused code or variables/fields.
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Mismerge in zone.c frees stats that aren't there.
Mismerge in zone.c creates zone mcap kstats too many times
OS-338 Kstat counters to show "slow" VFS operations
OS-5189 lx dev enumeration can deadlock with zfs
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5187 improve /proc/diskstat handling
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5179 flatten zvol entries for /dev and /proc/partitions
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Undo merge damage from zone kstats
OS-4915 want FX high priority zone configuration option
OS-4925 ps pri shows misleading value for zone in RT scheduling class
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4017 would like zfs-io-priority values > 1024
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3776 project rctls should be in sync with zone rctls
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
OS-11 rcapd behaves poorly when under extreme load
OS-399 zone phys. mem. cap should be a rctl and have associated kstat


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent Inc. All rights reserved.
  25  */
  26 
  27 /*
  28  * Zones
  29  *
  30  *   A zone is a named collection of processes, namespace constraints,
  31  *   and other system resources which comprise a secure and manageable
  32  *   application containment facility.
  33  *
  34  *   Zones (represented by the reference counted zone_t) are tracked in
  35  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36  *   (zoneid_t) are used to track zone association.  Zone IDs are
  37  *   dynamically generated when the zone is created; if a persistent
  38  *   identifier is needed (core files, accounting logs, audit trail,
  39  *   etc.), the zone name should be used.
  40  *
  41  *
  42  *   Global Zone:
  43  *
  44  *   The global zone (zoneid 0) is automatically associated with all


 233 #include <sys/klpd.h>
 234 
 235 #include <sys/door.h>
 236 #include <sys/cpuvar.h>
 237 #include <sys/sdt.h>
 238 
 239 #include <sys/uadmin.h>
 240 #include <sys/session.h>
 241 #include <sys/cmn_err.h>
 242 #include <sys/modhash.h>
 243 #include <sys/sunddi.h>
 244 #include <sys/nvpair.h>
 245 #include <sys/rctl.h>
 246 #include <sys/fss.h>
 247 #include <sys/brand.h>
 248 #include <sys/zone.h>
 249 #include <net/if.h>
 250 #include <sys/cpucaps.h>
 251 #include <vm/seg.h>
 252 #include <sys/mac.h>


 253 
 254 /*
 255  * This constant specifies the number of seconds that threads waiting for
 256  * subsystems to release a zone's general-purpose references will wait before
 257  * they log the zone's reference counts.  The constant's value shouldn't
 258  * be so small that reference counts are unnecessarily reported for zones
 259  * whose references are slowly released.  On the other hand, it shouldn't be so
 260  * large that users reboot their systems out of frustration over hung zones
 261  * before the system logs the zones' reference counts.
 262  */
 263 #define ZONE_DESTROY_TIMEOUT_SECS       60
 264 
 265 /* List of data link IDs which are accessible from the zone */
 266 typedef struct zone_dl {
 267         datalink_id_t   zdl_id;
 268         nvlist_t        *zdl_net;
 269         list_node_t     zdl_linkage;
 270 } zone_dl_t;
 271 
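A hedged sketch of how such a zone_dl_t list is typically walked, using the list(9F) idiom that appears throughout this file; the mymod_* name is hypothetical, and the real consumers hold zone_lock across the traversal.

    /*
     * Hedged sketch (mymod_* is hypothetical): count the datalinks
     * assigned to a zone by walking zone_dl_list with list(9F),
     * holding zone_lock as the real consumers in this file do.
     */
    static int
    mymod_count_zone_links(zone_t *zone)
    {
            zone_dl_t *zdl;
            int cnt = 0;

            mutex_enter(&zone->zone_lock);
            for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
                zdl = list_next(&zone->zone_dl_list, zdl))
                    cnt++;
            mutex_exit(&zone->zone_lock);
            return (cnt);
    }
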
 272 /*


 353 /*
 354  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 355  * (see sys/zone.h).
 356  */
 357 static char *zone_ref_subsys_names[] = {
 358         "NFS",          /* ZONE_REF_NFS */
 359         "NFSv4",        /* ZONE_REF_NFSV4 */
 360         "SMBFS",        /* ZONE_REF_SMBFS */
 361         "MNTFS",        /* ZONE_REF_MNTFS */
 362         "LOFI",         /* ZONE_REF_LOFI */
 363         "VFS",          /* ZONE_REF_VFS */
 364         "IPC"           /* ZONE_REF_IPC */
 365 };
 366 
 367 /*
 368  * This isn't static so lint doesn't complain.
 369  */
 370 rctl_hndl_t rc_zone_cpu_shares;
 371 rctl_hndl_t rc_zone_locked_mem;
 372 rctl_hndl_t rc_zone_max_swap;

 373 rctl_hndl_t rc_zone_max_lofi;
 374 rctl_hndl_t rc_zone_cpu_cap;
 375 rctl_hndl_t rc_zone_zfs_io_pri;
 376 rctl_hndl_t rc_zone_nlwps;
 377 rctl_hndl_t rc_zone_nprocs;
 378 rctl_hndl_t rc_zone_shmmax;
 379 rctl_hndl_t rc_zone_shmmni;
 380 rctl_hndl_t rc_zone_semmni;
 381 rctl_hndl_t rc_zone_msgmni;
 382 
 383 const char * const zone_default_initname = "/sbin/init";
 384 static char * const zone_prefix = "/zone/";
 385 static int zone_shutdown(zoneid_t zoneid);
 386 static int zone_add_datalink(zoneid_t, datalink_id_t);
 387 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 388 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 389 static int zone_set_network(zoneid_t, zone_net_data_t *);
 390 static int zone_get_network(zoneid_t, zone_net_data_t *);
 391 
 392 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);


1725 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1726     rctl_qty_t nv)
1727 {
1728         ASSERT(MUTEX_HELD(&p->p_lock));
1729         ASSERT(e->rcep_t == RCENTITY_ZONE);
1730         if (e->rcep_p.zone == NULL)
1731                 return (0);
1732         e->rcep_p.zone->zone_max_swap_ctl = nv;
1733         return (0);
1734 }
1735 
1736 static rctl_ops_t zone_max_swap_ops = {
1737         rcop_no_action,
1738         zone_max_swap_usage,
1739         zone_max_swap_set,
1740         zone_max_swap_test
1741 };
1742 
1743 /*ARGSUSED*/
1744 static rctl_qty_t

































1745 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1746 {
1747         rctl_qty_t q;
1748         zone_t *z = p->p_zone;
1749 
1750         ASSERT(MUTEX_HELD(&p->p_lock));
1751         mutex_enter(&z->zone_rctl_lock);
1752         q = z->zone_max_lofi;
1753         mutex_exit(&z->zone_rctl_lock);
1754         return (q);
1755 }
1756 
1757 /*ARGSUSED*/
1758 static int
1759 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1760     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1761 {
1762         rctl_qty_t q;
1763         zone_t *z;
1764 


1818         crhold(cr);
1819         zone_rele(zone);
1820         return (cr);
1821 }
1822 
1823 static int
1824 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1825 {
1826         zone_t *zone = ksp->ks_private;
1827         zone_kstat_t *zk = ksp->ks_data;
1828 
1829         if (rw == KSTAT_WRITE)
1830                 return (EACCES);
1831 
1832         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1833         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1834         return (0);
1835 }
1836 
1837 static int














1838 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1839 {
1840         zone_t *zone = ksp->ks_private;
1841         zone_kstat_t *zk = ksp->ks_data;
1842 
1843         if (rw == KSTAT_WRITE)
1844                 return (EACCES);
1845 
1846         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1847         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1848         return (0);
1849 }
1850 
1851 static int
1852 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1853 {
1854         zone_t *zone = ksp->ks_private;
1855         zone_kstat_t *zk = ksp->ks_data;
1856 
1857         if (rw == KSTAT_WRITE)


1871 
1872         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1873             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1874             KSTAT_FLAG_VIRTUAL);
1875 
1876         if (ksp == NULL)
1877                 return (NULL);
1878 
1879         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1880         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1881         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1882         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1883         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1884         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1885         ksp->ks_update = updatefunc;
1886         ksp->ks_private = zone;
1887         kstat_install(ksp);
1888         return (ksp);
1889 }
1890 






1891 














































































1892 static int






































































1893 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1894 {
1895         zone_t *zone = ksp->ks_private;
1896         zone_mcap_kstat_t *zmp = ksp->ks_data;
1897 
1898         if (rw == KSTAT_WRITE)
1899                 return (EACCES);
1900 






1901         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1902         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1903         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1904         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1905         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;


1906 
1907         return (0);
1908 }
1909 
1910 static kstat_t *
1911 zone_mcap_kstat_create(zone_t *zone)
1912 {
1913         kstat_t *ksp;
1914         zone_mcap_kstat_t *zmp;
1915 
1916         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
1917             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
1918             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
1919             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1920                 return (NULL);
1921 
1922         if (zone->zone_id != GLOBAL_ZONEID)
1923                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1924 
1925         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1926         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1927         ksp->ks_lock = &zone->zone_mcap_lock;
1928         zone->zone_mcap_stats = zmp;
1929 
1930         /* The kstat "name" field is not large enough for a full zonename */
1931         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1932         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);







1933         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1934         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1935         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1936         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1937         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1938             KSTAT_DATA_UINT64);




1939 
1940         ksp->ks_update = zone_mcap_kstat_update;
1941         ksp->ks_private = zone;
1942 
1943         kstat_install(ksp);
1944         return (ksp);
1945 }
1946 
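The memory_cap kstat created above is published with module "memory_cap", the zone ID as the instance, and the (possibly truncated) zone name as the kstat name, which is why a separate "zonename" string statistic is also kept. A minimal userland sketch for reading one of these counters with libkstat(3LIB), assuming a hypothetical zone called "myzone":

    /*
     * Minimal userland sketch: read one memory_cap counter with
     * libkstat (link with -lkstat).  The zone name "myzone" is
     * hypothetical; module, instance and statistic names follow the
     * kstat_create_zone() and kstat_named_init() calls above.
     */
    #include <kstat.h>
    #include <zone.h>
    #include <stdio.h>

    int
    main(void)
    {
            kstat_ctl_t *kc;
            kstat_t *ksp;
            kstat_named_t *kn;
            zoneid_t zid = getzoneidbyname("myzone");

            if (zid == -1 || (kc = kstat_open()) == NULL) {
                    perror("setup");
                    return (1);
            }
            /* instance is the zone ID; the kstat name may be truncated */
            ksp = kstat_lookup(kc, "memory_cap", (int)zid, NULL);
            if (ksp == NULL || kstat_read(kc, ksp, NULL) == -1) {
                    perror("memory_cap kstat");
                    (void) kstat_close(kc);
                    return (1);
            }
            if ((kn = kstat_data_lookup(ksp, "anonpgin")) != NULL)
                    (void) printf("anonpgin = %llu\n",
                        (unsigned long long)kn->value.ui64);
            (void) kstat_close(kc);
            return (0);
    }
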
1947 static int
1948 zone_misc_kstat_update(kstat_t *ksp, int rw)
1949 {
1950         zone_t *zone = ksp->ks_private;
1951         zone_misc_kstat_t *zmp = ksp->ks_data;
1952         hrtime_t tmp;
1953 
1954         if (rw == KSTAT_WRITE)
1955                 return (EACCES);
1956 
1957         tmp = zone->zone_utime;
1958         scalehrtime(&tmp);


2018         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2019         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2020             KSTAT_DATA_UINT32);
2021         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2022         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2023 
2024         ksp->ks_update = zone_misc_kstat_update;
2025         ksp->ks_private = zone;
2026 
2027         kstat_install(ksp);
2028         return (ksp);
2029 }
2030 
2031 static void
2032 zone_kstat_create(zone_t *zone)
2033 {
2034         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2035             "lockedmem", zone_lockedmem_kstat_update);
2036         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2037             "swapresv", zone_swapresv_kstat_update);


2038         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2039             "nprocs", zone_nprocs_kstat_update);
2040 





2041         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2042                 zone->zone_mcap_stats = kmem_zalloc(
2043                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2044         }
2045 
2046         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2047                 zone->zone_misc_stats = kmem_zalloc(
2048                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2049         }

2050 }
2051 
2052 static void
2053 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2054 {
2055         void *data;
2056 
2057         if (*pkstat != NULL) {
2058                 data = (*pkstat)->ks_data;
2059                 kstat_delete(*pkstat);
2060                 kmem_free(data, datasz);
2061                 *pkstat = NULL;
2062         }
2063 }
2064 
2065 static void
2066 zone_kstat_delete(zone_t *zone)
2067 {
2068         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2069             sizeof (zone_kstat_t));
2070         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2071             sizeof (zone_kstat_t));


2072         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2073             sizeof (zone_kstat_t));



2074         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2075             sizeof (zone_mcap_kstat_t));
2076         zone_kstat_delete_common(&zone->zone_misc_ksp,
2077             sizeof (zone_misc_kstat_t));

2078 }
2079 
2080 /*
2081  * Called very early on in boot to initialize the ZSD list so that
2082  * zone_key_create() can be called before zone_init().  It also initializes
2083  * portions of zone0 which may be used before zone_init() is called.  The
2084  * variable "global_zone" will be set when zone0 is fully initialized by
2085  * zone_init().
2086  */
2087 void
2088 zone_zsd_init(void)
2089 {
2090         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2091         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2092         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2093             offsetof(struct zsd_entry, zsd_linkage));
2094         list_create(&zone_active, sizeof (zone_t),
2095             offsetof(zone_t, zone_linkage));
2096         list_create(&zone_deathrow, sizeof (zone_t),
2097             offsetof(zone_t, zone_linkage));
2098 
2099         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2100         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2101         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2102         zone0.zone_shares = 1;
2103         zone0.zone_nlwps = 0;
2104         zone0.zone_nlwps_ctl = INT_MAX;
2105         zone0.zone_nprocs = 0;
2106         zone0.zone_nprocs_ctl = INT_MAX;
2107         zone0.zone_locked_mem = 0;
2108         zone0.zone_locked_mem_ctl = UINT64_MAX;
2109         ASSERT(zone0.zone_max_swap == 0);
2110         zone0.zone_max_swap_ctl = UINT64_MAX;


2111         zone0.zone_max_lofi = 0;
2112         zone0.zone_max_lofi_ctl = UINT64_MAX;
2113         zone0.zone_shmmax = 0;
2114         zone0.zone_ipc.ipcq_shmmni = 0;
2115         zone0.zone_ipc.ipcq_semmni = 0;
2116         zone0.zone_ipc.ipcq_msgmni = 0;
2117         zone0.zone_name = GLOBAL_ZONENAME;
2118         zone0.zone_nodename = utsname.nodename;
2119         zone0.zone_domain = srpc_domain;
2120         zone0.zone_hostid = HW_INVALID_HOSTID;
2121         zone0.zone_fs_allowed = NULL;
2122         zone0.zone_ref = 1;
2123         zone0.zone_id = GLOBAL_ZONEID;
2124         zone0.zone_status = ZONE_IS_RUNNING;
2125         zone0.zone_rootpath = "/";
2126         zone0.zone_rootpathlen = 2;
2127         zone0.zone_psetid = ZONE_PS_INVAL;
2128         zone0.zone_ncpus = 0;
2129         zone0.zone_ncpus_online = 0;
2130         zone0.zone_proc_initpid = 1;
2131         zone0.zone_initname = initname;
2132         zone0.zone_lockedmem_kstat = NULL;
2133         zone0.zone_swapresv_kstat = NULL;

2134         zone0.zone_nprocs_kstat = NULL;
2135         zone0.zone_zfs_io_pri = 1;
2136 
2137         zone0.zone_stime = 0;
2138         zone0.zone_utime = 0;
2139         zone0.zone_wtime = 0;
2140 
2141         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2142             offsetof(zone_ref_t, zref_linkage));
2143         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2144             offsetof(struct zsd_entry, zsd_linkage));
2145         list_insert_head(&zone_active, &zone0);
2146 
2147         /*
2148          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2149          * to anything meaningful.  It is assigned to be 'rootdir' in
2150          * vfs_mountroot().
2151          */
2152         zone0.zone_rootvp = NULL;
2153         zone0.zone_vfslist = NULL;

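zone_zsd_init() above prepares the zone-specific data (ZSD) lists precisely so that subsystems may call zone_key_create() before zone_init() runs. A hedged sketch of the ZSD pattern a kernel subsystem would use against this machinery; the mymod_* names are hypothetical.

    /*
     * Hedged sketch of the ZSD pattern this initialization enables
     * (mymod_* is hypothetical): a subsystem registers per-zone state
     * with zone_key_create() and fetches it with zone_getspecific().
     */
    #include <sys/types.h>
    #include <sys/zone.h>
    #include <sys/kmem.h>

    typedef struct mymod_zone_state {
            uint64_t        mz_counter;
    } mymod_zone_state_t;

    static zone_key_t mymod_zone_key;

    static void *
    mymod_zone_init(zoneid_t zoneid)
    {
            return (kmem_zalloc(sizeof (mymod_zone_state_t), KM_SLEEP));
    }

    static void
    mymod_zone_fini(zoneid_t zoneid, void *data)
    {
            kmem_free(data, sizeof (mymod_zone_state_t));
    }

    void
    mymod_init(void)
    {
            /* The create callback runs for existing and future zones. */
            zone_key_create(&mymod_zone_key, mymod_zone_init, NULL,
                mymod_zone_fini);
    }

    void
    mymod_bump(zone_t *zone)
    {
            mymod_zone_state_t *mz = zone_getspecific(mymod_zone_key, zone);

            mz->mz_counter++;       /* serialization omitted for brevity */
    }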

2230          */
2231         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2232 
2233         /*
2234          * Initialize generic zone resource controls, if any.
2235          */
2236         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2237             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2238             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2239             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2240 
2241         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2242             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2243             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2244             RCTL_GLOBAL_INFINITE,
2245             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2246 
2247         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2248             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2249             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2250             1024, 1024, &zone_zfs_io_pri_ops);
2251 
2252         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2253             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2254             INT_MAX, INT_MAX, &zone_lwps_ops);
2255 
2256         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2257             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2258             INT_MAX, INT_MAX, &zone_procs_ops);
2259 
2260         /*
2261          * System V IPC resource controls
2262          */
2263         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2264             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2265             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2266 
2267         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2268             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2269             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2270 

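The zone rctls registered above (zone.cpu-shares, zone.cpu-cap, zone.zfs-io-priority, zone.max-lwps, zone.max-processes, and the System V IPC limits) are visible to userland through the standard rctl syscall interface. A minimal sketch using getrctl(2) and the rctlblk(3C) accessors; it reports only the rctl list of the calling process's own zone.

    /*
     * Minimal userland sketch: inspect the first value on a zone
     * rctl's value list with getrctl(2) and the rctlblk(3C)
     * accessors.  RCTL_NEXT walks the remaining values.
     */
    #include <rctl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
            rctlblk_t *rblk = malloc(rctlblk_size());

            if (rblk == NULL)
                    return (1);
            if (getrctl("zone.cpu-shares", NULL, rblk, RCTL_FIRST) == -1) {
                    perror("getrctl");
                    free(rblk);
                    return (1);
            }
            (void) printf("zone.cpu-shares: %llu (privilege %d)\n",
                (unsigned long long)rctlblk_get_value(rblk),
                (int)rctlblk_get_privilege(rblk));
            free(rblk);
            return (0);
    }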

2283         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2284         bzero(dval, sizeof (rctl_val_t));
2285         dval->rcv_value = 1;
2286         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2287         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2288         dval->rcv_action_recip_pid = -1;
2289 
2290         rde = rctl_dict_lookup("zone.cpu-shares");
2291         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2292 
2293         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2294             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2295             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2296             &zone_locked_mem_ops);
2297 
2298         rc_zone_max_swap = rctl_register("zone.max-swap",
2299             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2300             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2301             &zone_max_swap_ops);
2302 





2303         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2304             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2305             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2306             &zone_max_lofi_ops);
2307 
2308         /*
2309          * Initialize the ``global zone''.
2310          */
2311         set = rctl_set_create();
2312         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2313         mutex_enter(&p0.p_lock);
2314         e.rcep_p.zone = &zone0;
2315         e.rcep_t = RCENTITY_ZONE;
2316         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2317             gp);
2318 
2319         zone0.zone_nlwps = p0.p_lwpcnt;
2320         zone0.zone_nprocs = 1;
2321         zone0.zone_ntasks = 1;
2322         mutex_exit(&p0.p_lock);
2323         zone0.zone_restart_init = B_TRUE;


2324         zone0.zone_brand = &native_brand;
2325         rctl_prealloc_destroy(gp);
2326         /*
2327          * pool_default hasn't been initialized yet, so we let pool_init()
2328          * take care of making sure the global zone is in the default pool.
2329          */
2330 
2331         /*
2332          * Initialize global zone kstats
2333          */
2334         zone_kstat_create(&zone0);
2335 
2336         /*
2337          * Initialize zone label.
2338          * mlp are initialized when tnzonecfg is loaded.
2339          */
2340         zone0.zone_slabel = l_admin_low;
2341         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2342         label_hold(l_admin_low);
2343 


2383         /*
2384          * The global zone is fully initialized (except for zone_rootvp which
2385          * will be set when the root filesystem is mounted).
2386          */
2387         global_zone = &zone0;
2388 
2389         /*
2390          * Setup an event channel to send zone status change notifications on
2391          */
2392         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2393             EVCH_CREAT);
2394 
2395         if (res)
2396                 panic("Sysevent_evc_bind failed during zone setup.\n");
2397 
2398 }
2399 
2400 static void
2401 zone_free(zone_t *zone)
2402 {


2403         ASSERT(zone != global_zone);
2404         ASSERT(zone->zone_ntasks == 0);
2405         ASSERT(zone->zone_nlwps == 0);
2406         ASSERT(zone->zone_nprocs == 0);
2407         ASSERT(zone->zone_cred_ref == 0);
2408         ASSERT(zone->zone_kcred == NULL);
2409         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2410             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2411         ASSERT(list_is_empty(&zone->zone_ref_list));
2412 
2413         /*
2414          * Remove any zone caps.
2415          */
2416         cpucaps_zone_remove(zone);
2417 
2418         ASSERT(zone->zone_cpucap == NULL);
2419 
2420         /* remove from deathrow list */
2421         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2422                 ASSERT(zone->zone_ref == 0);
2423                 mutex_enter(&zone_deathrow_lock);
2424                 list_remove(&zone_deathrow, zone);
2425                 mutex_exit(&zone_deathrow_lock);
2426         }
2427 
2428         list_destroy(&zone->zone_ref_list);
2429         zone_free_zsd(zone);
2430         zone_free_datasets(zone);













2431         list_destroy(&zone->zone_dl_list);
2432 
2433         if (zone->zone_rootvp != NULL)
2434                 VN_RELE(zone->zone_rootvp);
2435         if (zone->zone_rootpath)
2436                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2437         if (zone->zone_name != NULL)
2438                 kmem_free(zone->zone_name, ZONENAME_MAX);
2439         if (zone->zone_slabel != NULL)
2440                 label_rele(zone->zone_slabel);
2441         if (zone->zone_nodename != NULL)
2442                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2443         if (zone->zone_domain != NULL)
2444                 kmem_free(zone->zone_domain, _SYS_NMLN);
2445         if (zone->zone_privset != NULL)
2446                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2447         if (zone->zone_rctls != NULL)
2448                 rctl_set_free(zone->zone_rctls);
2449         if (zone->zone_bootargs != NULL)
2450                 strfree(zone->zone_bootargs);


2546         kmem_free(attrp, sizeof (struct brand_attr));
2547         if (bp == NULL)
2548                 return (EINVAL);
2549 
2550         /*
2551  * This is the only place where a zone can change its brand.
2552          * We already need to hold zone_status_lock to check the zone
2553          * status, so we'll just use that lock to serialize zone
2554          * branding requests as well.
2555          */
2556         mutex_enter(&zone_status_lock);
2557 
2558         /* Re-Branding is not allowed and the zone can't be booted yet */
2559         if ((ZONE_IS_BRANDED(zone)) ||
2560             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2561                 mutex_exit(&zone_status_lock);
2562                 brand_unregister_zone(bp);
2563                 return (EINVAL);
2564         }
2565 
2566         /* set up the brand specific data */





2567         zone->zone_brand = bp;
2568         ZBROP(zone)->b_init_brand_data(zone);
2569 
2570         mutex_exit(&zone_status_lock);
2571         return (0);
2572 }
2573 
2574 static int
2575 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2576 {
2577         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2578         int err = 0;
2579 
2580         ASSERT(zone != global_zone);
2581         if ((err = copyinstr(zone_fs_allowed, buf,
2582             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2583                 goto done;
2584 
2585         if (zone->zone_fs_allowed != NULL)
2586                 strfree(zone->zone_fs_allowed);
2587 
2588         zone->zone_fs_allowed = strdup(buf);


2594 
2595 static int
2596 zone_set_initname(zone_t *zone, const char *zone_initname)
2597 {
2598         char initname[INITNAME_SZ];
2599         size_t len;
2600         int err = 0;
2601 
2602         ASSERT(zone != global_zone);
2603         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2604                 return (err);   /* EFAULT or ENAMETOOLONG */
2605 
2606         if (zone->zone_initname != NULL)
2607                 strfree(zone->zone_initname);
2608 
2609         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2610         (void) strcpy(zone->zone_initname, initname);
2611         return (0);
2612 }
2613 









2614 static int
2615 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2616 {
2617         uint64_t mcap;
2618         int err = 0;
2619 
2620         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2621                 zone->zone_phys_mcap = mcap;
2622 









2623         return (err);
2624 }
2625 
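zone_set_phys_mcap() above is reached through the ZONE_ATTR_PHYS_MCAP case of zone_setattr(2), which (as the syscall handler further down shows) is the one attribute that may also be set on a running zone and on the global zone. A hedged userland sketch of updating the cap, assuming the zone_setattr() prototype exported by <zone.h> and a cap expressed in bytes; the zone name and value are hypothetical.

    /*
     * Hedged userland sketch: update a zone's physical memory cap via
     * zone_setattr(2), the path that reaches zone_set_phys_mcap()
     * above.  "myzone" and the 1 GB value are hypothetical, the cap
     * is assumed to be in bytes, and zone-configuration privilege is
     * required.
     */
    #include <zone.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            zoneid_t zid = getzoneidbyname("myzone");
            uint64_t mcap = 1024ULL * 1024 * 1024;

            if (zid == -1) {
                    perror("getzoneidbyname");
                    return (1);
            }
            if (zone_setattr(zid, ZONE_ATTR_PHYS_MCAP, &mcap,
                sizeof (mcap)) != 0) {
                    perror("zone_setattr(ZONE_ATTR_PHYS_MCAP)");
                    return (1);
            }
            return (0);
    }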






2626 static int




























2627 zone_set_sched_class(zone_t *zone, const char *new_class)
2628 {
2629         char sched_class[PC_CLNMSZ];
2630         id_t classid;
2631         int err;
2632 
2633         ASSERT(zone != global_zone);
2634         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2635                 return (err);   /* EFAULT or ENAMETOOLONG */
2636 
2637         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2638                 return (set_errno(EINVAL));
2639         zone->zone_defaultcid = classid;
2640         ASSERT(zone->zone_defaultcid > 0 &&
2641             zone->zone_defaultcid < loaded_classes);
2642 
2643         return (0);
2644 }
2645 
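zone_set_sched_class() above resolves the class name with getcid() and rejects kernel scheduling classes. The userland counterpart of that lookup is priocntl(2) with PC_GETCID; a hedged sketch resolving the FX class (cf. OS-4915):

    /*
     * Hedged userland sketch of the class-name-to-id lookup that
     * zone_set_sched_class() performs with getcid(): priocntl(2)
     * PC_GETCID resolves a class name such as "FX" to its class id.
     */
    #include <sys/types.h>
    #include <sys/procset.h>
    #include <sys/priocntl.h>
    #include <string.h>
    #include <stdio.h>

    int
    main(void)
    {
            pcinfo_t pcinfo;

            (void) strlcpy(pcinfo.pc_clname, "FX",
                sizeof (pcinfo.pc_clname));
            if (priocntl(0, 0, PC_GETCID, &pcinfo) == -1) {
                    perror("PC_GETCID");
                    return (1);
            }
            (void) printf("FX class id: %d\n", (int)pcinfo.pc_cid);
            return (0);
    }
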
2646 /*


3756         return (0);
3757 }
3758 
3759 /*
3760  * Non-global zone version of start_init.
3761  */
3762 void
3763 zone_start_init(void)
3764 {
3765         proc_t *p = ttoproc(curthread);
3766         zone_t *z = p->p_zone;
3767 
3768         ASSERT(!INGLOBALZONE(curproc));
3769 
3770         /*
3771          * For all purposes (ZONE_ATTR_INITPID and restart_init),
3772          * storing just the pid of init is sufficient.
3773          */
3774         z->zone_proc_initpid = p->p_pid;
3775 

3776         /*










3777          * We maintain zone_boot_err so that we can return the cause of the
3778          * failure back to the caller of the zone_boot syscall.
3779          */
3780         p->p_zone->zone_boot_err = start_init_common();
3781 
3782         /*
3783          * We will prevent booting zones from becoming running zones if the
3784          * global zone is shutting down.
3785          */
3786         mutex_enter(&zone_status_lock);
3787         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
3788             ZONE_IS_SHUTTING_DOWN) {
3789                 /*
3790                  * Make sure we are still in the booting state-- we could have
3791                  * raced and already be shutting down, or even further along.
3792                  */
3793                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
3794                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3795                 }
3796                 mutex_exit(&zone_status_lock);
3797                 /* It's gone bad, dispose of the process */
3798                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3799                         mutex_enter(&p->p_lock);
3800                         ASSERT(p->p_flag & SEXITLWPS);
3801                         lwp_exit();
3802                 }
3803         } else {


3804                 if (zone_status_get(z) == ZONE_IS_BOOTING)
3805                         zone_status_set(z, ZONE_IS_RUNNING);
3806                 mutex_exit(&zone_status_lock);











































3807                 /* cause the process to return to userland. */
3808                 lwp_rtt();
3809         }
3810 }
3811 
3812 struct zsched_arg {
3813         zone_t *zone;
3814         nvlist_t *nvlist;
3815 };
3816 
3817 /*
3818  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
3819  * anything to do with scheduling, but rather with the fact that
3820  * per-zone kernel threads are parented to zsched, just like regular
3821  * kernel threads are parented to sched (p0).
3822  *
3823  * zsched is also responsible for launching init for the zone.
3824  */
3825 static void
3826 zsched(void *arg)


3828         struct zsched_arg *za = arg;
3829         proc_t *pp = curproc;
3830         proc_t *initp = proc_init;
3831         zone_t *zone = za->zone;
3832         cred_t *cr, *oldcred;
3833         rctl_set_t *set;
3834         rctl_alloc_gp_t *gp;
3835         contract_t *ct = NULL;
3836         task_t *tk, *oldtk;
3837         rctl_entity_p_t e;
3838         kproject_t *pj;
3839 
3840         nvlist_t *nvl = za->nvlist;
3841         nvpair_t *nvp = NULL;
3842 
3843         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3844         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3845         PTOU(pp)->u_argc = 0;
3846         PTOU(pp)->u_argv = NULL;
3847         PTOU(pp)->u_envp = NULL;

3848         closeall(P_FINFO(pp));
3849 
3850         /*
3851          * We are this zone's "zsched" process.  As the zone isn't generally
3852          * visible yet we don't need to grab any locks before initializing its
3853          * zone_proc pointer.
3854          */
3855         zone_hold(zone);  /* this hold is released by zone_destroy() */
3856         zone->zone_zsched = pp;
3857         mutex_enter(&pp->p_lock);
3858         pp->p_zone = zone;
3859         mutex_exit(&pp->p_lock);
3860 
3861         /*
3862          * Disassociate process from its 'parent'; parent ourselves to init
3863          * (pid 1) and change other values as needed.
3864          */
3865         sess_create();
3866 
3867         mutex_enter(&pidlock);


4270                 goto out;
4271         }
4272         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4273                 /*
4274                  * nvl may have been allocated/free'd, but the value set to
4275                  * non-NULL, so we reset it here.
4276                  */
4277                 nvl = NULL;
4278                 error = EINVAL;
4279                 goto out;
4280         }
4281         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4282                 rctl_dict_entry_t *rde;
4283                 rctl_hndl_t hndl;
4284                 nvlist_t **nvlarray;
4285                 uint_t i, nelem;
4286                 char *name;
4287 
4288                 error = EINVAL;
4289                 name = nvpair_name(nvp);
4290                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4291                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {

4292                         goto out;
4293                 }
4294                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4295                         goto out;
4296                 }
4297                 rde = rctl_dict_lookup_hndl(hndl);
4298                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4299                 ASSERT(error == 0);
4300                 for (i = 0; i < nelem; i++) {
4301                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4302                                 goto out;
4303                 }
4304                 if (rctl_invalid_value(rde, &rv)) {
4305                         error = EINVAL;
4306                         goto out;
4307                 }
4308         }
4309         error = 0;
4310         *nvlp = nvl;
4311 out:


4419         cred_t *zkcr;
4420         boolean_t insert_label_hash;
4421 
4422         if (secpolicy_zone_config(CRED()) != 0)
4423                 return (set_errno(EPERM));
4424 
4425         /* can't boot zone from within chroot environment */
4426         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4427                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4428                     extended_error));
4429 
4430         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4431         zoneid = zone->zone_id = id_alloc(zoneid_space);
4432         zone->zone_status = ZONE_IS_UNINITIALIZED;
4433         zone->zone_pool = pool_default;
4434         zone->zone_pool_mod = gethrtime();
4435         zone->zone_psetid = ZONE_PS_INVAL;
4436         zone->zone_ncpus = 0;
4437         zone->zone_ncpus_online = 0;
4438         zone->zone_restart_init = B_TRUE;


4439         zone->zone_brand = &native_brand;
4440         zone->zone_initname = NULL;
4441         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4442         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4443         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4444         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4445         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4446             offsetof(zone_ref_t, zref_linkage));
4447         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4448             offsetof(struct zsd_entry, zsd_linkage));
4449         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4450             offsetof(zone_dataset_t, zd_linkage));
4451         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4452             offsetof(zone_dl_t, zdl_linkage));
4453         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4454         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4455 
4456         if (flags & ZCF_NET_EXCL) {
4457                 zone->zone_flags |= ZF_NET_EXCL;
4458         }


4480         zone->zone_domain[0] = '\0';
4481         zone->zone_hostid = HW_INVALID_HOSTID;
4482         zone->zone_shares = 1;
4483         zone->zone_shmmax = 0;
4484         zone->zone_ipc.ipcq_shmmni = 0;
4485         zone->zone_ipc.ipcq_semmni = 0;
4486         zone->zone_ipc.ipcq_msgmni = 0;
4487         zone->zone_bootargs = NULL;
4488         zone->zone_fs_allowed = NULL;
4489         zone->zone_initname =
4490             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4491         (void) strcpy(zone->zone_initname, zone_default_initname);
4492         zone->zone_nlwps = 0;
4493         zone->zone_nlwps_ctl = INT_MAX;
4494         zone->zone_nprocs = 0;
4495         zone->zone_nprocs_ctl = INT_MAX;
4496         zone->zone_locked_mem = 0;
4497         zone->zone_locked_mem_ctl = UINT64_MAX;
4498         zone->zone_max_swap = 0;
4499         zone->zone_max_swap_ctl = UINT64_MAX;


4500         zone->zone_max_lofi = 0;
4501         zone->zone_max_lofi_ctl = UINT64_MAX;
4502         zone->zone_lockedmem_kstat = NULL;
4503         zone->zone_swapresv_kstat = NULL;

4504         zone->zone_zfs_io_pri = 1;
4505 
4506         /*
4507          * Zsched initializes the rctls.
4508          */
4509         zone->zone_rctls = NULL;
4510 
4511         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4512                 zone_free(zone);
4513                 return (zone_create_error(error, 0, extended_error));
4514         }
4515 
4516         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4517                 zone_free(zone);
4518                 return (set_errno(error));
4519         }
4520 
4521         /*
4522          * Read in the trusted system parameters:
4523          * match flag and sensitivity label.


4639         if (insert_label_hash) {
4640                 (void) mod_hash_insert(zonehashbylabel,
4641                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4642                 zone->zone_flags |= ZF_HASHED_LABEL;
4643         }
4644 
4645         /*
4646          * Insert into active list.  At this point there are no 'hold's
4647          * on the zone, but everyone else knows not to use it, so we can
4648          * continue to use it.  zsched() will do a zone_hold() if the
4649          * newproc() is successful.
4650          */
4651         list_insert_tail(&zone_active, zone);
4652         mutex_exit(&zonehash_lock);
4653 
4654         zarg.zone = zone;
4655         zarg.nvlist = rctls;
4656         /*
4657          * The process, task, and project rctls are probably wrong;
4658          * we need an interface to get the default values of all rctls,
4659          * and initialize zsched appropriately.  I'm not sure that that
4660          * makes much of a difference, though.
4661          */
4662         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4663         if (error != 0) {
4664                 /*
4665                  * We need to undo all globally visible state.
4666                  */
4667                 mutex_enter(&zonehash_lock);
4668                 list_remove(&zone_active, zone);
4669                 if (zone->zone_flags & ZF_HASHED_LABEL) {
4670                         ASSERT(zone->zone_slabel != NULL);
4671                         (void) mod_hash_destroy(zonehashbylabel,
4672                             (mod_hash_key_t)zone->zone_slabel);
4673                 }
4674                 (void) mod_hash_destroy(zonehashbyname,
4675                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
4676                 (void) mod_hash_destroy(zonehashbyid,
4677                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
4678                 ASSERT(zonecount > 1);
4679                 zonecount--;
4680                 goto errout;


5540                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5541                             NULL);
5542                         if (err != 0 && err != ENAMETOOLONG)
5543                                 error = EFAULT;
5544                 }
5545                 break;
5546         case ZONE_ATTR_BOOTARGS:
5547                 if (zone->zone_bootargs == NULL)
5548                         outstr = "";
5549                 else
5550                         outstr = zone->zone_bootargs;
5551                 size = strlen(outstr) + 1;
5552                 if (bufsize > size)
5553                         bufsize = size;
5554                 if (buf != NULL) {
5555                         err = copyoutstr(outstr, buf, bufsize, NULL);
5556                         if (err != 0 && err != ENAMETOOLONG)
5557                                 error = EFAULT;
5558                 }
5559                 break;
5560         case ZONE_ATTR_PHYS_MCAP:
5561                 size = sizeof (zone->zone_phys_mcap);
5562                 if (bufsize > size)
5563                         bufsize = size;
5564                 if (buf != NULL &&
5565                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5566                         error = EFAULT;
5567                 break;
5568         case ZONE_ATTR_SCHED_CLASS:
5569                 mutex_enter(&class_lock);
5570 
5571                 if (zone->zone_defaultcid >= loaded_classes)
5572                         outstr = "";
5573                 else
5574                         outstr = sclass[zone->zone_defaultcid].cl_name;
5575                 size = strlen(outstr) + 1;
5576                 if (bufsize > size)
5577                         bufsize = size;
5578                 if (buf != NULL) {
5579                         err = copyoutstr(outstr, buf, bufsize, NULL);
5580                         if (err != 0 && err != ENAMETOOLONG)
5581                                 error = EFAULT;
5582                 }
5583 
5584                 mutex_exit(&class_lock);
5585                 break;
5586         case ZONE_ATTR_HOSTID:
5587                 if (zone->zone_hostid != HW_INVALID_HOSTID &&


5602                 size = strlen(outstr) + 1;
5603                 if (bufsize > size)
5604                         bufsize = size;
5605                 if (buf != NULL) {
5606                         err = copyoutstr(outstr, buf, bufsize, NULL);
5607                         if (err != 0 && err != ENAMETOOLONG)
5608                                 error = EFAULT;
5609                 }
5610                 break;
5611         case ZONE_ATTR_NETWORK:
5612                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5613                 if (copyin(buf, zbuf, bufsize) != 0) {
5614                         error = EFAULT;
5615                 } else {
5616                         error = zone_get_network(zoneid, zbuf);
5617                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5618                                 error = EFAULT;
5619                 }
5620                 kmem_free(zbuf, bufsize);
5621                 break;









5622         default:
5623                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5624                         size = bufsize;
5625                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5626                 } else {
5627                         error = EINVAL;
5628                 }
5629         }
5630         zone_rele(zone);
5631 
5632         if (error)
5633                 return (set_errno(error));
5634         return ((ssize_t)size);
5635 }
5636 
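The zone_getattr(2) handler above caps the copied-out size at the attribute's length and returns the attribute's full size, so a caller can detect a too-small buffer from the return value. A minimal userland sketch reading two of the attributes handled above, assuming the zone_getattr() prototype from <zone.h> and a hypothetical zone named "myzone":

    /*
     * Minimal userland sketch: read zone attributes through
     * zone_getattr(2).  "myzone" is hypothetical; string attributes
     * arrive NUL-terminated, as the copyoutstr() calls above show.
     */
    #include <zone.h>
    #include <stdio.h>

    int
    main(void)
    {
            zoneid_t zid = getzoneidbyname("myzone");
            char sched[64], bootargs[256];

            if (zid == -1) {
                    perror("getzoneidbyname");
                    return (1);
            }
            if (zone_getattr(zid, ZONE_ATTR_SCHED_CLASS, sched,
                sizeof (sched)) < 0 ||
                zone_getattr(zid, ZONE_ATTR_BOOTARGS, bootargs,
                sizeof (bootargs)) < 0) {
                    perror("zone_getattr");
                    return (1);
            }
            (void) printf("sched class: %s\nboot args: %s\n",
                sched, bootargs);
            return (0);
    }
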
5637 /*
5638  * Systemcall entry point for zone_setattr(2).
5639  */
5640 /*ARGSUSED*/
5641 static int
5642 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5643 {
5644         zone_t *zone;
5645         zone_status_t zone_status;
5646         int err = -1;
5647         zone_net_data_t *zbuf;
5648 
5649         if (secpolicy_zone_config(CRED()) != 0)
5650                 return (set_errno(EPERM));
5651 
5652         /*
5653          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5654          * global zone.
5655          */
5656         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {

5657                 return (set_errno(EINVAL));
5658         }
5659 
5660         mutex_enter(&zonehash_lock);
5661         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5662                 mutex_exit(&zonehash_lock);
5663                 return (set_errno(EINVAL));
5664         }
5665         zone_hold(zone);
5666         mutex_exit(&zonehash_lock);
5667 
5668         /*
5669          * At present most attributes can only be set on non-running,
5670          * non-global zones.
5671          */
5672         zone_status = zone_status_get(zone);
5673         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {


5674                 err = EINVAL;
5675                 goto done;
5676         }
5677 
5678         switch (attr) {
5679         case ZONE_ATTR_INITNAME:
5680                 err = zone_set_initname(zone, (const char *)buf);
5681                 break;
5682         case ZONE_ATTR_INITNORESTART:
5683                 zone->zone_restart_init = B_FALSE;
5684                 err = 0;
5685                 break;
5686         case ZONE_ATTR_BOOTARGS:
5687                 err = zone_set_bootargs(zone, (const char *)buf);
5688                 break;
5689         case ZONE_ATTR_BRAND:
5690                 err = zone_set_brand(zone, (const char *)buf);
5691                 break;
5692         case ZONE_ATTR_FS_ALLOWED:
5693                 err = zone_set_fs_allowed(zone, (const char *)buf);
5694                 break;
5695         case ZONE_ATTR_PHYS_MCAP:
5696                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5697                 break;









5698         case ZONE_ATTR_SCHED_CLASS:
5699                 err = zone_set_sched_class(zone, (const char *)buf);
5700                 break;
5701         case ZONE_ATTR_HOSTID:
5702                 if (bufsize == sizeof (zone->zone_hostid)) {
5703                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5704                                 err = 0;
5705                         else
5706                                 err = EFAULT;
5707                 } else {
5708                         err = EINVAL;
5709                 }
5710                 break;
5711         case ZONE_ATTR_NETWORK:
5712                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5713                         err = EINVAL;
5714                         break;
5715                 }
5716                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5717                 if (copyin(buf, zbuf, bufsize) != 0) {
5718                         kmem_free(zbuf, bufsize);
5719                         err = EFAULT;
5720                         break;
5721                 }
5722                 err = zone_set_network(zoneid, zbuf);
5723                 kmem_free(zbuf, bufsize);
5724                 break;
















5725         default:
5726                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5727                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5728                 else
5729                         err = EINVAL;
5730         }
5731 
5732 done:
5733         zone_rele(zone);
5734         ASSERT(err != -1);
5735         return (err != 0 ? set_errno(err) : 0);
5736 }
5737 
5738 /*
5739  * Return zero if the process has at least one vnode mapped in to its
5740  * address space which shouldn't be allowed to change zones.
5741  *
5742  * Also return zero if the process has any shared mappings which reserve
5743  * swap.  This is because the counting for zone.max-swap does not allow swap
5744  * reservation to be shared between zones.  zone swap reservation is counted


6508         door_arg_t darg, save_arg;
6509         char *zone_name;
6510         size_t zone_namelen;
6511         zoneid_t zoneid;
6512         zone_t *zone;
6513         zone_cmd_arg_t arg;
6514         uint64_t uniqid;
6515         size_t size;
6516         int error;
6517         int retry;
6518 
6519         zone = zargp->zone;
6520         arg = zargp->arg;
6521         kmem_free(zargp, sizeof (*zargp));
6522 
6523         zone_namelen = strlen(zone->zone_name) + 1;
6524         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6525         bcopy(zone->zone_name, zone_name, zone_namelen);
6526         zoneid = zone->zone_id;
6527         uniqid = zone->zone_uniqid;

6528         /*
6529          * zoneadmd may be down, but at least we can empty out the zone.
6530          * We can ignore the return value of zone_empty() since we're called
6531          * from a kernel thread and know we won't be delivered any signals.
6532          */
6533         ASSERT(curproc == &p0);
6534         (void) zone_empty(zone);
6535         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6536         zone_rele(zone);
6537 
6538         size = sizeof (arg);
6539         darg.rbuf = (char *)&arg;
6540         darg.data_ptr = (char *)&arg;
6541         darg.rsize = size;
6542         darg.data_size = size;
6543         darg.desc_ptr = NULL;
6544         darg.desc_num = 0;
6545 
6546         save_arg = darg;
6547         /*


6748 
6749         /*
6750          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6751          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6752          * could cause assertions to fail (e.g., assertions about a zone's
6753          * state during initialization, readying, or booting) or produce races.
6754          * We'll let threads continue to initialize and ready new zones: they'll
6755          * fail to boot the new zones when they see that the global zone is
6756          * shutting down.
6757          */
6758         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6759             current_zonep = list_next(&zone_active, current_zonep)) {
6760                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6761                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6762         }
6763         mutex_exit(&zone_status_lock);
6764         mutex_exit(&zonehash_lock);
6765 }
6766 
6767 /*
6768  * Returns true if the named dataset is visible in the current zone.
6769  * The 'write' parameter is set to 1 if the dataset is also writable.
6770  */
6771 int
6772 zone_dataset_visible(const char *dataset, int *write)
6773 {
6774         static int zfstype = -1;
6775         zone_dataset_t *zd;
6776         size_t len;
6777         zone_t *zone = curproc->p_zone;
6778         const char *name = NULL;
6779         vfs_t *vfsp = NULL;
6780 
6781         if (dataset[0] == '\0')
6782                 return (0);
6783 
6784         /*
6785          * Walk the list once, looking for datasets which match exactly, or
6786          * specify a dataset underneath an exported dataset.  If found, return
6787          * true and note that it is writable.
6788          */
6789         for (zd = list_head(&zone->zone_datasets); zd != NULL;
6790             zd = list_next(&zone->zone_datasets, zd)) {
6791 
6792                 len = strlen(zd->zd_dataset);
6793                 if (strlen(dataset) >= len &&
6794                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
6795                     (dataset[len] == '\0' || dataset[len] == '/' ||
6796                     dataset[len] == '@')) {
6797                         if (write)


6825         /*
6826          * We reach here if the given dataset is not found in the zone_dataset
6827          * list. Check if this dataset was added as a filesystem (ie. "add fs")
6828          * instead of delegation. For this we search for the dataset in the
6829          * zone_vfslist of this zone. If found, return true and note that it is
6830          * not writable.
6831          */
6832 
6833         /*
6834          * Initialize zfstype if it is not initialized yet.
6835          */
6836         if (zfstype == -1) {
6837                 struct vfssw *vswp = vfs_getvfssw("zfs");
6838                 zfstype = vswp - vfssw;
6839                 vfs_unrefvfssw(vswp);
6840         }
6841 
6842         vfs_list_read_lock();
6843         vfsp = zone->zone_vfslist;
6844         do {
6845                 ASSERT(vfsp);

6846                 if (vfsp->vfs_fstype == zfstype) {
6847                         name = refstr_value(vfsp->vfs_resource);
6848 
6849                         /*
6850                          * Check if we have an exact match.
6851                          */
6852                         if (strcmp(dataset, name) == 0) {
6853                                 vfs_list_unlock();
6854                                 if (write)
6855                                         *write = 0;
6856                                 return (1);
6857                         }
6858                         /*
6859                          * We need to check if we are looking for parents of
6860                          * a dataset. These should be visible, but read-only.
6861                          */
6862                         len = strlen(dataset);
6863                         if (dataset[len - 1] == '/')
6864                                 len--;
6865 
6866                         if (len < strlen(name) &&
6867                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
6868                                 vfs_list_unlock();
6869                                 if (write)
6870                                         *write = 0;
6871                                 return (1);
6872                         }
6873                 }
6874                 vfsp = vfsp->vfs_zone_next;
6875         } while (vfsp != zone->zone_vfslist);
6876 
6877         vfs_list_unlock();
6878         return (0);
6879 }
6880 
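zone_dataset_visible() above reports visibility and writability separately: delegated datasets and their children are writable, while parents of delegated datasets and datasets mounted into the zone via "add fs" are visible but read-only. A hedged kernel-side sketch of the calling pattern; mymod_* is hypothetical.

    /*
     * Hedged sketch of a caller of zone_dataset_visible(): a
     * read-only parent of a delegated dataset shows up as visible
     * with write == 0, so the two answers are checked separately.
     */
    #include <sys/zone.h>
    #include <sys/proc.h>

    int
    mymod_zone_can_write_dataset(const char *dsname)
    {
            int writable = 0;

            if (!INGLOBALZONE(curproc) &&
                !zone_dataset_visible(dsname, &writable))
                    return (0);     /* not visible from this zone */

            return (INGLOBALZONE(curproc) || writable);
    }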












6881 /*
6882  * zone_find_by_any_path() -
6883  *
6884  * kernel-private routine similar to zone_find_by_path(), but which
6885  * effectively compares against zone paths rather than zonerootpath
6886  * (i.e., the last component of zonerootpaths, which should be "root/",
6887  * are not compared.)  This is done in order to accurately identify all
6888  * paths, whether zone-visible or not, including those which are parallel
6889  * to /root/, such as /dev/, /home/, etc...
6890  *
6891  * If the specified path does not fall under any zone path then global
6892  * zone is returned.
6893  *
6894  * The treat_abs parameter indicates whether the path should be treated as
6895  * an absolute path although it does not begin with "/".  (This supports
6896  * nfs mount syntax such as host:any/path.)
6897  *
6898  * The caller is responsible for zone_rele of the returned zone.
6899  */
6900 zone_t *




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016, Joyent Inc.
  25  */
  26 
  27 /*
  28  * Zones
  29  *
  30  *   A zone is a named collection of processes, namespace constraints,
  31  *   and other system resources which comprise a secure and manageable
  32  *   application containment facility.
  33  *
  34  *   Zones (represented by the reference counted zone_t) are tracked in
  35  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
  36  *   (zoneid_t) are used to track zone association.  Zone IDs are
  37  *   dynamically generated when the zone is created; if a persistent
  38  *   identifier is needed (core files, accounting logs, audit trail,
  39  *   etc.), the zone name should be used.
  40  *
  41  *
  42  *   Global Zone:
  43  *
  44  *   The global zone (zoneid 0) is automatically associated with all


 233 #include <sys/klpd.h>
 234 
 235 #include <sys/door.h>
 236 #include <sys/cpuvar.h>
 237 #include <sys/sdt.h>
 238 
 239 #include <sys/uadmin.h>
 240 #include <sys/session.h>
 241 #include <sys/cmn_err.h>
 242 #include <sys/modhash.h>
 243 #include <sys/sunddi.h>
 244 #include <sys/nvpair.h>
 245 #include <sys/rctl.h>
 246 #include <sys/fss.h>
 247 #include <sys/brand.h>
 248 #include <sys/zone.h>
 249 #include <net/if.h>
 250 #include <sys/cpucaps.h>
 251 #include <vm/seg.h>
 252 #include <sys/mac.h>
 253 #include <sys/rt.h>
 254 #include <sys/fx.h>
 255 
 256 /*
 257  * This constant specifies the number of seconds that threads waiting for
 258  * subsystems to release a zone's general-purpose references will wait before
 259  * they log the zone's reference counts.  The constant's value shouldn't
 260  * be so small that reference counts are unnecessarily reported for zones
 261  * whose references are slowly released.  On the other hand, it shouldn't be so
 262  * large that users reboot their systems out of frustration over hung zones
 263  * before the system logs the zones' reference counts.
 264  */
 265 #define ZONE_DESTROY_TIMEOUT_SECS       60
 266 
 267 /* List of data link IDs which are accessible from the zone */
 268 typedef struct zone_dl {
 269         datalink_id_t   zdl_id;
 270         nvlist_t        *zdl_net;
 271         list_node_t     zdl_linkage;
 272 } zone_dl_t;
 273 
 274 /*


 355 /*
 356  * This array contains the names of the subsystems listed in zone_ref_subsys_t
 357  * (see sys/zone.h).
 358  */
 359 static char *zone_ref_subsys_names[] = {
 360         "NFS",          /* ZONE_REF_NFS */
 361         "NFSv4",        /* ZONE_REF_NFSV4 */
 362         "SMBFS",        /* ZONE_REF_SMBFS */
 363         "MNTFS",        /* ZONE_REF_MNTFS */
 364         "LOFI",         /* ZONE_REF_LOFI */
 365         "VFS",          /* ZONE_REF_VFS */
 366         "IPC"           /* ZONE_REF_IPC */
 367 };
 368 
 369 /*
 370  * This isn't static so lint doesn't complain.
 371  */
 372 rctl_hndl_t rc_zone_cpu_shares;
 373 rctl_hndl_t rc_zone_locked_mem;
 374 rctl_hndl_t rc_zone_max_swap;
 375 rctl_hndl_t rc_zone_phys_mem;
 376 rctl_hndl_t rc_zone_max_lofi;
 377 rctl_hndl_t rc_zone_cpu_cap;
 378 rctl_hndl_t rc_zone_zfs_io_pri;
 379 rctl_hndl_t rc_zone_nlwps;
 380 rctl_hndl_t rc_zone_nprocs;
 381 rctl_hndl_t rc_zone_shmmax;
 382 rctl_hndl_t rc_zone_shmmni;
 383 rctl_hndl_t rc_zone_semmni;
 384 rctl_hndl_t rc_zone_msgmni;
 385 
 386 const char * const zone_default_initname = "/sbin/init";
 387 static char * const zone_prefix = "/zone/";
 388 static int zone_shutdown(zoneid_t zoneid);
 389 static int zone_add_datalink(zoneid_t, datalink_id_t);
 390 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 391 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 392 static int zone_set_network(zoneid_t, zone_net_data_t *);
 393 static int zone_get_network(zoneid_t, zone_net_data_t *);
 394 
 395 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);


1728 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1729     rctl_qty_t nv)
1730 {
1731         ASSERT(MUTEX_HELD(&p->p_lock));
1732         ASSERT(e->rcep_t == RCENTITY_ZONE);
1733         if (e->rcep_p.zone == NULL)
1734                 return (0);
1735         e->rcep_p.zone->zone_max_swap_ctl = nv;
1736         return (0);
1737 }
1738 
1739 static rctl_ops_t zone_max_swap_ops = {
1740         rcop_no_action,
1741         zone_max_swap_usage,
1742         zone_max_swap_set,
1743         zone_max_swap_test
1744 };
1745 
1746 /*ARGSUSED*/
1747 static rctl_qty_t
1748 zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1749 {
1750         rctl_qty_t q;
1751         zone_t *z = p->p_zone;
1752 
1753         ASSERT(MUTEX_HELD(&p->p_lock));
1754         /* No additional lock because not enforced in the kernel */
1755         q = z->zone_phys_mem;
1756         return (q);
1757 }
1758 
1759 /*ARGSUSED*/
1760 static int
1761 zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1762     rctl_qty_t nv)
1763 {
1764         ASSERT(MUTEX_HELD(&p->p_lock));
1765         ASSERT(e->rcep_t == RCENTITY_ZONE);
1766         if (e->rcep_p.zone == NULL)
1767                 return (0);
1768         e->rcep_p.zone->zone_phys_mem_ctl = nv;
1769         return (0);
1770 }
1771 
1772 static rctl_ops_t zone_phys_mem_ops = {
1773         rcop_no_action,
1774         zone_phys_mem_usage,
1775         zone_phys_mem_set,
1776         rcop_no_test
1777 };
1778 
1779 /*ARGSUSED*/
1780 static rctl_qty_t
1781 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1782 {
1783         rctl_qty_t q;
1784         zone_t *z = p->p_zone;
1785 
1786         ASSERT(MUTEX_HELD(&p->p_lock));
1787         mutex_enter(&z->zone_rctl_lock);
1788         q = z->zone_max_lofi;
1789         mutex_exit(&z->zone_rctl_lock);
1790         return (q);
1791 }
1792 
1793 /*ARGSUSED*/
1794 static int
1795 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1796     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1797 {
1798         rctl_qty_t q;
1799         zone_t *z;
1800 


1854         crhold(cr);
1855         zone_rele(zone);
1856         return (cr);
1857 }
1858 
1859 static int
1860 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1861 {
1862         zone_t *zone = ksp->ks_private;
1863         zone_kstat_t *zk = ksp->ks_data;
1864 
1865         if (rw == KSTAT_WRITE)
1866                 return (EACCES);
1867 
1868         zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1869         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1870         return (0);
1871 }
1872 
1873 static int
1874 zone_physmem_kstat_update(kstat_t *ksp, int rw)
1875 {
1876         zone_t *zone = ksp->ks_private;
1877         zone_kstat_t *zk = ksp->ks_data;
1878 
1879         if (rw == KSTAT_WRITE)
1880                 return (EACCES);
1881 
1882         zk->zk_usage.value.ui64 = zone->zone_phys_mem;
1883         zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
1884         return (0);
1885 }
1886 
1887 static int
1888 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1889 {
1890         zone_t *zone = ksp->ks_private;
1891         zone_kstat_t *zk = ksp->ks_data;
1892 
1893         if (rw == KSTAT_WRITE)
1894                 return (EACCES);
1895 
1896         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1897         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1898         return (0);
1899 }
1900 
1901 static int
1902 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1903 {
1904         zone_t *zone = ksp->ks_private;
1905         zone_kstat_t *zk = ksp->ks_data;
1906 
1907         if (rw == KSTAT_WRITE)


1921 
1922         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1923             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1924             KSTAT_FLAG_VIRTUAL);
1925 
1926         if (ksp == NULL)
1927                 return (NULL);
1928 
1929         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1930         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1931         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1932         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1933         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1934         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1935         ksp->ks_update = updatefunc;
1936         ksp->ks_private = zone;
1937         kstat_install(ksp);
1938         return (ksp);
1939 }
1940 
1941 static int
1942 zone_vfs_kstat_update(kstat_t *ksp, int rw)
1943 {
1944         zone_t *zone = ksp->ks_private;
1945         zone_vfs_kstat_t *zvp = ksp->ks_data;
1946         kstat_io_t *kiop = &zone->zone_vfs_rwstats;
1947 
1948         if (rw == KSTAT_WRITE)
1949                 return (EACCES);
1950 
1951         /*
1952          * Extract the VFS statistics from the kstat_io_t structure used by
1953          * kstat_runq_enter() and related functions.  Since the slow ops
1954          * counters are updated directly by the VFS layer, there's no need to
1955          * copy those statistics here.
1956          *
1957          * Note that kstat_runq_enter() and the related functions use
1958          * gethrtime_unscaled(), so scale the time here.
1959          */
1960         zvp->zv_nread.value.ui64 = kiop->nread;
1961         zvp->zv_reads.value.ui64 = kiop->reads;
1962         zvp->zv_rtime.value.ui64 = kiop->rtime;
1963         zvp->zv_rcnt.value.ui64 = kiop->rcnt;
1964         zvp->zv_rlentime.value.ui64 = kiop->rlentime;
1965         zvp->zv_nwritten.value.ui64 = kiop->nwritten;
1966         zvp->zv_writes.value.ui64 = kiop->writes;
1967         zvp->zv_wtime.value.ui64 = kiop->wtime;
1968         zvp->zv_wcnt.value.ui64 = kiop->wcnt;
1969         zvp->zv_wlentime.value.ui64 = kiop->wlentime;
1970 
1971         scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
1972         scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
1973         scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
1974         scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
1975 
1976         return (0);
1977 }
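
[Editor's aside -- not part of the webrev.]  A hedged sketch of how a producer
could feed the kstat_io_t that the update routine above reads.  The wrapper
below is purely illustrative (the real accounting sits in the VFS layer), but
the zone_vfs_lock / kstat_runq_enter() / kstat_runq_exit() pairing follows the
pattern zone_vfs_kstat_update() expects:

static ssize_t
zone_vfs_account_read(zone_t *zone, ssize_t (*readfn)(void *), void *arg)
{
	kstat_io_t *kiop = &zone->zone_vfs_rwstats;
	ssize_t n;

	mutex_enter(&zone->zone_vfs_lock);
	kstat_runq_enter(kiop);		/* starts rtime/rlentime accounting */
	mutex_exit(&zone->zone_vfs_lock);

	n = readfn(arg);		/* the actual I/O */

	mutex_enter(&zone->zone_vfs_lock);
	kstat_runq_exit(kiop);		/* uses gethrtime_unscaled() inside */
	if (n > 0) {
		kiop->nread += n;	/* bytes read */
		kiop->reads++;		/* read operations */
	}
	mutex_exit(&zone->zone_vfs_lock);

	return (n);
}
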
1978 
1979 static kstat_t *
1980 zone_vfs_kstat_create(zone_t *zone)
1981 {
1982         kstat_t *ksp;
1983         zone_vfs_kstat_t *zvp;
1984 
1985         if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
1986             zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
1987             sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
1988             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
1989                 return (NULL);
1990 
1991         if (zone->zone_id != GLOBAL_ZONEID)
1992                 kstat_zone_add(ksp, GLOBAL_ZONEID);
1993 
1994         zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
1995         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1996         ksp->ks_lock = &zone->zone_vfs_lock;
1997         zone->zone_vfs_stats = zvp;
1998 
1999         /* The kstat "name" field is not large enough for a full zonename */
2000         kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
2001         kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
2002         kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
2003         kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
2004         kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
2005         kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
2006         kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
2007         kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
2008         kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
2009         kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
2010         kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
2011         kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
2012         kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
2013         kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
2014         kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
2015         kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
2016         kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
2017         kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
2018 
2019         ksp->ks_update = zone_vfs_kstat_update;
2020         ksp->ks_private = zone;
2021 
2022         kstat_install(ksp);
2023         return (ksp);
2024 }
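
[Editor's aside -- not part of the webrev.]  The module ("zone_vfs"), instance
(zone id) and name (zone name) passed to kstat_create_zone() above are enough
to read these counters from userland with libkstat.  A hedged sketch, using
the global zone (instance 0, name "global") purely as an example; link with
-lkstat:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* Look up the per-zone "zone_vfs" kstat and print one statistic. */
	if ((ksp = kstat_lookup(kc, "zone_vfs", 0, "global")) != NULL &&
	    kstat_read(kc, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "nread")) != NULL)
		printf("nread = %llu\n", (unsigned long long)kn->value.ui64);

	(void) kstat_close(kc);
	return (0);
}
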
2025 
2026 static int
2027 zone_zfs_kstat_update(kstat_t *ksp, int rw)
2028 {
2029         zone_t *zone = ksp->ks_private;
2030         zone_zfs_kstat_t *zzp = ksp->ks_data;
2031         kstat_io_t *kiop = &zone->zone_zfs_rwstats;
2032 
2033         if (rw == KSTAT_WRITE)
2034                 return (EACCES);
2035 
2036         /*
2037          * Extract the ZFS statistics from the kstat_io_t structure used by
2038          * kstat_runq_enter() and related functions.  Since the I/O throttle
2039          * counters are updated directly by the ZFS layer, there's no need to
2040          * copy those statistics here.
2041          *
2042          * Note that kstat_runq_enter() and the related functions use
2043          * gethrtime_unscaled(), so scale the time here.
2044          */
2045         zzp->zz_nread.value.ui64 = kiop->nread;
2046         zzp->zz_reads.value.ui64 = kiop->reads;
2047         zzp->zz_rtime.value.ui64 = kiop->rtime;
2048         zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2049         zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2050         zzp->zz_writes.value.ui64 = kiop->writes;
2051 
2052         scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2053         scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2054 
2055         return (0);
2056 }
2057 
2058 static kstat_t *
2059 zone_zfs_kstat_create(zone_t *zone)
2060 {
2061         kstat_t *ksp;
2062         zone_zfs_kstat_t *zzp;
2063 
2064         if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
2065             zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
2066             sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
2067             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2068                 return (NULL);
2069 
2070         if (zone->zone_id != GLOBAL_ZONEID)
2071                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2072 
2073         zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
2074         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2075         ksp->ks_lock = &zone->zone_zfs_lock;
2076         zone->zone_zfs_stats = zzp;
2077 
2078         /* The kstat "name" field is not large enough for a full zonename */
2079         kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
2080         kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
2081         kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
2082         kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
2083         kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
2084         kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
2085         kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
2086         kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
2087         kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
2088 
2089         ksp->ks_update = zone_zfs_kstat_update;
2090         ksp->ks_private = zone;
2091 
2092         kstat_install(ksp);
2093         return (ksp);
2094 }
2095 
2096 static int
2097 zone_mcap_kstat_update(kstat_t *ksp, int rw)
2098 {
2099         zone_t *zone = ksp->ks_private;
2100         zone_mcap_kstat_t *zmp = ksp->ks_data;
2101 
2102         if (rw == KSTAT_WRITE)
2103                 return (EACCES);
2104 
2105         zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
2106         zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
2107         zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2108         zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2109         zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
2110         zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
2111         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2112         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2113         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2114         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2115         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2116         zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
2117         zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
2118 
2119         return (0);
2120 }
2121 
2122 static kstat_t *
2123 zone_mcap_kstat_create(zone_t *zone)
2124 {
2125         kstat_t *ksp;
2126         zone_mcap_kstat_t *zmp;
2127 
2128         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2129             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2130             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2131             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2132                 return (NULL);
2133 
2134         if (zone->zone_id != GLOBAL_ZONEID)
2135                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2136 
2137         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2138         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2139         ksp->ks_lock = &zone->zone_mcap_lock;
2140         zone->zone_mcap_stats = zmp;
2141 
2142         /* The kstat "name" field is not large enough for a full zonename */
2143         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2144         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2146         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2147         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2148         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2149         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2150         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2151         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2152         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2153         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2154         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2155         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2156         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2157             KSTAT_DATA_UINT64);
2158         kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2159             KSTAT_DATA_UINT64);
2160         kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2161             KSTAT_DATA_UINT64);
2162 
2163         ksp->ks_update = zone_mcap_kstat_update;
2164         ksp->ks_private = zone;
2165 
2166         kstat_install(ksp);
2167         return (ksp);
2168 }
2169 
2170 static int
2171 zone_misc_kstat_update(kstat_t *ksp, int rw)
2172 {
2173         zone_t *zone = ksp->ks_private;
2174         zone_misc_kstat_t *zmp = ksp->ks_data;
2175         hrtime_t tmp;
2176 
2177         if (rw == KSTAT_WRITE)
2178                 return (EACCES);
2179 
2180         tmp = zone->zone_utime;
2181         scalehrtime(&tmp);


2241         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2242         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2243             KSTAT_DATA_UINT32);
2244         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2245         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2246 
2247         ksp->ks_update = zone_misc_kstat_update;
2248         ksp->ks_private = zone;
2249 
2250         kstat_install(ksp);
2251         return (ksp);
2252 }
2253 
2254 static void
2255 zone_kstat_create(zone_t *zone)
2256 {
2257         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2258             "lockedmem", zone_lockedmem_kstat_update);
2259         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2260             "swapresv", zone_swapresv_kstat_update);
2261         zone->zone_physmem_kstat = zone_kstat_create_common(zone,
2262             "physicalmem", zone_physmem_kstat_update);
2263         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2264             "nprocs", zone_nprocs_kstat_update);
2265 
2266         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2267                 zone->zone_vfs_stats = kmem_zalloc(
2268                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
2269         }
2270 
2271         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2272                 zone->zone_mcap_stats = kmem_zalloc(
2273                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2274         }
2275 
2276         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2277                 zone->zone_misc_stats = kmem_zalloc(
2278                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2279         }
2280 
2281 }
2282 
2283 static void
2284 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2285 {
2286         void *data;
2287 
2288         if (*pkstat != NULL) {
2289                 data = (*pkstat)->ks_data;
2290                 kstat_delete(*pkstat);
2291                 kmem_free(data, datasz);
2292                 *pkstat = NULL;
2293         }
2294 }
2295 
2296 static void
2297 zone_kstat_delete(zone_t *zone)
2298 {
2299         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2300             sizeof (zone_kstat_t));
2301         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2302             sizeof (zone_kstat_t));
2303         zone_kstat_delete_common(&zone->zone_physmem_kstat,
2304             sizeof (zone_kstat_t));
2305         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2306             sizeof (zone_kstat_t));
2307 
2308         zone_kstat_delete_common(&zone->zone_vfs_ksp,
2309             sizeof (zone_vfs_kstat_t));
2310         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2311             sizeof (zone_mcap_kstat_t));
2312         zone_kstat_delete_common(&zone->zone_misc_ksp,
2313             sizeof (zone_misc_kstat_t));
2314 
2315 }
2316 
2317 /*
2318  * Called very early on in boot to initialize the ZSD list so that
2319  * zone_key_create() can be called before zone_init().  It also initializes
2320  * portions of zone0 which may be used before zone_init() is called.  The
2321  * variable "global_zone" will be set when zone0 is fully initialized by
2322  * zone_init().
2323  */
2324 void
2325 zone_zsd_init(void)
2326 {
2327         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2328         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2329         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2330             offsetof(struct zsd_entry, zsd_linkage));
2331         list_create(&zone_active, sizeof (zone_t),
2332             offsetof(zone_t, zone_linkage));
2333         list_create(&zone_deathrow, sizeof (zone_t),
2334             offsetof(zone_t, zone_linkage));
2335 
2336         mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2337         mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2338         mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2339         zone0.zone_shares = 1;
2340         zone0.zone_nlwps = 0;
2341         zone0.zone_nlwps_ctl = INT_MAX;
2342         zone0.zone_nprocs = 0;
2343         zone0.zone_nprocs_ctl = INT_MAX;
2344         zone0.zone_locked_mem = 0;
2345         zone0.zone_locked_mem_ctl = UINT64_MAX;
2346         ASSERT(zone0.zone_max_swap == 0);
2347         zone0.zone_max_swap_ctl = UINT64_MAX;
2348         zone0.zone_phys_mem = 0;
2349         zone0.zone_phys_mem_ctl = UINT64_MAX;
2350         zone0.zone_max_lofi = 0;
2351         zone0.zone_max_lofi_ctl = UINT64_MAX;
2352         zone0.zone_shmmax = 0;
2353         zone0.zone_ipc.ipcq_shmmni = 0;
2354         zone0.zone_ipc.ipcq_semmni = 0;
2355         zone0.zone_ipc.ipcq_msgmni = 0;
2356         zone0.zone_name = GLOBAL_ZONENAME;
2357         zone0.zone_nodename = utsname.nodename;
2358         zone0.zone_domain = srpc_domain;
2359         zone0.zone_hostid = HW_INVALID_HOSTID;
2360         zone0.zone_fs_allowed = NULL;
2361         zone0.zone_ref = 1;
2362         zone0.zone_id = GLOBAL_ZONEID;
2363         zone0.zone_status = ZONE_IS_RUNNING;
2364         zone0.zone_rootpath = "/";
2365         zone0.zone_rootpathlen = 2;
2366         zone0.zone_psetid = ZONE_PS_INVAL;
2367         zone0.zone_ncpus = 0;
2368         zone0.zone_ncpus_online = 0;
2369         zone0.zone_proc_initpid = 1;
2370         zone0.zone_initname = initname;
2371         zone0.zone_lockedmem_kstat = NULL;
2372         zone0.zone_swapresv_kstat = NULL;
2373         zone0.zone_physmem_kstat = NULL;
2374         zone0.zone_nprocs_kstat = NULL;
2375         zone0.zone_zfs_io_pri = 1;
2376 
2377         zone0.zone_stime = 0;
2378         zone0.zone_utime = 0;
2379         zone0.zone_wtime = 0;
2380 
2381         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2382             offsetof(zone_ref_t, zref_linkage));
2383         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2384             offsetof(struct zsd_entry, zsd_linkage));
2385         list_insert_head(&zone_active, &zone0);
2386 
2387         /*
2388          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2389          * to anything meaningful.  It is assigned to be 'rootdir' in
2390          * vfs_mountroot().
2391          */
2392         zone0.zone_rootvp = NULL;
2393         zone0.zone_vfslist = NULL;


2470          */
2471         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2472 
2473         /*
2474          * Initialize generic zone resource controls, if any.
2475          */
2476         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2477             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2478             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2479             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2480 
2481         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2482             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2483             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2484             RCTL_GLOBAL_INFINITE,
2485             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2486 
2487         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2488             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2489             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2490             16384, 16384, &zone_zfs_io_pri_ops);
2491 
2492         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2493             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2494             INT_MAX, INT_MAX, &zone_lwps_ops);
2495 
2496         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2497             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2498             INT_MAX, INT_MAX, &zone_procs_ops);
2499 
2500         /*
2501          * System V IPC resource controls
2502          */
2503         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2504             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2505             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2506 
2507         rc_zone_semmni = rctl_register("zone.max-sem-ids",
2508             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2509             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2510 


2523         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2524         bzero(dval, sizeof (rctl_val_t));
2525         dval->rcv_value = 1;
2526         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2527         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2528         dval->rcv_action_recip_pid = -1;
2529 
2530         rde = rctl_dict_lookup("zone.cpu-shares");
2531         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2532 
2533         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2534             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2535             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2536             &zone_locked_mem_ops);
2537 
2538         rc_zone_max_swap = rctl_register("zone.max-swap",
2539             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2540             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2541             &zone_max_swap_ops);
2542 
2543         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2544             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2545             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2546             &zone_phys_mem_ops);
2547 
2548         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2549             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2550             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2551             &zone_max_lofi_ops);
2552 
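
[Editor's aside -- not part of the webrev.]  The controls registered above
surface through the standard rctl interfaces, so a process inside a zone can
inspect them with getrctl(2) and the rctlblk accessors from libc.  A hedged
sketch that reads the first "zone.cpu-shares" value (iteration with RCTL_NEXT
is omitted for brevity):

#include <rctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	rctlblk_t *rblk = malloc(rctlblk_size());

	if (rblk == NULL)
		return (1);
	if (getrctl("zone.cpu-shares", NULL, rblk, RCTL_FIRST) == 0)
		printf("zone.cpu-shares = %llu\n",
		    (unsigned long long)rctlblk_get_value(rblk));
	free(rblk);
	return (0);
}
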
2553         /*
2554          * Initialize the ``global zone''.
2555          */
2556         set = rctl_set_create();
2557         gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2558         mutex_enter(&p0.p_lock);
2559         e.rcep_p.zone = &zone0;
2560         e.rcep_t = RCENTITY_ZONE;
2561         zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2562             gp);
2563 
2564         zone0.zone_nlwps = p0.p_lwpcnt;
2565         zone0.zone_nprocs = 1;
2566         zone0.zone_ntasks = 1;
2567         mutex_exit(&p0.p_lock);
2568         zone0.zone_restart_init = B_TRUE;
2569         zone0.zone_reboot_on_init_exit = B_FALSE;
2570         zone0.zone_init_status = -1;
2571         zone0.zone_brand = &native_brand;
2572         rctl_prealloc_destroy(gp);
2573         /*
2574          * pool_default hasn't been initialized yet, so we let pool_init()
2575          * take care of making sure the global zone is in the default pool.
2576          */
2577 
2578         /*
2579          * Initialize global zone kstats
2580          */
2581         zone_kstat_create(&zone0);
2582 
2583         /*
2584          * Initialize zone label.
2585          * mlp are initialized when tnzonecfg is loaded.
2586          */
2587         zone0.zone_slabel = l_admin_low;
2588         rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2589         label_hold(l_admin_low);
2590 


2630         /*
2631          * The global zone is fully initialized (except for zone_rootvp which
2632          * will be set when the root filesystem is mounted).
2633          */
2634         global_zone = &zone0;
2635 
2636         /*
2637          * Setup an event channel to send zone status change notifications on
2638          */
2639         res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2640             EVCH_CREAT);
2641 
2642         if (res)
2643                 panic("Sysevent_evc_bind failed during zone setup.\n");
2644 
2645 }
2646 
2647 static void
2648 zone_free(zone_t *zone)
2649 {
2650         zone_dl_t *zdl;
2651 
2652         ASSERT(zone != global_zone);
2653         ASSERT(zone->zone_ntasks == 0);
2654         ASSERT(zone->zone_nlwps == 0);
2655         ASSERT(zone->zone_nprocs == 0);
2656         ASSERT(zone->zone_cred_ref == 0);
2657         ASSERT(zone->zone_kcred == NULL);
2658         ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2659             zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2660         ASSERT(list_is_empty(&zone->zone_ref_list));
2661 
2662         /*
2663          * Remove any zone caps.
2664          */
2665         cpucaps_zone_remove(zone);
2666 
2667         ASSERT(zone->zone_cpucap == NULL);
2668 
2669         /* remove from deathrow list */
2670         if (zone_status_get(zone) == ZONE_IS_DEAD) {
2671                 ASSERT(zone->zone_ref == 0);
2672                 mutex_enter(&zone_deathrow_lock);
2673                 list_remove(&zone_deathrow, zone);
2674                 mutex_exit(&zone_deathrow_lock);
2675         }
2676 
2677         list_destroy(&zone->zone_ref_list);
2678         zone_free_zsd(zone);
2679         zone_free_datasets(zone);
2680 
2681         /*
2682          * While dlmgmtd should have removed all of these, it could have left
2683          * something behind or crashed, in which case it's not safe for us to
2684          * assume that the list is empty (list_destroy() will ASSERT that it
2685          * is).  We clean up for our userland comrades, which may have crashed
2686          * or, worse, been disabled by SMF.
2687          */
2688         while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2689                 if (zdl->zdl_net != NULL)
2690                         nvlist_free(zdl->zdl_net);
2691                 kmem_free(zdl, sizeof (zone_dl_t));
2692         }
2693         list_destroy(&zone->zone_dl_list);
2694 
2695         if (zone->zone_rootvp != NULL)
2696                 VN_RELE(zone->zone_rootvp);
2697         if (zone->zone_rootpath)
2698                 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2699         if (zone->zone_name != NULL)
2700                 kmem_free(zone->zone_name, ZONENAME_MAX);
2701         if (zone->zone_slabel != NULL)
2702                 label_rele(zone->zone_slabel);
2703         if (zone->zone_nodename != NULL)
2704                 kmem_free(zone->zone_nodename, _SYS_NMLN);
2705         if (zone->zone_domain != NULL)
2706                 kmem_free(zone->zone_domain, _SYS_NMLN);
2707         if (zone->zone_privset != NULL)
2708                 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2709         if (zone->zone_rctls != NULL)
2710                 rctl_set_free(zone->zone_rctls);
2711         if (zone->zone_bootargs != NULL)
2712                 strfree(zone->zone_bootargs);


2808         kmem_free(attrp, sizeof (struct brand_attr));
2809         if (bp == NULL)
2810                 return (EINVAL);
2811 
2812         /*
2813  * This is the only place where a zone can change its brand.
2814          * We already need to hold zone_status_lock to check the zone
2815          * status, so we'll just use that lock to serialize zone
2816          * branding requests as well.
2817          */
2818         mutex_enter(&zone_status_lock);
2819 
2820         /* Re-branding is not allowed, and branding must precede boot */
2821         if ((ZONE_IS_BRANDED(zone)) ||
2822             (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2823                 mutex_exit(&zone_status_lock);
2824                 brand_unregister_zone(bp);
2825                 return (EINVAL);
2826         }
2827 
2828         /*
2829          * Set up the brand specific data.
2830          * Note that it's possible that the hook has to drop the
2831          * zone_status_lock and reacquire it before returning, so we can't
2832          * assume the lock has been held the entire time.
2833          */
2834         zone->zone_brand = bp;
2835         ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
2836 
2837         mutex_exit(&zone_status_lock);
2838         return (0);
2839 }
2840 
2841 static int
2842 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2843 {
2844         char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2845         int err = 0;
2846 
2847         ASSERT(zone != global_zone);
2848         if ((err = copyinstr(zone_fs_allowed, buf,
2849             ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2850                 goto done;
2851 
2852         if (zone->zone_fs_allowed != NULL)
2853                 strfree(zone->zone_fs_allowed);
2854 
2855         zone->zone_fs_allowed = strdup(buf);


2861 
2862 static int
2863 zone_set_initname(zone_t *zone, const char *zone_initname)
2864 {
2865         char initname[INITNAME_SZ];
2866         size_t len;
2867         int err = 0;
2868 
2869         ASSERT(zone != global_zone);
2870         if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2871                 return (err);   /* EFAULT or ENAMETOOLONG */
2872 
2873         if (zone->zone_initname != NULL)
2874                 strfree(zone->zone_initname);
2875 
2876         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2877         (void) strcpy(zone->zone_initname, initname);
2878         return (0);
2879 }
2880 
2881 /*
2882  * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
2883  * to provide the physical memory capping kstats.  Since physical memory
2884  * capping is currently implemented in userland, that code uses the setattr
2885  * entry point to increment the kstats.  We simply increment zone_mcap_nover
2886  * each time the nover setattr is called, and add the input value to
2887  * zone_mcap_pagedout each time the pageout setattr is called.
2888  */
2889 /*ARGSUSED*/
2890 static int
2891 zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
2892 {
2893         zone->zone_mcap_nover++;

2894 
2895         return (0);
2896 }
2897 
2898 static int
2899 zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
2900 {
2901         uint64_t pageout;
2902         int err;
2903 
2904         if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
2905                 zone->zone_mcap_pagedout += pageout;
2906 
2907         return (err);
2908 }
2909 
2910 /*
2911  * The zone_set_page_fault_delay function is used to set the number of usecs
2912  * to throttle page faults.  This is normally 0 but can be set to a non-0 value
2913  * by the user-land memory capping code when the zone is over its physical
2914  * memory cap.
2915  */
2916 static int
2917 zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
2918 {
2919         uint32_t dusec;
2920         int err;
2921 
2922         if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
2923                 zone->zone_pg_flt_delay = dusec;
2924 
2925         return (err);
2926 }
2927 
2928 /*
2929  * The zone_set_rss function is used to set the zone's RSS when we do the
2930  * fast, approximate calculation in user-land.
2931  */
2932 static int
2933 zone_set_rss(zone_t *zone, const uint64_t *prss)
2934 {
2935         uint64_t rss;
2936         int err;
2937 
2938         if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
2939                 zone->zone_phys_mem = rss;
2940 
2941         return (err);
2942 }
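
[Editor's aside -- not part of the webrev.]  The hooks above (nover, pageout,
page-fault delay and RSS) are driven from userland through zone_setattr(2) by
a suitably privileged capping daemon (the kernel checks
secpolicy_zone_config()).  A hedged sketch of that caller side; the zone name
and counts are hypothetical, the RSS unit (bytes) is an assumption, and
zone_setattr()/getzoneidbyname() are taken to be the wrappers declared in
<zone.h>:

#include <zone.h>
#include <sys/zone.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	zoneid_t zid = getzoneidbyname("myzone");	/* hypothetical zone */
	uint64_t pagedout = 2048;			/* pages paged out */
	uint64_t rss = 512ULL * 1024 * 1024;		/* assumed: bytes */

	if (zid == -1)
		return (1);

	/* Added into zone_mcap_pagedout by zone_set_mcap_pageout(). */
	if (zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
	    sizeof (pagedout)) != 0)
		perror("ZONE_ATTR_PMCAP_PAGEOUT");

	/* Replaces the RSS figure reported by the memory_cap kstats. */
	if (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) != 0)
		perror("ZONE_ATTR_RSS");

	return (0);
}
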
2943 
2944 static int
2945 zone_set_sched_class(zone_t *zone, const char *new_class)
2946 {
2947         char sched_class[PC_CLNMSZ];
2948         id_t classid;
2949         int err;
2950 
2951         ASSERT(zone != global_zone);
2952         if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2953                 return (err);   /* EFAULT or ENAMETOOLONG */
2954 
2955         if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2956                 return (set_errno(EINVAL));
2957         zone->zone_defaultcid = classid;
2958         ASSERT(zone->zone_defaultcid > 0 &&
2959             zone->zone_defaultcid < loaded_classes);
2960 
2961         return (0);
2962 }
2963 
2964 /*


4074         return (0);
4075 }
4076 
4077 /*
4078  * Non-global zone version of start_init.
4079  */
4080 void
4081 zone_start_init(void)
4082 {
4083         proc_t *p = ttoproc(curthread);
4084         zone_t *z = p->p_zone;
4085 
4086         ASSERT(!INGLOBALZONE(curproc));
4087 
4088         /*
4089          * For all purposes (ZONE_ATTR_INITPID and restart_init),
4090          * storing just the pid of init is sufficient.
4091          */
4092         z->zone_proc_initpid = p->p_pid;
4093 
4094         if (z->zone_setup_app_contract == B_TRUE) {
4095                 /*
4096                  * Normally a process cannot modify its own contract, but we're
4097                  * just starting the zone's init process and its contract is
4098                  * always initialized from the sys_process_tmpl template, so
4099                  * this is the simplest way to setup init's contract to kill
4100                  * this is the simplest way to set up init's contract to kill
4101                  */
4102                 p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
4103         }
4104 
4105         /*
4106          * We maintain zone_boot_err so that we can return the cause of the
4107          * failure back to the caller of the zone_boot syscall.
4108          */
4109         p->p_zone->zone_boot_err = start_init_common();
4110 
4111         /*
4112          * We will prevent booting zones from becoming running zones if the
4113          * global zone is shutting down.
4114          */
4115         mutex_enter(&zone_status_lock);
4116         if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
4117             ZONE_IS_SHUTTING_DOWN) {
4118                 /*
4119                  * Make sure we are still in the booting state-- we could have
4120                  * raced and already be shutting down, or even further along.
4121                  */
4122                 if (zone_status_get(z) == ZONE_IS_BOOTING) {
4123                         zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
4124                 }
4125                 mutex_exit(&zone_status_lock);
4126                 /* It's gone bad, dispose of the process */
4127                 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
4128                         mutex_enter(&p->p_lock);
4129                         ASSERT(p->p_flag & SEXITLWPS);
4130                         lwp_exit();
4131                 }
4132         } else {
4133                 id_t cid = curthread->t_cid;
4134 
4135                 if (zone_status_get(z) == ZONE_IS_BOOTING)
4136                         zone_status_set(z, ZONE_IS_RUNNING);
4137                 mutex_exit(&zone_status_lock);
4138 
4139                 mutex_enter(&class_lock);
4140                 ASSERT(cid < loaded_classes);
4141                 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
4142                     z->zone_fixed_hipri) {
4143                         /*
4144                          * If the zone is using FX then by default all
4145                          * processes start at the lowest priority and stay
4146                          * there. We provide a mechanism for the zone to
4147                          * indicate that it should run at "high priority". In
4148                          * this case we set up init to run at the highest FX
4149                          * priority (which is one level higher than the
4150                          * non-fixed scheduling classes can use).
4151                          */
4152                         pcparms_t pcparms;
4153 
4154                         pcparms.pc_cid = cid;
4155                         ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
4156                         ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
4157                             FXMAXUPRI;
4158                         ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
4159                             FX_DOUPRILIM | FX_DOUPRI;
4160 
4161                         mutex_enter(&pidlock);
4162                         mutex_enter(&curproc->p_lock);
4163 
4164                         (void) parmsset(&pcparms, curthread);
4165 
4166                         mutex_exit(&curproc->p_lock);
4167                         mutex_exit(&pidlock);
4168                 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
4169                         /*
4170                          * zsched always starts the init lwp at priority
4171                          * minclsyspri - 1. This priority gets set in t_pri and
4172                          * is invalid for RT, but RT never uses t_pri. However
4173                          * t_pri is used by procfs, so we always see processes
4174                          * within an RT zone with an invalid priority value.
4175                          * We fix that up now.
4176                          */
4177                         curthread->t_pri = RTGPPRIO0;
4178                 }
4179                 mutex_exit(&class_lock);
4180 
4181                 /* cause the process to return to userland. */
4182                 lwp_rtt();
4183         }
4184 }
4185 
4186 struct zsched_arg {
4187         zone_t *zone;
4188         nvlist_t *nvlist;
4189 };
4190 
4191 /*
4192  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
4193  * anything to do with scheduling, but rather with the fact that
4194  * per-zone kernel threads are parented to zsched, just like regular
4195  * kernel threads are parented to sched (p0).
4196  *
4197  * zsched is also responsible for launching init for the zone.
4198  */
4199 static void
4200 zsched(void *arg)


4202         struct zsched_arg *za = arg;
4203         proc_t *pp = curproc;
4204         proc_t *initp = proc_init;
4205         zone_t *zone = za->zone;
4206         cred_t *cr, *oldcred;
4207         rctl_set_t *set;
4208         rctl_alloc_gp_t *gp;
4209         contract_t *ct = NULL;
4210         task_t *tk, *oldtk;
4211         rctl_entity_p_t e;
4212         kproject_t *pj;
4213 
4214         nvlist_t *nvl = za->nvlist;
4215         nvpair_t *nvp = NULL;
4216 
4217         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4218         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4219         PTOU(pp)->u_argc = 0;
4220         PTOU(pp)->u_argv = NULL;
4221         PTOU(pp)->u_envp = NULL;
4222         PTOU(pp)->u_commpagep = NULL;
4223         closeall(P_FINFO(pp));
4224 
4225         /*
4226          * We are this zone's "zsched" process.  As the zone isn't generally
4227          * visible yet we don't need to grab any locks before initializing its
4228          * zone_zsched pointer.
4229          */
4230         zone_hold(zone);  /* this hold is released by zone_destroy() */
4231         zone->zone_zsched = pp;
4232         mutex_enter(&pp->p_lock);
4233         pp->p_zone = zone;
4234         mutex_exit(&pp->p_lock);
4235 
4236         /*
4237          * Disassociate process from its 'parent'; parent ourselves to init
4238          * (pid 1) and change other values as needed.
4239          */
4240         sess_create();
4241 
4242         mutex_enter(&pidlock);


4645                 goto out;
4646         }
4647         if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4648                 /*
4649                  * nvl may have been allocated and freed, with the pointer left
4650                  * non-NULL, so we reset it here.
4651                  */
4652                 nvl = NULL;
4653                 error = EINVAL;
4654                 goto out;
4655         }
4656         while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4657                 rctl_dict_entry_t *rde;
4658                 rctl_hndl_t hndl;
4659                 nvlist_t **nvlarray;
4660                 uint_t i, nelem;
4661                 char *name;
4662 
4663                 error = EINVAL;
4664                 name = nvpair_name(nvp);
4665                 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4666                     strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4667                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4668                         goto out;
4669                 }
4670                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4671                         goto out;
4672                 }
4673                 rde = rctl_dict_lookup_hndl(hndl);
4674                 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4675                 ASSERT(error == 0);
4676                 for (i = 0; i < nelem; i++) {
4677                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4678                                 goto out;
4679                 }
4680                 if (rctl_invalid_value(rde, &rv)) {
4681                         error = EINVAL;
4682                         goto out;
4683                 }
4684         }
4685         error = 0;
4686         *nvlp = nvl;
4687 out:


4795         cred_t *zkcr;
4796         boolean_t insert_label_hash;
4797 
4798         if (secpolicy_zone_config(CRED()) != 0)
4799                 return (set_errno(EPERM));
4800 
4801         /* can't boot zone from within chroot environment */
4802         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4803                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4804                     extended_error));
4805 
4806         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4807         zoneid = zone->zone_id = id_alloc(zoneid_space);
4808         zone->zone_status = ZONE_IS_UNINITIALIZED;
4809         zone->zone_pool = pool_default;
4810         zone->zone_pool_mod = gethrtime();
4811         zone->zone_psetid = ZONE_PS_INVAL;
4812         zone->zone_ncpus = 0;
4813         zone->zone_ncpus_online = 0;
4814         zone->zone_restart_init = B_TRUE;
4815         zone->zone_reboot_on_init_exit = B_FALSE;
4816         zone->zone_init_status = -1;
4817         zone->zone_brand = &native_brand;
4818         zone->zone_initname = NULL;
4819         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4820         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4821         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4822         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4823         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4824             offsetof(zone_ref_t, zref_linkage));
4825         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4826             offsetof(struct zsd_entry, zsd_linkage));
4827         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4828             offsetof(zone_dataset_t, zd_linkage));
4829         list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4830             offsetof(zone_dl_t, zdl_linkage));
4831         rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4832         rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4833 
4834         if (flags & ZCF_NET_EXCL) {
4835                 zone->zone_flags |= ZF_NET_EXCL;
4836         }


4858         zone->zone_domain[0] = '\0';
4859         zone->zone_hostid = HW_INVALID_HOSTID;
4860         zone->zone_shares = 1;
4861         zone->zone_shmmax = 0;
4862         zone->zone_ipc.ipcq_shmmni = 0;
4863         zone->zone_ipc.ipcq_semmni = 0;
4864         zone->zone_ipc.ipcq_msgmni = 0;
4865         zone->zone_bootargs = NULL;
4866         zone->zone_fs_allowed = NULL;
4867         zone->zone_initname =
4868             kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4869         (void) strcpy(zone->zone_initname, zone_default_initname);
4870         zone->zone_nlwps = 0;
4871         zone->zone_nlwps_ctl = INT_MAX;
4872         zone->zone_nprocs = 0;
4873         zone->zone_nprocs_ctl = INT_MAX;
4874         zone->zone_locked_mem = 0;
4875         zone->zone_locked_mem_ctl = UINT64_MAX;
4876         zone->zone_max_swap = 0;
4877         zone->zone_max_swap_ctl = UINT64_MAX;
4878         zone->zone_phys_mem = 0;
4879         zone->zone_phys_mem_ctl = UINT64_MAX;
4880         zone->zone_max_lofi = 0;
4881         zone->zone_max_lofi_ctl = UINT64_MAX;
4882         zone->zone_lockedmem_kstat = NULL;
4883         zone->zone_swapresv_kstat = NULL;
4884         zone->zone_physmem_kstat = NULL;
4885         zone->zone_zfs_io_pri = 1;
4886 
4887         /*
4888          * Zsched initializes the rctls.
4889          */
4890         zone->zone_rctls = NULL;
4891 
4892         if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4893                 zone_free(zone);
4894                 return (zone_create_error(error, 0, extended_error));
4895         }
4896 
4897         if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4898                 zone_free(zone);
4899                 return (set_errno(error));
4900         }
4901 
4902         /*
4903          * Read in the trusted system parameters:
4904          * match flag and sensitivity label.


5020         if (insert_label_hash) {
5021                 (void) mod_hash_insert(zonehashbylabel,
5022                     (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5023                 zone->zone_flags |= ZF_HASHED_LABEL;
5024         }
5025 
5026         /*
5027          * Insert into active list.  At this point there are no 'hold's
5028          * on the zone, but everyone else knows not to use it, so we can
5029          * continue to use it.  zsched() will do a zone_hold() if the
5030          * newproc() is successful.
5031          */
5032         list_insert_tail(&zone_active, zone);
5033         mutex_exit(&zonehash_lock);
5034 
5035         zarg.zone = zone;
5036         zarg.nvlist = rctls;
5037         /*
5038          * The process, task, and project rctls are probably wrong;
5039          * we need an interface to get the default values of all rctls,
5040          * and initialize zsched appropriately. However, we allow zoneadmd
5041          * to pass down both zone and project rctls for the zone's init.
5042          */
5043         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5044         if (error != 0) {
5045                 /*
5046                  * We need to undo all globally visible state.
5047                  */
5048                 mutex_enter(&zonehash_lock);
5049                 list_remove(&zone_active, zone);
5050                 if (zone->zone_flags & ZF_HASHED_LABEL) {
5051                         ASSERT(zone->zone_slabel != NULL);
5052                         (void) mod_hash_destroy(zonehashbylabel,
5053                             (mod_hash_key_t)zone->zone_slabel);
5054                 }
5055                 (void) mod_hash_destroy(zonehashbyname,
5056                     (mod_hash_key_t)(uintptr_t)zone->zone_name);
5057                 (void) mod_hash_destroy(zonehashbyid,
5058                     (mod_hash_key_t)(uintptr_t)zone->zone_id);
5059                 ASSERT(zonecount > 1);
5060                 zonecount--;
5061                 goto errout;


5921                         err = copyoutstr(zone->zone_initname, buf, bufsize,
5922                             NULL);
5923                         if (err != 0 && err != ENAMETOOLONG)
5924                                 error = EFAULT;
5925                 }
5926                 break;
5927         case ZONE_ATTR_BOOTARGS:
5928                 if (zone->zone_bootargs == NULL)
5929                         outstr = "";
5930                 else
5931                         outstr = zone->zone_bootargs;
5932                 size = strlen(outstr) + 1;
5933                 if (bufsize > size)
5934                         bufsize = size;
5935                 if (buf != NULL) {
5936                         err = copyoutstr(outstr, buf, bufsize, NULL);
5937                         if (err != 0 && err != ENAMETOOLONG)
5938                                 error = EFAULT;
5939                 }
5940                 break;








5941         case ZONE_ATTR_SCHED_CLASS:
5942                 mutex_enter(&class_lock);
5943 
5944                 if (zone->zone_defaultcid >= loaded_classes)
5945                         outstr = "";
5946                 else
5947                         outstr = sclass[zone->zone_defaultcid].cl_name;
5948                 size = strlen(outstr) + 1;
5949                 if (bufsize > size)
5950                         bufsize = size;
5951                 if (buf != NULL) {
5952                         err = copyoutstr(outstr, buf, bufsize, NULL);
5953                         if (err != 0 && err != ENAMETOOLONG)
5954                                 error = EFAULT;
5955                 }
5956 
5957                 mutex_exit(&class_lock);
5958                 break;
5959         case ZONE_ATTR_HOSTID:
5960                 if (zone->zone_hostid != HW_INVALID_HOSTID &&


5975                 size = strlen(outstr) + 1;
5976                 if (bufsize > size)
5977                         bufsize = size;
5978                 if (buf != NULL) {
5979                         err = copyoutstr(outstr, buf, bufsize, NULL);
5980                         if (err != 0 && err != ENAMETOOLONG)
5981                                 error = EFAULT;
5982                 }
5983                 break;
5984         case ZONE_ATTR_NETWORK:
5985                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5986                 if (copyin(buf, zbuf, bufsize) != 0) {
5987                         error = EFAULT;
5988                 } else {
5989                         error = zone_get_network(zoneid, zbuf);
5990                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5991                                 error = EFAULT;
5992                 }
5993                 kmem_free(zbuf, bufsize);
5994                 break;
5995         case ZONE_ATTR_SCHED_FIXEDHI:
5996                 size = sizeof (boolean_t);
5997                 if (bufsize > size)
5998                         bufsize = size;
5999 
6000                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6001                     bufsize) != 0)
6002                         error = EFAULT;
6003                 break;
6004         default:
6005                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6006                         size = bufsize;
6007                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6008                 } else {
6009                         error = EINVAL;
6010                 }
6011         }
6012         zone_rele(zone);
6013 
6014         if (error)
6015                 return (set_errno(error));
6016         return ((ssize_t)size);
6017 }
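For reference, a minimal userland sketch (illustration only, not part of this change) of how this entry point is usually reached through the libc zone_getattr() wrapper; the helper name, target zone ID and buffer size are invented for the example, and it assumes sufficient privilege (typically a caller in the global zone):

     #include <sys/types.h>
     #include <zone.h>
     #include <stdio.h>

     static void
     print_bootargs(zoneid_t zoneid)
     {
             char buf[256];          /* arbitrary size for the sketch */
             ssize_t sz;

             /* On success the full attribute size is returned. */
             sz = zone_getattr(zoneid, ZONE_ATTR_BOOTARGS, buf, sizeof (buf));
             if (sz < 0) {
                     perror("zone_getattr(ZONE_ATTR_BOOTARGS)");
                     return;
             }
             (void) printf("bootargs (%ld bytes): %s\n", (long)sz, buf);
     }

Note, as the ENAMETOOLONG handling above shows, that a too-small buffer is not an error: the string is truncated but the full size is still returned, so callers can compare the return value against the buffer they supplied.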
6018 
6019 /*
6020  * Systemcall entry point for zone_setattr(2).
6021  */
6022 /*ARGSUSED*/
6023 static int
6024 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
6025 {
6026         zone_t *zone;
6027         zone_status_t zone_status;
6028         int err = -1;
6029         zone_net_data_t *zbuf;
6030 
6031         if (secpolicy_zone_config(CRED()) != 0)
6032                 return (set_errno(EPERM));
6033 
6034         /*
6035          * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
6036          * attributes can be set on the global zone.
6037          */
6038         if (zoneid == GLOBAL_ZONEID &&
6039             attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
6040                 return (set_errno(EINVAL));
6041         }
6042 
6043         mutex_enter(&zonehash_lock);
6044         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
6045                 mutex_exit(&zonehash_lock);
6046                 return (set_errno(EINVAL));
6047         }
6048         zone_hold(zone);
6049         mutex_exit(&zonehash_lock);
6050 
6051         /*
6052          * At present most attributes can only be set on non-running,
6053          * non-global zones.
6054          */
6055         zone_status = zone_status_get(zone);
6056         if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
6057             attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
6058             zone_status > ZONE_IS_READY) {
6059                 err = EINVAL;
6060                 goto done;
6061         }
6062 
6063         switch (attr) {
6064         case ZONE_ATTR_INITNAME:
6065                 err = zone_set_initname(zone, (const char *)buf);
6066                 break;
6067         case ZONE_ATTR_INITNORESTART:
6068                 zone->zone_restart_init = B_FALSE;
6069                 err = 0;
6070                 break;
6071         case ZONE_ATTR_BOOTARGS:
6072                 err = zone_set_bootargs(zone, (const char *)buf);
6073                 break;
6074         case ZONE_ATTR_BRAND:
6075                 err = zone_set_brand(zone, (const char *)buf);
6076                 break;
6077         case ZONE_ATTR_FS_ALLOWED:
6078                 err = zone_set_fs_allowed(zone, (const char *)buf);
6079                 break;
6080         case ZONE_ATTR_PMCAP_NOVER:
6081                 err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
6082                 break;
6083         case ZONE_ATTR_PMCAP_PAGEOUT:
6084                 err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
6085                 break;
6086         case ZONE_ATTR_PG_FLT_DELAY:
6087                 err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
6088                 break;
6089         case ZONE_ATTR_RSS:
6090                 err = zone_set_rss(zone, (const uint64_t *)buf);
6091                 break;
6092         case ZONE_ATTR_SCHED_CLASS:
6093                 err = zone_set_sched_class(zone, (const char *)buf);
6094                 break;
6095         case ZONE_ATTR_HOSTID:
6096                 if (bufsize == sizeof (zone->zone_hostid)) {
6097                         if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
6098                                 err = 0;
6099                         else
6100                                 err = EFAULT;
6101                 } else {
6102                         err = EINVAL;
6103                 }
6104                 break;
6105         case ZONE_ATTR_NETWORK:
6106                 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
6107                         err = EINVAL;
6108                         break;
6109                 }
6110                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6111                 if (copyin(buf, zbuf, bufsize) != 0) {
6112                         kmem_free(zbuf, bufsize);
6113                         err = EFAULT;
6114                         break;
6115                 }
6116                 err = zone_set_network(zoneid, zbuf);
6117                 kmem_free(zbuf, bufsize);
6118                 break;
6119         case ZONE_ATTR_APP_SVC_CT:
6120                 if (bufsize != sizeof (boolean_t)) {
6121                         err = EINVAL;
6122                 } else {
6123                         zone->zone_setup_app_contract = (boolean_t)buf;
6124                         err = 0;
6125                 }
6126                 break;
6127         case ZONE_ATTR_SCHED_FIXEDHI:
6128                 if (bufsize != sizeof (boolean_t)) {
6129                         err = EINVAL;
6130                 } else {
6131                         zone->zone_fixed_hipri = (boolean_t)buf;
6132                         err = 0;
6133                 }
6134                 break;
6135         default:
6136                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
6137                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
6138                 else
6139                         err = EINVAL;
6140         }
6141 
6142 done:
6143         zone_rele(zone);
6144         ASSERT(err != -1);
6145         return (err != 0 ? set_errno(err) : 0);
6146 }
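As a companion sketch (again illustration only; the helper name and values are invented), the userland side via the libc zone_setattr() wrapper. Two conventions visible in the code above are worth calling out: ZONE_ATTR_HOSTID is copied in from the user buffer, whereas the boolean attributes (ZONE_ATTR_APP_SVC_CT, ZONE_ATTR_SCHED_FIXEDHI) carry their value in the buffer pointer itself, with bufsize set to sizeof (boolean_t). Most attributes must be set before the zone is running, and the caller needs zone-configuration privilege:

     #include <sys/types.h>
     #include <zone.h>
     #include <stdio.h>

     static void
     configure_zone(zoneid_t zoneid)
     {
             uint32_t hostid = 0x00c0ffee;   /* arbitrary 32-bit hostid */

             /* Copied in by the kernel; the size must match zone_hostid. */
             if (zone_setattr(zoneid, ZONE_ATTR_HOSTID, &hostid,
                 sizeof (hostid)) != 0)
                     perror("zone_setattr(ZONE_ATTR_HOSTID)");

             /* Boolean attributes pass the value in the pointer argument. */
             if (zone_setattr(zoneid, ZONE_ATTR_SCHED_FIXEDHI,
                 (void *)(uintptr_t)B_TRUE, sizeof (boolean_t)) != 0)
                     perror("zone_setattr(ZONE_ATTR_SCHED_FIXEDHI)");
     }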
6147 
6148 /*
6149  * Return zero if the process has at least one vnode mapped into its
6150  * address space which shouldn't be allowed to change zones.
6151  *
6152  * Also return zero if the process has any shared mappings which reserve
6153  * swap.  This is because the counting for zone.max-swap does not allow swap
6154  * reservation to be shared between zones.  Zone swap reservation is counted


6918         door_arg_t darg, save_arg;
6919         char *zone_name;
6920         size_t zone_namelen;
6921         zoneid_t zoneid;
6922         zone_t *zone;
6923         zone_cmd_arg_t arg;
6924         uint64_t uniqid;
6925         size_t size;
6926         int error;
6927         int retry;
6928 
6929         zone = zargp->zone;
6930         arg = zargp->arg;
6931         kmem_free(zargp, sizeof (*zargp));
6932 
6933         zone_namelen = strlen(zone->zone_name) + 1;
6934         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6935         bcopy(zone->zone_name, zone_name, zone_namelen);
6936         zoneid = zone->zone_id;
6937         uniqid = zone->zone_uniqid;
6938         arg.status = zone->zone_init_status;
6939         /*
6940          * zoneadmd may be down, but at least we can empty out the zone.
6941          * We can ignore the return value of zone_empty() since we're called
6942          * from a kernel thread and know we won't be delivered any signals.
6943          */
6944         ASSERT(curproc == &p0);
6945         (void) zone_empty(zone);
6946         ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6947         zone_rele(zone);
6948 
6949         size = sizeof (arg);
6950         darg.rbuf = (char *)&arg;
6951         darg.data_ptr = (char *)&arg;
6952         darg.rsize = size;
6953         darg.data_size = size;
6954         darg.desc_ptr = NULL;
6955         darg.desc_num = 0;
6956 
6957         save_arg = darg;
6958         /*


7159 
7160         /*
7161          * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
7162          * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
7163          * could cause assertions to fail (e.g., assertions about a zone's
7164          * state during initialization, readying, or booting) or produce races.
7165          * We'll let threads continue to initialize and ready new zones: they'll
7166          * fail to boot the new zones when they see that the global zone is
7167          * shutting down.
7168          */
7169         for (current_zonep = list_head(&zone_active); current_zonep != NULL;
7170             current_zonep = list_next(&zone_active, current_zonep)) {
7171                 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
7172                         zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
7173         }
7174         mutex_exit(&zone_status_lock);
7175         mutex_exit(&zonehash_lock);
7176 }
7177 
7178 /*
7179  * Returns true if the named dataset is visible in the specified zone.
7180  * The 'write' parameter is set to 1 if the dataset is also writable.
7181  */
7182 int
7183 zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
7184 {
7185         static int zfstype = -1;
7186         zone_dataset_t *zd;
7187         size_t len;

7188         const char *name = NULL;
7189         vfs_t *vfsp = NULL;
7190 
7191         if (dataset[0] == '\0')
7192                 return (0);
7193 
7194         /*
7195          * Walk the list once, looking for datasets which match exactly, or
7196          * which specify a dataset underneath an exported dataset.  If found,
7197          * return true and note that it is writable.
7198          */
7199         for (zd = list_head(&zone->zone_datasets); zd != NULL;
7200             zd = list_next(&zone->zone_datasets, zd)) {
7201 
7202                 len = strlen(zd->zd_dataset);
7203                 if (strlen(dataset) >= len &&
7204                     bcmp(dataset, zd->zd_dataset, len) == 0 &&
7205                     (dataset[len] == '\0' || dataset[len] == '/' ||
7206                     dataset[len] == '@')) {
7207                         if (write)


7235         /*
7236          * We reach here if the given dataset is not found in the zone_dataset
7237          * list.  Check whether this dataset was added as a filesystem (i.e.
7238          * via "add fs") rather than delegated.  To do so, search for the
7239          * dataset in the zone_vfslist of this zone.  If found, return true
7240          * and note that it is not writable.
7241          */
7242 
7243         /*
7244          * Initialize zfstype if it is not initialized yet.
7245          */
7246         if (zfstype == -1) {
7247                 struct vfssw *vswp = vfs_getvfssw("zfs");
7248                 zfstype = vswp - vfssw;
7249                 vfs_unrefvfssw(vswp);
7250         }
7251 
7252         vfs_list_read_lock();
7253         vfsp = zone->zone_vfslist;
7254         do {
7255                 if (vfsp == NULL)
7256                         break;
7257                 if (vfsp->vfs_fstype == zfstype) {
7258                         name = refstr_value(vfsp->vfs_resource);
7259 
7260                         /*
7261                          * Check if we have an exact match.
7262                          */
7263                         if (strcmp(dataset, name) == 0) {
7264                                 vfs_list_unlock();
7265                                 if (write)
7266                                         *write = 0;
7267                                 return (1);
7268                         }
7269                         /*
7270                          * We need to check if we are looking for parents of
7271                          * a dataset. These should be visible, but read-only.
7272                          */
7273                         len = strlen(dataset);
7274                         if (dataset[len - 1] == '/')
7275                                 len--;
7276 
7277                         if (len < strlen(name) &&
7278                             bcmp(dataset, name, len) == 0 && name[len] == '/') {
7279                                 vfs_list_unlock();
7280                                 if (write)
7281                                         *write = 0;
7282                                 return (1);
7283                         }
7284                 }
7285                 vfsp = vfsp->vfs_zone_next;
7286         } while (vfsp != zone->zone_vfslist);
7287 
7288         vfs_list_unlock();
7289         return (0);
7290 }
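To make the boundary rule in the zone_datasets walk above concrete, a standalone userland sketch (the helper name and dataset names are invented): a candidate matches an exported dataset when it is the dataset itself or extends it at a '/' or '@' boundary, which is why "tank/z1/database" does not match an export of "tank/z1/data":

     #include <stdio.h>
     #include <string.h>

     /* Same boundary test as the zone_datasets walk above. */
     static int
     dataset_matches(const char *exported, const char *dataset)
     {
             size_t len = strlen(exported);

             return (strlen(dataset) >= len &&
                 strncmp(dataset, exported, len) == 0 &&
                 (dataset[len] == '\0' || dataset[len] == '/' ||
                 dataset[len] == '@'));
     }

     int
     main(void)
     {
             (void) printf("%d\n", dataset_matches("tank/z1/data", "tank/z1/data"));     /* 1 */
             (void) printf("%d\n", dataset_matches("tank/z1/data", "tank/z1/data/a"));   /* 1 */
             (void) printf("%d\n", dataset_matches("tank/z1/data", "tank/z1/data@s1"));  /* 1 */
             (void) printf("%d\n", dataset_matches("tank/z1/data", "tank/z1/database")); /* 0 */
             return (0);
     }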
7291 
7292 /*
7293  * Returns true if the named dataset is visible in the current zone.
7294  * The 'write' parameter is set to 1 if the dataset is also writable.
7295  */
7296 int
7297 zone_dataset_visible(const char *dataset, int *write)
7298 {
7299         zone_t *zone = curproc->p_zone;
7300 
7301         return (zone_dataset_visible_inzone(zone, dataset, write));
7302 }
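A short in-kernel sketch of how a consumer might gate access with zone_dataset_visible(); the helper name and the choice of errnos are invented for illustration, and the usual kernel headers are assumed:

     /*
      * Hypothetical helper: return 0 if the calling zone may modify the
      * named dataset, ENOENT if the dataset is not visible at all, and
      * EROFS if it is visible but read-only (e.g. a parent of a delegated
      * dataset, or an "add fs" mount).
      */
     static int
     zone_dataset_writable_check(const char *dsname)
     {
             int writable = 0;

             if (!zone_dataset_visible(dsname, &writable))
                     return (ENOENT);
             if (!writable)
                     return (EROFS);
             return (0);
     }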
7303 
7304 /*
7305  * zone_find_by_any_path() -
7306  *
7307  * Kernel-private routine similar to zone_find_by_path(), but which
7308  * effectively compares against zone paths rather than zonerootpaths
7309  * (i.e., the last component of each zonerootpath, which should be
7310  * "root/", is not compared).  This is done in order to accurately
7311  * identify all paths, whether zone-visible or not, including those
7312  * which are parallel to /root/, such as /dev/, /home/, etc.
7313  *
7314  * If the specified path does not fall under any zone path, the global
7315  * zone is returned.
7316  *
7317  * The treat_abs parameter indicates whether the path should be treated
7318  * as an absolute path even though it does not begin with "/".  (This
7319  * supports NFS mount syntax such as host:any/path.)
7320  *
7321  * The caller is responsible for zone_rele() of the returned zone.
7322  */
7323 zone_t *