4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent Inc. All rights reserved.
25 */
26
27 /*
28 * Zones
29 *
30 * A zone is a named collection of processes, namespace constraints,
31 * and other system resources which comprise a secure and manageable
32 * application containment facility.
33 *
34 * Zones (represented by the reference counted zone_t) are tracked in
35 * the kernel in the zonehash. Elsewhere in the kernel, Zone IDs
36 * (zoneid_t) are used to track zone association. Zone IDs are
37 * dynamically generated when the zone is created; if a persistent
38 * identifier is needed (core files, accounting logs, audit trail,
39 * etc.), the zone name should be used.
40 *
41 *
42 * Global Zone:
43 *
44 * The global zone (zoneid 0) is automatically associated with all
233 #include <sys/klpd.h>
234
235 #include <sys/door.h>
236 #include <sys/cpuvar.h>
237 #include <sys/sdt.h>
238
239 #include <sys/uadmin.h>
240 #include <sys/session.h>
241 #include <sys/cmn_err.h>
242 #include <sys/modhash.h>
243 #include <sys/sunddi.h>
244 #include <sys/nvpair.h>
245 #include <sys/rctl.h>
246 #include <sys/fss.h>
247 #include <sys/brand.h>
248 #include <sys/zone.h>
249 #include <net/if.h>
250 #include <sys/cpucaps.h>
251 #include <vm/seg.h>
252 #include <sys/mac.h>
253
254 /*
255 * This constant specifies the number of seconds that threads waiting for
256 * subsystems to release a zone's general-purpose references will wait before
257 * they log the zone's reference counts. The constant's value shouldn't
258 * be so small that reference counts are unnecessarily reported for zones
259 * whose references are slowly released. On the other hand, it shouldn't be so
260 * large that users reboot their systems out of frustration over hung zones
261 * before the system logs the zones' reference counts.
262 */
263 #define ZONE_DESTROY_TIMEOUT_SECS 60
264
/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
	datalink_id_t	zdl_id;		/* ID of the data link */
	nvlist_t	*zdl_net;	/* per-link network data; presumably */
					/* set via zone_set_network — confirm */
	list_node_t	zdl_linkage;	/* node on the zone's datalink list */
} zone_dl_t;
271
272 /*
/*
 * This array contains the names of the subsystems listed in zone_ref_subsys_t
 * (see sys/zone.h).  Entry [i] names subsystem i, so the order here must
 * stay in sync with that enum.
 */
static char *zone_ref_subsys_names[] = {
	"NFS",		/* ZONE_REF_NFS */
	"NFSv4",	/* ZONE_REF_NFSV4 */
	"SMBFS",	/* ZONE_REF_SMBFS */
	"MNTFS",	/* ZONE_REF_MNTFS */
	"LOFI",		/* ZONE_REF_LOFI */
	"VFS",		/* ZONE_REF_VFS */
	"IPC"		/* ZONE_REF_IPC */
};
366
/*
 * This isn't static so lint doesn't complain.
 *
 * Handles for the zone-wide resource controls; each is filled in when the
 * corresponding control is registered via rctl_register() at zone_init()
 * time.
 */
rctl_hndl_t rc_zone_cpu_shares;		/* zone.cpu-shares */
rctl_hndl_t rc_zone_locked_mem;		/* zone.max-locked-memory */
rctl_hndl_t rc_zone_max_swap;		/* zone.max-swap */
rctl_hndl_t rc_zone_max_lofi;		/* zone.max-lofi */
rctl_hndl_t rc_zone_cpu_cap;		/* zone.cpu-cap */
rctl_hndl_t rc_zone_zfs_io_pri;		/* zone.zfs-io-priority */
rctl_hndl_t rc_zone_nlwps;		/* zone.max-lwps */
rctl_hndl_t rc_zone_nprocs;		/* zone.max-processes */
rctl_hndl_t rc_zone_shmmax;		/* System V shared memory limits */
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;		/* zone.max-sem-ids */
rctl_hndl_t rc_zone_msgmni;		/* zone.max-msg-ids */
/* Default path of a zone's init process. */
const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
/* Datalink and network attribute helpers (definitions later in this file). */
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

/* Callback type applied to ZSD entries while walking zones/keys. */
typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
1725 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1726 rctl_qty_t nv)
1727 {
1728 ASSERT(MUTEX_HELD(&p->p_lock));
1729 ASSERT(e->rcep_t == RCENTITY_ZONE);
1730 if (e->rcep_p.zone == NULL)
1731 return (0);
1732 e->rcep_p.zone->zone_max_swap_ctl = nv;
1733 return (0);
1734 }
1735
/*
 * rctl ops vector for the zone.max-swap resource control: no local action,
 * with the usage/set/test callbacks defined above.
 */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};
1742
1743 /*ARGSUSED*/
1744 static rctl_qty_t
1745 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1746 {
1747 rctl_qty_t q;
1748 zone_t *z = p->p_zone;
1749
1750 ASSERT(MUTEX_HELD(&p->p_lock));
1751 mutex_enter(&z->zone_rctl_lock);
1752 q = z->zone_max_lofi;
1753 mutex_exit(&z->zone_rctl_lock);
1754 return (q);
1755 }
1756
1757 /*ARGSUSED*/
1758 static int
1759 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1760 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1761 {
1762 rctl_qty_t q;
1763 zone_t *z;
1764
1818 crhold(cr);
1819 zone_rele(zone);
1820 return (cr);
1821 }
1822
1823 static int
1824 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1825 {
1826 zone_t *zone = ksp->ks_private;
1827 zone_kstat_t *zk = ksp->ks_data;
1828
1829 if (rw == KSTAT_WRITE)
1830 return (EACCES);
1831
1832 zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1833 zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1834 return (0);
1835 }
1836
1837 static int
1838 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1839 {
1840 zone_t *zone = ksp->ks_private;
1841 zone_kstat_t *zk = ksp->ks_data;
1842
1843 if (rw == KSTAT_WRITE)
1844 return (EACCES);
1845
1846 zk->zk_usage.value.ui64 = zone->zone_nprocs;
1847 zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1848 return (0);
1849 }
1850
1851 static int
1852 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1853 {
1854 zone_t *zone = ksp->ks_private;
1855 zone_kstat_t *zk = ksp->ks_data;
1856
1857 if (rw == KSTAT_WRITE)
1871
1872 ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1873 sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1874 KSTAT_FLAG_VIRTUAL);
1875
1876 if (ksp == NULL)
1877 return (NULL);
1878
1879 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1880 ksp->ks_data_size += strlen(zone->zone_name) + 1;
1881 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1882 kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1883 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1884 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1885 ksp->ks_update = updatefunc;
1886 ksp->ks_private = zone;
1887 kstat_install(ksp);
1888 return (ksp);
1889 }
1890
1891
1892 static int
1893 zone_mcap_kstat_update(kstat_t *ksp, int rw)
1894 {
1895 zone_t *zone = ksp->ks_private;
1896 zone_mcap_kstat_t *zmp = ksp->ks_data;
1897
1898 if (rw == KSTAT_WRITE)
1899 return (EACCES);
1900
1901 zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1902 zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1903 zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1904 zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1905 zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
1906
1907 return (0);
1908 }
1909
/*
 * Create the "memory_cap" kstat for the given zone and hook it up to the
 * zone's paging counters (zone_mcap_stats).  Non-global zones' kstats are
 * additionally made visible from the global zone.  Returns NULL if kstat
 * creation fails; the caller (zone_kstat_create()) then falls back to a
 * bare zone_mcap_stats allocation.
 */
static kstat_t *
zone_mcap_kstat_create(zone_t *zone)
{
	kstat_t *ksp;
	zone_mcap_kstat_t *zmp;

	if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
	    zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
	    sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
		return (NULL);

	if (zone->zone_id != GLOBAL_ZONEID)
		kstat_zone_add(ksp, GLOBAL_ZONEID);

	/* Virtual kstat: we supply (and own) the data buffer ourselves. */
	zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	ksp->ks_lock = &zone->zone_mcap_lock;
	zone->zone_mcap_stats = zmp;

	/* The kstat "name" field is not large enough for a full zonename */
	kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
	kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
	kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
	kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
	kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
	kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
	    KSTAT_DATA_UINT64);

	ksp->ks_update = zone_mcap_kstat_update;
	ksp->ks_private = zone;

	kstat_install(ksp);
	return (ksp);
}
1946
1947 static int
1948 zone_misc_kstat_update(kstat_t *ksp, int rw)
1949 {
1950 zone_t *zone = ksp->ks_private;
1951 zone_misc_kstat_t *zmp = ksp->ks_data;
1952 hrtime_t tmp;
1953
1954 if (rw == KSTAT_WRITE)
1955 return (EACCES);
1956
1957 tmp = zone->zone_utime;
1958 scalehrtime(&tmp);
2018 kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2019 kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2020 KSTAT_DATA_UINT32);
2021 kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2022 kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2023
2024 ksp->ks_update = zone_misc_kstat_update;
2025 ksp->ks_private = zone;
2026
2027 kstat_install(ksp);
2028 return (ksp);
2029 }
2030
2031 static void
2032 zone_kstat_create(zone_t *zone)
2033 {
2034 zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2035 "lockedmem", zone_lockedmem_kstat_update);
2036 zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2037 "swapresv", zone_swapresv_kstat_update);
2038 zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2039 "nprocs", zone_nprocs_kstat_update);
2040
2041 if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2042 zone->zone_mcap_stats = kmem_zalloc(
2043 sizeof (zone_mcap_kstat_t), KM_SLEEP);
2044 }
2045
2046 if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2047 zone->zone_misc_stats = kmem_zalloc(
2048 sizeof (zone_misc_kstat_t), KM_SLEEP);
2049 }
2050 }
2051
2052 static void
2053 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2054 {
2055 void *data;
2056
2057 if (*pkstat != NULL) {
2058 data = (*pkstat)->ks_data;
2059 kstat_delete(*pkstat);
2060 kmem_free(data, datasz);
2061 *pkstat = NULL;
2062 }
2063 }
2064
/*
 * Tear down all of the zone's kstats created by zone_kstat_create(),
 * freeing each kstat's data buffer as well.
 */
static void
zone_kstat_delete(zone_t *zone)
{
	zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
	    sizeof (zone_kstat_t));
	zone_kstat_delete_common(&zone->zone_swapresv_kstat,
	    sizeof (zone_kstat_t));
	zone_kstat_delete_common(&zone->zone_nprocs_kstat,
	    sizeof (zone_kstat_t));
	zone_kstat_delete_common(&zone->zone_mcap_ksp,
	    sizeof (zone_mcap_kstat_t));
	zone_kstat_delete_common(&zone->zone_misc_ksp,
	    sizeof (zone_misc_kstat_t));
}
2079
2080 /*
2081 * Called very early on in boot to initialize the ZSD list so that
2082 * zone_key_create() can be called before zone_init(). It also initializes
2083 * portions of zone0 which may be used before zone_init() is called. The
2084 * variable "global_zone" will be set when zone0 is fully initialized by
2085 * zone_init().
2086 */
2087 void
2088 zone_zsd_init(void)
2089 {
2090 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2091 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2092 list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2093 offsetof(struct zsd_entry, zsd_linkage));
2094 list_create(&zone_active, sizeof (zone_t),
2095 offsetof(zone_t, zone_linkage));
2096 list_create(&zone_deathrow, sizeof (zone_t),
2097 offsetof(zone_t, zone_linkage));
2098
2099 mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2100 mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2101 mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2102 zone0.zone_shares = 1;
2103 zone0.zone_nlwps = 0;
2104 zone0.zone_nlwps_ctl = INT_MAX;
2105 zone0.zone_nprocs = 0;
2106 zone0.zone_nprocs_ctl = INT_MAX;
2107 zone0.zone_locked_mem = 0;
2108 zone0.zone_locked_mem_ctl = UINT64_MAX;
2109 ASSERT(zone0.zone_max_swap == 0);
2110 zone0.zone_max_swap_ctl = UINT64_MAX;
2111 zone0.zone_max_lofi = 0;
2112 zone0.zone_max_lofi_ctl = UINT64_MAX;
2113 zone0.zone_shmmax = 0;
2114 zone0.zone_ipc.ipcq_shmmni = 0;
2115 zone0.zone_ipc.ipcq_semmni = 0;
2116 zone0.zone_ipc.ipcq_msgmni = 0;
2117 zone0.zone_name = GLOBAL_ZONENAME;
2118 zone0.zone_nodename = utsname.nodename;
2119 zone0.zone_domain = srpc_domain;
2120 zone0.zone_hostid = HW_INVALID_HOSTID;
2121 zone0.zone_fs_allowed = NULL;
2122 zone0.zone_ref = 1;
2123 zone0.zone_id = GLOBAL_ZONEID;
2124 zone0.zone_status = ZONE_IS_RUNNING;
2125 zone0.zone_rootpath = "/";
2126 zone0.zone_rootpathlen = 2;
2127 zone0.zone_psetid = ZONE_PS_INVAL;
2128 zone0.zone_ncpus = 0;
2129 zone0.zone_ncpus_online = 0;
2130 zone0.zone_proc_initpid = 1;
2131 zone0.zone_initname = initname;
2132 zone0.zone_lockedmem_kstat = NULL;
2133 zone0.zone_swapresv_kstat = NULL;
2134 zone0.zone_nprocs_kstat = NULL;
2135 zone0.zone_zfs_io_pri = 1;
2136
2137 zone0.zone_stime = 0;
2138 zone0.zone_utime = 0;
2139 zone0.zone_wtime = 0;
2140
2141 list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2142 offsetof(zone_ref_t, zref_linkage));
2143 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2144 offsetof(struct zsd_entry, zsd_linkage));
2145 list_insert_head(&zone_active, &zone0);
2146
2147 /*
2148 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2149 * to anything meaningful. It is assigned to be 'rootdir' in
2150 * vfs_mountroot().
2151 */
2152 zone0.zone_rootvp = NULL;
2153 zone0.zone_vfslist = NULL;
2230 */
2231 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2232
2233 /*
2234 * Initialize generic zone resource controls, if any.
2235 */
2236 rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2237 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2238 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2239 FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2240
2241 rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2242 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2243 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2244 RCTL_GLOBAL_INFINITE,
2245 MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2246
2247 rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2248 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2249 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2250 1024, 1024, &zone_zfs_io_pri_ops);
2251
2252 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2253 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2254 INT_MAX, INT_MAX, &zone_lwps_ops);
2255
2256 rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2257 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2258 INT_MAX, INT_MAX, &zone_procs_ops);
2259
2260 /*
2261 * System V IPC resource controls
2262 */
2263 rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2264 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2265 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2266
2267 rc_zone_semmni = rctl_register("zone.max-sem-ids",
2268 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2269 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2270
2283 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2284 bzero(dval, sizeof (rctl_val_t));
2285 dval->rcv_value = 1;
2286 dval->rcv_privilege = RCPRIV_PRIVILEGED;
2287 dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2288 dval->rcv_action_recip_pid = -1;
2289
2290 rde = rctl_dict_lookup("zone.cpu-shares");
2291 (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2292
2293 rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2294 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2295 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2296 &zone_locked_mem_ops);
2297
2298 rc_zone_max_swap = rctl_register("zone.max-swap",
2299 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2300 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2301 &zone_max_swap_ops);
2302
2303 rc_zone_max_lofi = rctl_register("zone.max-lofi",
2304 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2305 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2306 &zone_max_lofi_ops);
2307
2308 /*
2309 * Initialize the ``global zone''.
2310 */
2311 set = rctl_set_create();
2312 gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2313 mutex_enter(&p0.p_lock);
2314 e.rcep_p.zone = &zone0;
2315 e.rcep_t = RCENTITY_ZONE;
2316 zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2317 gp);
2318
2319 zone0.zone_nlwps = p0.p_lwpcnt;
2320 zone0.zone_nprocs = 1;
2321 zone0.zone_ntasks = 1;
2322 mutex_exit(&p0.p_lock);
2323 zone0.zone_restart_init = B_TRUE;
2324 zone0.zone_brand = &native_brand;
2325 rctl_prealloc_destroy(gp);
2326 /*
2327 * pool_default hasn't been initialized yet, so we let pool_init()
2328 * take care of making sure the global zone is in the default pool.
2329 */
2330
2331 /*
2332 * Initialize global zone kstats
2333 */
2334 zone_kstat_create(&zone0);
2335
2336 /*
2337 * Initialize zone label.
2338 * mlp are initialized when tnzonecfg is loaded.
2339 */
2340 zone0.zone_slabel = l_admin_low;
2341 rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2342 label_hold(l_admin_low);
2343
2383 /*
2384 * The global zone is fully initialized (except for zone_rootvp which
2385 * will be set when the root filesystem is mounted).
2386 */
2387 global_zone = &zone0;
2388
2389 /*
2390 * Setup an event channel to send zone status change notifications on
2391 */
2392 res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2393 EVCH_CREAT);
2394
2395 if (res)
2396 panic("Sysevent_evc_bind failed during zone setup.\n");
2397
2398 }
2399
2400 static void
2401 zone_free(zone_t *zone)
2402 {
2403 ASSERT(zone != global_zone);
2404 ASSERT(zone->zone_ntasks == 0);
2405 ASSERT(zone->zone_nlwps == 0);
2406 ASSERT(zone->zone_nprocs == 0);
2407 ASSERT(zone->zone_cred_ref == 0);
2408 ASSERT(zone->zone_kcred == NULL);
2409 ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2410 zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2411 ASSERT(list_is_empty(&zone->zone_ref_list));
2412
2413 /*
2414 * Remove any zone caps.
2415 */
2416 cpucaps_zone_remove(zone);
2417
2418 ASSERT(zone->zone_cpucap == NULL);
2419
2420 /* remove from deathrow list */
2421 if (zone_status_get(zone) == ZONE_IS_DEAD) {
2422 ASSERT(zone->zone_ref == 0);
2423 mutex_enter(&zone_deathrow_lock);
2424 list_remove(&zone_deathrow, zone);
2425 mutex_exit(&zone_deathrow_lock);
2426 }
2427
2428 list_destroy(&zone->zone_ref_list);
2429 zone_free_zsd(zone);
2430 zone_free_datasets(zone);
2431 list_destroy(&zone->zone_dl_list);
2432
2433 if (zone->zone_rootvp != NULL)
2434 VN_RELE(zone->zone_rootvp);
2435 if (zone->zone_rootpath)
2436 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2437 if (zone->zone_name != NULL)
2438 kmem_free(zone->zone_name, ZONENAME_MAX);
2439 if (zone->zone_slabel != NULL)
2440 label_rele(zone->zone_slabel);
2441 if (zone->zone_nodename != NULL)
2442 kmem_free(zone->zone_nodename, _SYS_NMLN);
2443 if (zone->zone_domain != NULL)
2444 kmem_free(zone->zone_domain, _SYS_NMLN);
2445 if (zone->zone_privset != NULL)
2446 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2447 if (zone->zone_rctls != NULL)
2448 rctl_set_free(zone->zone_rctls);
2449 if (zone->zone_bootargs != NULL)
2450 strfree(zone->zone_bootargs);
2546 kmem_free(attrp, sizeof (struct brand_attr));
2547 if (bp == NULL)
2548 return (EINVAL);
2549
2550 /*
	 * This is the only place where a zone can change its brand.
2552 * We already need to hold zone_status_lock to check the zone
2553 * status, so we'll just use that lock to serialize zone
2554 * branding requests as well.
2555 */
2556 mutex_enter(&zone_status_lock);
2557
2558 /* Re-Branding is not allowed and the zone can't be booted yet */
2559 if ((ZONE_IS_BRANDED(zone)) ||
2560 (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2561 mutex_exit(&zone_status_lock);
2562 brand_unregister_zone(bp);
2563 return (EINVAL);
2564 }
2565
2566 /* set up the brand specific data */
2567 zone->zone_brand = bp;
2568 ZBROP(zone)->b_init_brand_data(zone);
2569
2570 mutex_exit(&zone_status_lock);
2571 return (0);
2572 }
2573
2574 static int
2575 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2576 {
2577 char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2578 int err = 0;
2579
2580 ASSERT(zone != global_zone);
2581 if ((err = copyinstr(zone_fs_allowed, buf,
2582 ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2583 goto done;
2584
2585 if (zone->zone_fs_allowed != NULL)
2586 strfree(zone->zone_fs_allowed);
2587
2588 zone->zone_fs_allowed = strdup(buf);
2594
2595 static int
2596 zone_set_initname(zone_t *zone, const char *zone_initname)
2597 {
2598 char initname[INITNAME_SZ];
2599 size_t len;
2600 int err = 0;
2601
2602 ASSERT(zone != global_zone);
2603 if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2604 return (err); /* EFAULT or ENAMETOOLONG */
2605
2606 if (zone->zone_initname != NULL)
2607 strfree(zone->zone_initname);
2608
2609 zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2610 (void) strcpy(zone->zone_initname, initname);
2611 return (0);
2612 }
2613
2614 static int
2615 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2616 {
2617 uint64_t mcap;
2618 int err = 0;
2619
2620 if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2621 zone->zone_phys_mcap = mcap;
2622
2623 return (err);
2624 }
2625
2626 static int
2627 zone_set_sched_class(zone_t *zone, const char *new_class)
2628 {
2629 char sched_class[PC_CLNMSZ];
2630 id_t classid;
2631 int err;
2632
2633 ASSERT(zone != global_zone);
2634 if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2635 return (err); /* EFAULT or ENAMETOOLONG */
2636
2637 if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2638 return (set_errno(EINVAL));
2639 zone->zone_defaultcid = classid;
2640 ASSERT(zone->zone_defaultcid > 0 &&
2641 zone->zone_defaultcid < loaded_classes);
2642
2643 return (0);
2644 }
2645
2646 /*
3756 return (0);
3757 }
3758
/*
 * Non-global zone version of start_init.  Runs in the context of the
 * zone's nascent init process: it records init's pid, performs the common
 * init startup work, and then either transitions the zone to "running" or
 * tears the process down if boot failed (or the global zone is going
 * down).
 */
void
zone_start_init(void)
{
	proc_t *p = ttoproc(curthread);
	zone_t *z = p->p_zone;

	ASSERT(!INGLOBALZONE(curproc));

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	z->zone_proc_initpid = p->p_pid;

	/*
	 * We maintain zone_boot_err so that we can return the cause of the
	 * failure back to the caller of the zone_boot syscall.
	 */
	p->p_zone->zone_boot_err = start_init_common();

	/*
	 * We will prevent booting zones from becoming running zones if the
	 * global zone is shutting down.
	 */
	mutex_enter(&zone_status_lock);
	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
	    ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(z) == ZONE_IS_BOOTING) {
			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
			/*
			 * proc_exit() failed; presumably another LWP has
			 * already begun exiting the process, so just exit
			 * this LWP — confirm against proc_exit()'s contract.
			 */
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		if (zone_status_get(z) == ZONE_IS_BOOTING)
			zone_status_set(z, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);
		/* cause the process to return to userland. */
		lwp_rtt();
	}
}
3811
/*
 * Argument bundle passed to the zsched() kernel process by zone_create():
 * the new zone and the nvlist of zone rctl values parsed from the caller.
 */
struct zsched_arg {
	zone_t *zone;
	nvlist_t *nvlist;
};
3816
3817 /*
3818 * Per-zone "sched" workalike. The similarity to "sched" doesn't have
3819 * anything to do with scheduling, but rather with the fact that
3820 * per-zone kernel threads are parented to zsched, just like regular
3821 * kernel threads are parented to sched (p0).
3822 *
3823 * zsched is also responsible for launching init for the zone.
3824 */
3825 static void
3826 zsched(void *arg)
3828 struct zsched_arg *za = arg;
3829 proc_t *pp = curproc;
3830 proc_t *initp = proc_init;
3831 zone_t *zone = za->zone;
3832 cred_t *cr, *oldcred;
3833 rctl_set_t *set;
3834 rctl_alloc_gp_t *gp;
3835 contract_t *ct = NULL;
3836 task_t *tk, *oldtk;
3837 rctl_entity_p_t e;
3838 kproject_t *pj;
3839
3840 nvlist_t *nvl = za->nvlist;
3841 nvpair_t *nvp = NULL;
3842
3843 bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3844 bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3845 PTOU(pp)->u_argc = 0;
3846 PTOU(pp)->u_argv = NULL;
3847 PTOU(pp)->u_envp = NULL;
3848 closeall(P_FINFO(pp));
3849
3850 /*
3851 * We are this zone's "zsched" process. As the zone isn't generally
3852 * visible yet we don't need to grab any locks before initializing its
3853 * zone_proc pointer.
3854 */
3855 zone_hold(zone); /* this hold is released by zone_destroy() */
3856 zone->zone_zsched = pp;
3857 mutex_enter(&pp->p_lock);
3858 pp->p_zone = zone;
3859 mutex_exit(&pp->p_lock);
3860
3861 /*
3862 * Disassociate process from its 'parent'; parent ourselves to init
3863 * (pid 1) and change other values as needed.
3864 */
3865 sess_create();
3866
3867 mutex_enter(&pidlock);
4270 goto out;
4271 }
4272 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4273 /*
4274 * nvl may have been allocated/free'd, but the value set to
4275 * non-NULL, so we reset it here.
4276 */
4277 nvl = NULL;
4278 error = EINVAL;
4279 goto out;
4280 }
4281 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4282 rctl_dict_entry_t *rde;
4283 rctl_hndl_t hndl;
4284 nvlist_t **nvlarray;
4285 uint_t i, nelem;
4286 char *name;
4287
4288 error = EINVAL;
4289 name = nvpair_name(nvp);
4290 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4291 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4292 goto out;
4293 }
4294 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4295 goto out;
4296 }
4297 rde = rctl_dict_lookup_hndl(hndl);
4298 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4299 ASSERT(error == 0);
4300 for (i = 0; i < nelem; i++) {
4301 if (error = nvlist2rctlval(nvlarray[i], &rv))
4302 goto out;
4303 }
4304 if (rctl_invalid_value(rde, &rv)) {
4305 error = EINVAL;
4306 goto out;
4307 }
4308 }
4309 error = 0;
4310 *nvlp = nvl;
4311 out:
4419 cred_t *zkcr;
4420 boolean_t insert_label_hash;
4421
4422 if (secpolicy_zone_config(CRED()) != 0)
4423 return (set_errno(EPERM));
4424
4425 /* can't boot zone from within chroot environment */
4426 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4427 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4428 extended_error));
4429
4430 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4431 zoneid = zone->zone_id = id_alloc(zoneid_space);
4432 zone->zone_status = ZONE_IS_UNINITIALIZED;
4433 zone->zone_pool = pool_default;
4434 zone->zone_pool_mod = gethrtime();
4435 zone->zone_psetid = ZONE_PS_INVAL;
4436 zone->zone_ncpus = 0;
4437 zone->zone_ncpus_online = 0;
4438 zone->zone_restart_init = B_TRUE;
4439 zone->zone_brand = &native_brand;
4440 zone->zone_initname = NULL;
4441 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4442 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4443 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4444 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4445 list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4446 offsetof(zone_ref_t, zref_linkage));
4447 list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4448 offsetof(struct zsd_entry, zsd_linkage));
4449 list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4450 offsetof(zone_dataset_t, zd_linkage));
4451 list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4452 offsetof(zone_dl_t, zdl_linkage));
4453 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4454 rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4455
4456 if (flags & ZCF_NET_EXCL) {
4457 zone->zone_flags |= ZF_NET_EXCL;
4458 }
4480 zone->zone_domain[0] = '\0';
4481 zone->zone_hostid = HW_INVALID_HOSTID;
4482 zone->zone_shares = 1;
4483 zone->zone_shmmax = 0;
4484 zone->zone_ipc.ipcq_shmmni = 0;
4485 zone->zone_ipc.ipcq_semmni = 0;
4486 zone->zone_ipc.ipcq_msgmni = 0;
4487 zone->zone_bootargs = NULL;
4488 zone->zone_fs_allowed = NULL;
4489 zone->zone_initname =
4490 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4491 (void) strcpy(zone->zone_initname, zone_default_initname);
4492 zone->zone_nlwps = 0;
4493 zone->zone_nlwps_ctl = INT_MAX;
4494 zone->zone_nprocs = 0;
4495 zone->zone_nprocs_ctl = INT_MAX;
4496 zone->zone_locked_mem = 0;
4497 zone->zone_locked_mem_ctl = UINT64_MAX;
4498 zone->zone_max_swap = 0;
4499 zone->zone_max_swap_ctl = UINT64_MAX;
4500 zone->zone_max_lofi = 0;
4501 zone->zone_max_lofi_ctl = UINT64_MAX;
4502 zone->zone_lockedmem_kstat = NULL;
4503 zone->zone_swapresv_kstat = NULL;
4504 zone->zone_zfs_io_pri = 1;
4505
4506 /*
4507 * Zsched initializes the rctls.
4508 */
4509 zone->zone_rctls = NULL;
4510
4511 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4512 zone_free(zone);
4513 return (zone_create_error(error, 0, extended_error));
4514 }
4515
4516 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4517 zone_free(zone);
4518 return (set_errno(error));
4519 }
4520
4521 /*
4522 * Read in the trusted system parameters:
4523 * match flag and sensitivity label.
4639 if (insert_label_hash) {
4640 (void) mod_hash_insert(zonehashbylabel,
4641 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4642 zone->zone_flags |= ZF_HASHED_LABEL;
4643 }
4644
4645 /*
4646 * Insert into active list. At this point there are no 'hold's
4647 * on the zone, but everyone else knows not to use it, so we can
4648 * continue to use it. zsched() will do a zone_hold() if the
4649 * newproc() is successful.
4650 */
4651 list_insert_tail(&zone_active, zone);
4652 mutex_exit(&zonehash_lock);
4653
4654 zarg.zone = zone;
4655 zarg.nvlist = rctls;
4656 /*
4657 * The process, task, and project rctls are probably wrong;
4658 * we need an interface to get the default values of all rctls,
4659 * and initialize zsched appropriately. I'm not sure that that
4660 * makes much of a difference, though.
4661 */
4662 error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4663 if (error != 0) {
4664 /*
4665 * We need to undo all globally visible state.
4666 */
4667 mutex_enter(&zonehash_lock);
4668 list_remove(&zone_active, zone);
4669 if (zone->zone_flags & ZF_HASHED_LABEL) {
4670 ASSERT(zone->zone_slabel != NULL);
4671 (void) mod_hash_destroy(zonehashbylabel,
4672 (mod_hash_key_t)zone->zone_slabel);
4673 }
4674 (void) mod_hash_destroy(zonehashbyname,
4675 (mod_hash_key_t)(uintptr_t)zone->zone_name);
4676 (void) mod_hash_destroy(zonehashbyid,
4677 (mod_hash_key_t)(uintptr_t)zone->zone_id);
4678 ASSERT(zonecount > 1);
4679 zonecount--;
4680 goto errout;
5540 err = copyoutstr(zone->zone_initname, buf, bufsize,
5541 NULL);
5542 if (err != 0 && err != ENAMETOOLONG)
5543 error = EFAULT;
5544 }
5545 break;
5546 case ZONE_ATTR_BOOTARGS:
5547 if (zone->zone_bootargs == NULL)
5548 outstr = "";
5549 else
5550 outstr = zone->zone_bootargs;
5551 size = strlen(outstr) + 1;
5552 if (bufsize > size)
5553 bufsize = size;
5554 if (buf != NULL) {
5555 err = copyoutstr(outstr, buf, bufsize, NULL);
5556 if (err != 0 && err != ENAMETOOLONG)
5557 error = EFAULT;
5558 }
5559 break;
5560 case ZONE_ATTR_PHYS_MCAP:
5561 size = sizeof (zone->zone_phys_mcap);
5562 if (bufsize > size)
5563 bufsize = size;
5564 if (buf != NULL &&
5565 copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5566 error = EFAULT;
5567 break;
5568 case ZONE_ATTR_SCHED_CLASS:
5569 mutex_enter(&class_lock);
5570
5571 if (zone->zone_defaultcid >= loaded_classes)
5572 outstr = "";
5573 else
5574 outstr = sclass[zone->zone_defaultcid].cl_name;
5575 size = strlen(outstr) + 1;
5576 if (bufsize > size)
5577 bufsize = size;
5578 if (buf != NULL) {
5579 err = copyoutstr(outstr, buf, bufsize, NULL);
5580 if (err != 0 && err != ENAMETOOLONG)
5581 error = EFAULT;
5582 }
5583
5584 mutex_exit(&class_lock);
5585 break;
5586 case ZONE_ATTR_HOSTID:
5587 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5602 size = strlen(outstr) + 1;
5603 if (bufsize > size)
5604 bufsize = size;
5605 if (buf != NULL) {
5606 err = copyoutstr(outstr, buf, bufsize, NULL);
5607 if (err != 0 && err != ENAMETOOLONG)
5608 error = EFAULT;
5609 }
5610 break;
5611 case ZONE_ATTR_NETWORK:
5612 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5613 if (copyin(buf, zbuf, bufsize) != 0) {
5614 error = EFAULT;
5615 } else {
5616 error = zone_get_network(zoneid, zbuf);
5617 if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5618 error = EFAULT;
5619 }
5620 kmem_free(zbuf, bufsize);
5621 break;
5622 default:
5623 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5624 size = bufsize;
5625 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5626 } else {
5627 error = EINVAL;
5628 }
5629 }
5630 zone_rele(zone);
5631
5632 if (error)
5633 return (set_errno(error));
5634 return ((ssize_t)size);
5635 }
5636
5637 /*
5638 * Systemcall entry point for zone_setattr(2).
5639 */
5640 /*ARGSUSED*/
5641 static int
5642 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
5643 {
5644 zone_t *zone;
5645 zone_status_t zone_status;
5646 int err = -1;
5647 zone_net_data_t *zbuf;
5648
5649 if (secpolicy_zone_config(CRED()) != 0)
5650 return (set_errno(EPERM));
5651
5652 /*
5653 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5654 * global zone.
5655 */
5656 if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5657 return (set_errno(EINVAL));
5658 }
5659
5660 mutex_enter(&zonehash_lock);
5661 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5662 mutex_exit(&zonehash_lock);
5663 return (set_errno(EINVAL));
5664 }
5665 zone_hold(zone);
5666 mutex_exit(&zonehash_lock);
5667
5668 /*
5669 * At present most attributes can only be set on non-running,
5670 * non-global zones.
5671 */
5672 zone_status = zone_status_get(zone);
5673 if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5674 err = EINVAL;
5675 goto done;
5676 }
5677
5678 switch (attr) {
5679 case ZONE_ATTR_INITNAME:
5680 err = zone_set_initname(zone, (const char *)buf);
5681 break;
5682 case ZONE_ATTR_INITNORESTART:
5683 zone->zone_restart_init = B_FALSE;
5684 err = 0;
5685 break;
5686 case ZONE_ATTR_BOOTARGS:
5687 err = zone_set_bootargs(zone, (const char *)buf);
5688 break;
5689 case ZONE_ATTR_BRAND:
5690 err = zone_set_brand(zone, (const char *)buf);
5691 break;
5692 case ZONE_ATTR_FS_ALLOWED:
5693 err = zone_set_fs_allowed(zone, (const char *)buf);
5694 break;
5695 case ZONE_ATTR_PHYS_MCAP:
5696 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5697 break;
5698 case ZONE_ATTR_SCHED_CLASS:
5699 err = zone_set_sched_class(zone, (const char *)buf);
5700 break;
5701 case ZONE_ATTR_HOSTID:
5702 if (bufsize == sizeof (zone->zone_hostid)) {
5703 if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5704 err = 0;
5705 else
5706 err = EFAULT;
5707 } else {
5708 err = EINVAL;
5709 }
5710 break;
5711 case ZONE_ATTR_NETWORK:
5712 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5713 err = EINVAL;
5714 break;
5715 }
5716 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5717 if (copyin(buf, zbuf, bufsize) != 0) {
5718 kmem_free(zbuf, bufsize);
5719 err = EFAULT;
5720 break;
5721 }
5722 err = zone_set_network(zoneid, zbuf);
5723 kmem_free(zbuf, bufsize);
5724 break;
5725 default:
5726 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5727 err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5728 else
5729 err = EINVAL;
5730 }
5731
5732 done:
5733 zone_rele(zone);
5734 ASSERT(err != -1);
5735 return (err != 0 ? set_errno(err) : 0);
5736 }
5737
5738 /*
5739 * Return zero if the process has at least one vnode mapped in to its
5740 * address space which shouldn't be allowed to change zones.
5741 *
5742 * Also return zero if the process has any shared mappings which reserve
5743 * swap. This is because the counting for zone.max-swap does not allow swap
5744 * reservation to be shared between zones. zone swap reservation is counted
6508 door_arg_t darg, save_arg;
6509 char *zone_name;
6510 size_t zone_namelen;
6511 zoneid_t zoneid;
6512 zone_t *zone;
6513 zone_cmd_arg_t arg;
6514 uint64_t uniqid;
6515 size_t size;
6516 int error;
6517 int retry;
6518
6519 zone = zargp->zone;
6520 arg = zargp->arg;
6521 kmem_free(zargp, sizeof (*zargp));
6522
6523 zone_namelen = strlen(zone->zone_name) + 1;
6524 zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6525 bcopy(zone->zone_name, zone_name, zone_namelen);
6526 zoneid = zone->zone_id;
6527 uniqid = zone->zone_uniqid;
6528 /*
6529 * zoneadmd may be down, but at least we can empty out the zone.
6530 * We can ignore the return value of zone_empty() since we're called
6531 * from a kernel thread and know we won't be delivered any signals.
6532 */
6533 ASSERT(curproc == &p0);
6534 (void) zone_empty(zone);
6535 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6536 zone_rele(zone);
6537
6538 size = sizeof (arg);
6539 darg.rbuf = (char *)&arg;
6540 darg.data_ptr = (char *)&arg;
6541 darg.rsize = size;
6542 darg.data_size = size;
6543 darg.desc_ptr = NULL;
6544 darg.desc_num = 0;
6545
6546 save_arg = darg;
6547 /*
6748
6749 /*
6750 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6751 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6752 * could cause assertions to fail (e.g., assertions about a zone's
6753 * state during initialization, readying, or booting) or produce races.
6754 * We'll let threads continue to initialize and ready new zones: they'll
6755 * fail to boot the new zones when they see that the global zone is
6756 * shutting down.
6757 */
6758 for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6759 current_zonep = list_next(&zone_active, current_zonep)) {
6760 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6761 zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6762 }
6763 mutex_exit(&zone_status_lock);
6764 mutex_exit(&zonehash_lock);
6765 }
6766
6767 /*
6768 * Returns true if the named dataset is visible in the current zone.
6769 * The 'write' parameter is set to 1 if the dataset is also writable.
6770 */
6771 int
6772 zone_dataset_visible(const char *dataset, int *write)
6773 {
6774 static int zfstype = -1;
6775 zone_dataset_t *zd;
6776 size_t len;
6777 zone_t *zone = curproc->p_zone;
6778 const char *name = NULL;
6779 vfs_t *vfsp = NULL;
6780
6781 if (dataset[0] == '\0')
6782 return (0);
6783
6784 /*
6785 * Walk the list once, looking for datasets which match exactly, or
6786 * specify a dataset underneath an exported dataset. If found, return
6787 * true and note that it is writable.
6788 */
6789 for (zd = list_head(&zone->zone_datasets); zd != NULL;
6790 zd = list_next(&zone->zone_datasets, zd)) {
6791
6792 len = strlen(zd->zd_dataset);
6793 if (strlen(dataset) >= len &&
6794 bcmp(dataset, zd->zd_dataset, len) == 0 &&
6795 (dataset[len] == '\0' || dataset[len] == '/' ||
6796 dataset[len] == '@')) {
6797 if (write)
6825 /*
6826 * We reach here if the given dataset is not found in the zone_dataset
6827 * list. Check if this dataset was added as a filesystem (ie. "add fs")
6828 * instead of delegation. For this we search for the dataset in the
6829 * zone_vfslist of this zone. If found, return true and note that it is
6830 * not writable.
6831 */
6832
6833 /*
6834 * Initialize zfstype if it is not initialized yet.
6835 */
6836 if (zfstype == -1) {
6837 struct vfssw *vswp = vfs_getvfssw("zfs");
6838 zfstype = vswp - vfssw;
6839 vfs_unrefvfssw(vswp);
6840 }
6841
6842 vfs_list_read_lock();
6843 vfsp = zone->zone_vfslist;
6844 do {
6845 ASSERT(vfsp);
6846 if (vfsp->vfs_fstype == zfstype) {
6847 name = refstr_value(vfsp->vfs_resource);
6848
6849 /*
6850 * Check if we have an exact match.
6851 */
6852 if (strcmp(dataset, name) == 0) {
6853 vfs_list_unlock();
6854 if (write)
6855 *write = 0;
6856 return (1);
6857 }
6858 /*
6859 * We need to check if we are looking for parents of
6860 * a dataset. These should be visible, but read-only.
6861 */
6862 len = strlen(dataset);
6863 if (dataset[len - 1] == '/')
6864 len--;
6865
6866 if (len < strlen(name) &&
6867 bcmp(dataset, name, len) == 0 && name[len] == '/') {
6868 vfs_list_unlock();
6869 if (write)
6870 *write = 0;
6871 return (1);
6872 }
6873 }
6874 vfsp = vfsp->vfs_zone_next;
6875 } while (vfsp != zone->zone_vfslist);
6876
6877 vfs_list_unlock();
6878 return (0);
6879 }
6880
6881 /*
6882 * zone_find_by_any_path() -
6883 *
6884 * kernel-private routine similar to zone_find_by_path(), but which
6885 * effectively compares against zone paths rather than zonerootpath
6886 * (i.e., the last component of zonerootpaths, which should be "root/",
6887 * are not compared.) This is done in order to accurately identify all
6888 * paths, whether zone-visible or not, including those which are parallel
6889 * to /root/, such as /dev/, /home/, etc...
6890 *
6891 * If the specified path does not fall under any zone path then global
6892 * zone is returned.
6893 *
6894 * The treat_abs parameter indicates whether the path should be treated as
6895 * an absolute path although it does not begin with "/". (This supports
6896 * nfs mount syntax such as host:any/path.)
6897 *
6898 * The caller is responsible for zone_rele of the returned zone.
6899 */
6900 zone_t *
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2016, Joyent Inc.
25 */
26
27 /*
28 * Zones
29 *
30 * A zone is a named collection of processes, namespace constraints,
31 * and other system resources which comprise a secure and manageable
32 * application containment facility.
33 *
34 * Zones (represented by the reference counted zone_t) are tracked in
35 * the kernel in the zonehash. Elsewhere in the kernel, Zone IDs
36 * (zoneid_t) are used to track zone association. Zone IDs are
37 * dynamically generated when the zone is created; if a persistent
38 * identifier is needed (core files, accounting logs, audit trail,
39 * etc.), the zone name should be used.
40 *
41 *
42 * Global Zone:
43 *
44 * The global zone (zoneid 0) is automatically associated with all
233 #include <sys/klpd.h>
234
235 #include <sys/door.h>
236 #include <sys/cpuvar.h>
237 #include <sys/sdt.h>
238
239 #include <sys/uadmin.h>
240 #include <sys/session.h>
241 #include <sys/cmn_err.h>
242 #include <sys/modhash.h>
243 #include <sys/sunddi.h>
244 #include <sys/nvpair.h>
245 #include <sys/rctl.h>
246 #include <sys/fss.h>
247 #include <sys/brand.h>
248 #include <sys/zone.h>
249 #include <net/if.h>
250 #include <sys/cpucaps.h>
251 #include <vm/seg.h>
252 #include <sys/mac.h>
253 #include <sys/rt.h>
254 #include <sys/fx.h>
255
256 /*
257 * This constant specifies the number of seconds that threads waiting for
258 * subsystems to release a zone's general-purpose references will wait before
259 * they log the zone's reference counts. The constant's value shouldn't
260 * be so small that reference counts are unnecessarily reported for zones
261 * whose references are slowly released. On the other hand, it shouldn't be so
262 * large that users reboot their systems out of frustration over hung zones
263 * before the system logs the zones' reference counts.
264 */
265 #define ZONE_DESTROY_TIMEOUT_SECS 60
266
/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
	datalink_id_t	zdl_id;		/* ID of the delegated data link */
	nvlist_t	*zdl_net;	/* network attributes for the link */
	list_node_t	zdl_linkage;	/* linkage on the zone's datalink list */
} zone_dl_t;
273
274 /*
355 /*
356 * This array contains the names of the subsystems listed in zone_ref_subsys_t
357 * (see sys/zone.h).
358 */
359 static char *zone_ref_subsys_names[] = {
360 "NFS", /* ZONE_REF_NFS */
361 "NFSv4", /* ZONE_REF_NFSV4 */
362 "SMBFS", /* ZONE_REF_SMBFS */
363 "MNTFS", /* ZONE_REF_MNTFS */
364 "LOFI", /* ZONE_REF_LOFI */
365 "VFS", /* ZONE_REF_VFS */
366 "IPC" /* ZONE_REF_IPC */
367 };
368
369 /*
370 * This isn't static so lint doesn't complain.
371 */
372 rctl_hndl_t rc_zone_cpu_shares;
373 rctl_hndl_t rc_zone_locked_mem;
374 rctl_hndl_t rc_zone_max_swap;
375 rctl_hndl_t rc_zone_phys_mem;
376 rctl_hndl_t rc_zone_max_lofi;
377 rctl_hndl_t rc_zone_cpu_cap;
378 rctl_hndl_t rc_zone_zfs_io_pri;
379 rctl_hndl_t rc_zone_nlwps;
380 rctl_hndl_t rc_zone_nprocs;
381 rctl_hndl_t rc_zone_shmmax;
382 rctl_hndl_t rc_zone_shmmni;
383 rctl_hndl_t rc_zone_semmni;
384 rctl_hndl_t rc_zone_msgmni;
385
386 const char * const zone_default_initname = "/sbin/init";
387 static char * const zone_prefix = "/zone/";
388 static int zone_shutdown(zoneid_t zoneid);
389 static int zone_add_datalink(zoneid_t, datalink_id_t);
390 static int zone_remove_datalink(zoneid_t, datalink_id_t);
391 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
392 static int zone_set_network(zoneid_t, zone_net_data_t *);
393 static int zone_get_network(zoneid_t, zone_net_data_t *);
394
395 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
1728 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1729 rctl_qty_t nv)
1730 {
1731 ASSERT(MUTEX_HELD(&p->p_lock));
1732 ASSERT(e->rcep_t == RCENTITY_ZONE);
1733 if (e->rcep_p.zone == NULL)
1734 return (0);
1735 e->rcep_p.zone->zone_max_swap_ctl = nv;
1736 return (0);
1737 }
1738
/* rctl operations vector for zone.max-swap (no action on limit hit). */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};
1745
1746 /*ARGSUSED*/
1747 static rctl_qty_t
1748 zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
1749 {
1750 rctl_qty_t q;
1751 zone_t *z = p->p_zone;
1752
1753 ASSERT(MUTEX_HELD(&p->p_lock));
1754 /* No additional lock because not enforced in the kernel */
1755 q = z->zone_phys_mem;
1756 return (q);
1757 }
1758
1759 /*ARGSUSED*/
1760 static int
1761 zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1762 rctl_qty_t nv)
1763 {
1764 ASSERT(MUTEX_HELD(&p->p_lock));
1765 ASSERT(e->rcep_t == RCENTITY_ZONE);
1766 if (e->rcep_p.zone == NULL)
1767 return (0);
1768 e->rcep_p.zone->zone_phys_mem_ctl = nv;
1769 return (0);
1770 }
1771
/*
 * rctl operations vector for zone.max-physical-memory.  rcop_no_test:
 * the cap is tracked but not enforced by the kernel.
 */
static rctl_ops_t zone_phys_mem_ops = {
	rcop_no_action,
	zone_phys_mem_usage,
	zone_phys_mem_set,
	rcop_no_test
};
1778
1779 /*ARGSUSED*/
1780 static rctl_qty_t
1781 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1782 {
1783 rctl_qty_t q;
1784 zone_t *z = p->p_zone;
1785
1786 ASSERT(MUTEX_HELD(&p->p_lock));
1787 mutex_enter(&z->zone_rctl_lock);
1788 q = z->zone_max_lofi;
1789 mutex_exit(&z->zone_rctl_lock);
1790 return (q);
1791 }
1792
1793 /*ARGSUSED*/
1794 static int
1795 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1796 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1797 {
1798 rctl_qty_t q;
1799 zone_t *z;
1800
1854 crhold(cr);
1855 zone_rele(zone);
1856 return (cr);
1857 }
1858
1859 static int
1860 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1861 {
1862 zone_t *zone = ksp->ks_private;
1863 zone_kstat_t *zk = ksp->ks_data;
1864
1865 if (rw == KSTAT_WRITE)
1866 return (EACCES);
1867
1868 zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1869 zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1870 return (0);
1871 }
1872
1873 static int
1874 zone_physmem_kstat_update(kstat_t *ksp, int rw)
1875 {
1876 zone_t *zone = ksp->ks_private;
1877 zone_kstat_t *zk = ksp->ks_data;
1878
1879 if (rw == KSTAT_WRITE)
1880 return (EACCES);
1881
1882 zk->zk_usage.value.ui64 = zone->zone_phys_mem;
1883 zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
1884 return (0);
1885 }
1886
1887 static int
1888 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1889 {
1890 zone_t *zone = ksp->ks_private;
1891 zone_kstat_t *zk = ksp->ks_data;
1892
1893 if (rw == KSTAT_WRITE)
1894 return (EACCES);
1895
1896 zk->zk_usage.value.ui64 = zone->zone_nprocs;
1897 zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1898 return (0);
1899 }
1900
1901 static int
1902 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1903 {
1904 zone_t *zone = ksp->ks_private;
1905 zone_kstat_t *zk = ksp->ks_data;
1906
1907 if (rw == KSTAT_WRITE)
1921
1922 ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1923 sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1924 KSTAT_FLAG_VIRTUAL);
1925
1926 if (ksp == NULL)
1927 return (NULL);
1928
1929 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1930 ksp->ks_data_size += strlen(zone->zone_name) + 1;
1931 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1932 kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1933 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1934 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1935 ksp->ks_update = updatefunc;
1936 ksp->ks_private = zone;
1937 kstat_install(ksp);
1938 return (ksp);
1939 }
1940
1941 static int
1942 zone_vfs_kstat_update(kstat_t *ksp, int rw)
1943 {
1944 zone_t *zone = ksp->ks_private;
1945 zone_vfs_kstat_t *zvp = ksp->ks_data;
1946 kstat_io_t *kiop = &zone->zone_vfs_rwstats;
1947
1948 if (rw == KSTAT_WRITE)
1949 return (EACCES);
1950
1951 /*
1952 * Extract the VFS statistics from the kstat_io_t structure used by
1953 * kstat_runq_enter() and related functions. Since the slow ops
1954 * counters are updated directly by the VFS layer, there's no need to
1955 * copy those statistics here.
1956 *
1957 * Note that kstat_runq_enter() and the related functions use
1958 * gethrtime_unscaled(), so scale the time here.
1959 */
1960 zvp->zv_nread.value.ui64 = kiop->nread;
1961 zvp->zv_reads.value.ui64 = kiop->reads;
1962 zvp->zv_rtime.value.ui64 = kiop->rtime;
1963 zvp->zv_rcnt.value.ui64 = kiop->rcnt;
1964 zvp->zv_rlentime.value.ui64 = kiop->rlentime;
1965 zvp->zv_nwritten.value.ui64 = kiop->nwritten;
1966 zvp->zv_writes.value.ui64 = kiop->writes;
1967 zvp->zv_wtime.value.ui64 = kiop->wtime;
1968 zvp->zv_wcnt.value.ui64 = kiop->wcnt;
1969 zvp->zv_wlentime.value.ui64 = kiop->wlentime;
1970
1971 scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
1972 scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
1973 scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
1974 scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
1975
1976 return (0);
1977 }
1978
/*
 * Create the per-zone "zone_vfs" kstat publishing VFS I/O statistics.
 * On success the kstat's data buffer doubles as zone->zone_vfs_stats and
 * is protected by zone->zone_vfs_lock.  Returns NULL if the kstat cannot
 * be created (the caller falls back to a bare stats buffer).
 */
static kstat_t *
zone_vfs_kstat_create(zone_t *zone)
{
	kstat_t *ksp;
	zone_vfs_kstat_t *zvp;

	if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
	    zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
	    sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
		return (NULL);

	/* Make non-global zones' stats visible from the global zone too. */
	if (zone->zone_id != GLOBAL_ZONEID)
		kstat_zone_add(ksp, GLOBAL_ZONEID);

	zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	ksp->ks_lock = &zone->zone_vfs_lock;
	zone->zone_vfs_stats = zvp;

	/* The kstat "name" field is not large enough for a full zonename */
	kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
	kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
	kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);

	ksp->ks_update = zone_vfs_kstat_update;
	ksp->ks_private = zone;

	kstat_install(ksp);
	return (ksp);
}
2025
2026 static int
2027 zone_zfs_kstat_update(kstat_t *ksp, int rw)
2028 {
2029 zone_t *zone = ksp->ks_private;
2030 zone_zfs_kstat_t *zzp = ksp->ks_data;
2031 kstat_io_t *kiop = &zone->zone_zfs_rwstats;
2032
2033 if (rw == KSTAT_WRITE)
2034 return (EACCES);
2035
2036 /*
2037 * Extract the ZFS statistics from the kstat_io_t structure used by
2038 * kstat_runq_enter() and related functions. Since the I/O throttle
2039 * counters are updated directly by the ZFS layer, there's no need to
2040 * copy those statistics here.
2041 *
2042 * Note that kstat_runq_enter() and the related functions use
2043 * gethrtime_unscaled(), so scale the time here.
2044 */
2045 zzp->zz_nread.value.ui64 = kiop->nread;
2046 zzp->zz_reads.value.ui64 = kiop->reads;
2047 zzp->zz_rtime.value.ui64 = kiop->rtime;
2048 zzp->zz_rlentime.value.ui64 = kiop->rlentime;
2049 zzp->zz_nwritten.value.ui64 = kiop->nwritten;
2050 zzp->zz_writes.value.ui64 = kiop->writes;
2051
2052 scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
2053 scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
2054
2055 return (0);
2056 }
2057
/*
 * Create the per-zone "zone_zfs" kstat publishing ZFS I/O statistics.
 * On success the kstat's data buffer doubles as zone->zone_zfs_stats and
 * is protected by zone->zone_zfs_lock.  Returns NULL if the kstat cannot
 * be created.
 */
static kstat_t *
zone_zfs_kstat_create(zone_t *zone)
{
	kstat_t *ksp;
	zone_zfs_kstat_t *zzp;

	if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
	    zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
	    sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
		return (NULL);

	/* Make non-global zones' stats visible from the global zone too. */
	if (zone->zone_id != GLOBAL_ZONEID)
		kstat_zone_add(ksp, GLOBAL_ZONEID);

	zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	ksp->ks_lock = &zone->zone_zfs_lock;
	zone->zone_zfs_stats = zzp;

	/* The kstat "name" field is not large enough for a full zonename */
	kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
	kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
	kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
	kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
	kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
	kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
	kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
	kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);

	ksp->ks_update = zone_zfs_kstat_update;
	ksp->ks_private = zone;

	kstat_install(ksp);
	return (ksp);
}
2095
2096 static int
2097 zone_mcap_kstat_update(kstat_t *ksp, int rw)
2098 {
2099 zone_t *zone = ksp->ks_private;
2100 zone_mcap_kstat_t *zmp = ksp->ks_data;
2101
2102 if (rw == KSTAT_WRITE)
2103 return (EACCES);
2104
2105 zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
2106 zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
2107 zmp->zm_swap.value.ui64 = zone->zone_max_swap;
2108 zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
2109 zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
2110 zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
2111 zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
2112 zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
2113 zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
2114 zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
2115 zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
2116 zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
2117 zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
2118
2119 return (0);
2120 }
2121
2122 static kstat_t *
2123 zone_mcap_kstat_create(zone_t *zone)
2124 {
2125 kstat_t *ksp;
2126 zone_mcap_kstat_t *zmp;
2127
2128 if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2129 zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2130 sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2131 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2132 return (NULL);
2133
2134 if (zone->zone_id != GLOBAL_ZONEID)
2135 kstat_zone_add(ksp, GLOBAL_ZONEID);
2136
2137 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2138 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2139 ksp->ks_lock = &zone->zone_mcap_lock;
2140 zone->zone_mcap_stats = zmp;
2141
2142 /* The kstat "name" field is not large enough for a full zonename */
2143 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2144 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2145 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2146 kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2147 kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2148 kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2149 kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2150 kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2151 kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2152 kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2153 kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2154 kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2155 kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2156 kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2157 KSTAT_DATA_UINT64);
2158 kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2159 KSTAT_DATA_UINT64);
2160 kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2161 KSTAT_DATA_UINT64);
2162
2163 ksp->ks_update = zone_mcap_kstat_update;
2164 ksp->ks_private = zone;
2165
2166 kstat_install(ksp);
2167 return (ksp);
2168 }
2169
2170 static int
2171 zone_misc_kstat_update(kstat_t *ksp, int rw)
2172 {
2173 zone_t *zone = ksp->ks_private;
2174 zone_misc_kstat_t *zmp = ksp->ks_data;
2175 hrtime_t tmp;
2176
2177 if (rw == KSTAT_WRITE)
2178 return (EACCES);
2179
2180 tmp = zone->zone_utime;
2181 scalehrtime(&tmp);
2241 kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2242 kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2243 KSTAT_DATA_UINT32);
2244 kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2245 kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2246
2247 ksp->ks_update = zone_misc_kstat_update;
2248 ksp->ks_private = zone;
2249
2250 kstat_install(ksp);
2251 return (ksp);
2252 }
2253
/*
 * Create all per-zone kstats.  For the kstats whose creation can fail,
 * fall back to allocating a bare stats structure so the counters can
 * still be accumulated even without a kstat to publish them.
 *
 * NOTE(review): the fallback buffers are not owned by any kstat, so they
 * are not freed by zone_kstat_delete(); presumably they are released when
 * the zone itself is freed — confirm against zone_free().
 */
static void
zone_kstat_create(zone_t *zone)
{
	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
	    "lockedmem", zone_lockedmem_kstat_update);
	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
	    "swapresv", zone_swapresv_kstat_update);
	zone->zone_physmem_kstat = zone_kstat_create_common(zone,
	    "physicalmem", zone_physmem_kstat_update);
	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
	    "nprocs", zone_nprocs_kstat_update);

	if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
		zone->zone_vfs_stats = kmem_zalloc(
		    sizeof (zone_vfs_kstat_t), KM_SLEEP);
	}

	if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
		zone->zone_mcap_stats = kmem_zalloc(
		    sizeof (zone_mcap_kstat_t), KM_SLEEP);
	}

	if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
		zone->zone_misc_stats = kmem_zalloc(
		    sizeof (zone_misc_kstat_t), KM_SLEEP);
	}

}
2282
2283 static void
2284 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2285 {
2286 void *data;
2287
2288 if (*pkstat != NULL) {
2289 data = (*pkstat)->ks_data;
2290 kstat_delete(*pkstat);
2291 kmem_free(data, datasz);
2292 *pkstat = NULL;
2293 }
2294 }
2295
/*
 * Delete every kstat created by zone_kstat_create(), freeing each kstat's
 * data buffer.  NULL kstat pointers (creation failures) are skipped by
 * zone_kstat_delete_common().
 */
static void
zone_kstat_delete(zone_t *zone)
{
	zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
	    sizeof (zone_kstat_t));
	zone_kstat_delete_common(&zone->zone_swapresv_kstat,
	    sizeof (zone_kstat_t));
	zone_kstat_delete_common(&zone->zone_physmem_kstat,
	    sizeof (zone_kstat_t));
	zone_kstat_delete_common(&zone->zone_nprocs_kstat,
	    sizeof (zone_kstat_t));

	zone_kstat_delete_common(&zone->zone_vfs_ksp,
	    sizeof (zone_vfs_kstat_t));
	zone_kstat_delete_common(&zone->zone_mcap_ksp,
	    sizeof (zone_mcap_kstat_t));
	zone_kstat_delete_common(&zone->zone_misc_ksp,
	    sizeof (zone_misc_kstat_t));

}
2316
2317 /*
2318 * Called very early on in boot to initialize the ZSD list so that
2319 * zone_key_create() can be called before zone_init(). It also initializes
2320 * portions of zone0 which may be used before zone_init() is called. The
2321 * variable "global_zone" will be set when zone0 is fully initialized by
2322 * zone_init().
2323 */
2324 void
2325 zone_zsd_init(void)
2326 {
2327 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2328 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2329 list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2330 offsetof(struct zsd_entry, zsd_linkage));
2331 list_create(&zone_active, sizeof (zone_t),
2332 offsetof(zone_t, zone_linkage));
2333 list_create(&zone_deathrow, sizeof (zone_t),
2334 offsetof(zone_t, zone_linkage));
2335
2336 mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
2337 mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2338 mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2339 zone0.zone_shares = 1;
2340 zone0.zone_nlwps = 0;
2341 zone0.zone_nlwps_ctl = INT_MAX;
2342 zone0.zone_nprocs = 0;
2343 zone0.zone_nprocs_ctl = INT_MAX;
2344 zone0.zone_locked_mem = 0;
2345 zone0.zone_locked_mem_ctl = UINT64_MAX;
2346 ASSERT(zone0.zone_max_swap == 0);
2347 zone0.zone_max_swap_ctl = UINT64_MAX;
2348 zone0.zone_phys_mem = 0;
2349 zone0.zone_phys_mem_ctl = UINT64_MAX;
2350 zone0.zone_max_lofi = 0;
2351 zone0.zone_max_lofi_ctl = UINT64_MAX;
2352 zone0.zone_shmmax = 0;
2353 zone0.zone_ipc.ipcq_shmmni = 0;
2354 zone0.zone_ipc.ipcq_semmni = 0;
2355 zone0.zone_ipc.ipcq_msgmni = 0;
2356 zone0.zone_name = GLOBAL_ZONENAME;
2357 zone0.zone_nodename = utsname.nodename;
2358 zone0.zone_domain = srpc_domain;
2359 zone0.zone_hostid = HW_INVALID_HOSTID;
2360 zone0.zone_fs_allowed = NULL;
2361 zone0.zone_ref = 1;
2362 zone0.zone_id = GLOBAL_ZONEID;
2363 zone0.zone_status = ZONE_IS_RUNNING;
2364 zone0.zone_rootpath = "/";
2365 zone0.zone_rootpathlen = 2;
2366 zone0.zone_psetid = ZONE_PS_INVAL;
2367 zone0.zone_ncpus = 0;
2368 zone0.zone_ncpus_online = 0;
2369 zone0.zone_proc_initpid = 1;
2370 zone0.zone_initname = initname;
2371 zone0.zone_lockedmem_kstat = NULL;
2372 zone0.zone_swapresv_kstat = NULL;
2373 zone0.zone_physmem_kstat = NULL;
2374 zone0.zone_nprocs_kstat = NULL;
2375 zone0.zone_zfs_io_pri = 1;
2376
2377 zone0.zone_stime = 0;
2378 zone0.zone_utime = 0;
2379 zone0.zone_wtime = 0;
2380
2381 list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2382 offsetof(zone_ref_t, zref_linkage));
2383 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2384 offsetof(struct zsd_entry, zsd_linkage));
2385 list_insert_head(&zone_active, &zone0);
2386
2387 /*
2388 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2389 * to anything meaningful. It is assigned to be 'rootdir' in
2390 * vfs_mountroot().
2391 */
2392 zone0.zone_rootvp = NULL;
2393 zone0.zone_vfslist = NULL;
2470 */
2471 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2472
2473 /*
2474 * Initialize generic zone resource controls, if any.
2475 */
2476 rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2477 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2478 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2479 FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2480
2481 rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2482 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2483 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2484 RCTL_GLOBAL_INFINITE,
2485 MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2486
2487 rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2488 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2489 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2490 16384, 16384, &zone_zfs_io_pri_ops);
2491
2492 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2493 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2494 INT_MAX, INT_MAX, &zone_lwps_ops);
2495
2496 rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2497 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2498 INT_MAX, INT_MAX, &zone_procs_ops);
2499
2500 /*
2501 * System V IPC resource controls
2502 */
2503 rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2504 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2505 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2506
2507 rc_zone_semmni = rctl_register("zone.max-sem-ids",
2508 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2509 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
2510
2523 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2524 bzero(dval, sizeof (rctl_val_t));
2525 dval->rcv_value = 1;
2526 dval->rcv_privilege = RCPRIV_PRIVILEGED;
2527 dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2528 dval->rcv_action_recip_pid = -1;
2529
2530 rde = rctl_dict_lookup("zone.cpu-shares");
2531 (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2532
2533 rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2534 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2535 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2536 &zone_locked_mem_ops);
2537
2538 rc_zone_max_swap = rctl_register("zone.max-swap",
2539 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2540 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2541 &zone_max_swap_ops);
2542
2543 rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2544 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2545 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2546 &zone_phys_mem_ops);
2547
2548 rc_zone_max_lofi = rctl_register("zone.max-lofi",
2549 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2550 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2551 &zone_max_lofi_ops);
2552
2553 /*
2554 * Initialize the ``global zone''.
2555 */
2556 set = rctl_set_create();
2557 gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2558 mutex_enter(&p0.p_lock);
2559 e.rcep_p.zone = &zone0;
2560 e.rcep_t = RCENTITY_ZONE;
2561 zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2562 gp);
2563
2564 zone0.zone_nlwps = p0.p_lwpcnt;
2565 zone0.zone_nprocs = 1;
2566 zone0.zone_ntasks = 1;
2567 mutex_exit(&p0.p_lock);
2568 zone0.zone_restart_init = B_TRUE;
2569 zone0.zone_reboot_on_init_exit = B_FALSE;
2570 zone0.zone_init_status = -1;
2571 zone0.zone_brand = &native_brand;
2572 rctl_prealloc_destroy(gp);
2573 /*
2574 * pool_default hasn't been initialized yet, so we let pool_init()
2575 * take care of making sure the global zone is in the default pool.
2576 */
2577
2578 /*
2579 * Initialize global zone kstats
2580 */
2581 zone_kstat_create(&zone0);
2582
2583 /*
2584 * Initialize zone label.
2585 * mlp are initialized when tnzonecfg is loaded.
2586 */
2587 zone0.zone_slabel = l_admin_low;
2588 rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2589 label_hold(l_admin_low);
2590
2630 /*
2631 * The global zone is fully initialized (except for zone_rootvp which
2632 * will be set when the root filesystem is mounted).
2633 */
2634 global_zone = &zone0;
2635
2636 /*
2637 * Setup an event channel to send zone status change notifications on
2638 */
2639 res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
2640 EVCH_CREAT);
2641
2642 if (res)
2643 panic("Sysevent_evc_bind failed during zone setup.\n");
2644
2645 }
2646
2647 static void
2648 zone_free(zone_t *zone)
2649 {
2650 zone_dl_t *zdl;
2651
2652 ASSERT(zone != global_zone);
2653 ASSERT(zone->zone_ntasks == 0);
2654 ASSERT(zone->zone_nlwps == 0);
2655 ASSERT(zone->zone_nprocs == 0);
2656 ASSERT(zone->zone_cred_ref == 0);
2657 ASSERT(zone->zone_kcred == NULL);
2658 ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2659 zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2660 ASSERT(list_is_empty(&zone->zone_ref_list));
2661
2662 /*
2663 * Remove any zone caps.
2664 */
2665 cpucaps_zone_remove(zone);
2666
2667 ASSERT(zone->zone_cpucap == NULL);
2668
2669 /* remove from deathrow list */
2670 if (zone_status_get(zone) == ZONE_IS_DEAD) {
2671 ASSERT(zone->zone_ref == 0);
2672 mutex_enter(&zone_deathrow_lock);
2673 list_remove(&zone_deathrow, zone);
2674 mutex_exit(&zone_deathrow_lock);
2675 }
2676
2677 list_destroy(&zone->zone_ref_list);
2678 zone_free_zsd(zone);
2679 zone_free_datasets(zone);
2680
2681 /*
2682 * While dlmgmtd should have removed all of these, it could have left
2683 * something behind or crashed. In which case it's not safe for us to
2684 * assume that the list is empty which list_destroy() will ASSERT. We
2685 * clean up for our userland comrades which may have crashed, or worse,
2686 * been disabled by SMF.
2687 */
2688 while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
2689 if (zdl->zdl_net != NULL)
2690 nvlist_free(zdl->zdl_net);
2691 kmem_free(zdl, sizeof (zone_dl_t));
2692 }
2693 list_destroy(&zone->zone_dl_list);
2694
2695 if (zone->zone_rootvp != NULL)
2696 VN_RELE(zone->zone_rootvp);
2697 if (zone->zone_rootpath)
2698 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2699 if (zone->zone_name != NULL)
2700 kmem_free(zone->zone_name, ZONENAME_MAX);
2701 if (zone->zone_slabel != NULL)
2702 label_rele(zone->zone_slabel);
2703 if (zone->zone_nodename != NULL)
2704 kmem_free(zone->zone_nodename, _SYS_NMLN);
2705 if (zone->zone_domain != NULL)
2706 kmem_free(zone->zone_domain, _SYS_NMLN);
2707 if (zone->zone_privset != NULL)
2708 kmem_free(zone->zone_privset, sizeof (priv_set_t));
2709 if (zone->zone_rctls != NULL)
2710 rctl_set_free(zone->zone_rctls);
2711 if (zone->zone_bootargs != NULL)
2712 strfree(zone->zone_bootargs);
2808 kmem_free(attrp, sizeof (struct brand_attr));
2809 if (bp == NULL)
2810 return (EINVAL);
2811
2812 /*
	 * This is the only place where a zone can change its brand.
2814 * We already need to hold zone_status_lock to check the zone
2815 * status, so we'll just use that lock to serialize zone
2816 * branding requests as well.
2817 */
2818 mutex_enter(&zone_status_lock);
2819
2820 /* Re-Branding is not allowed and the zone can't be booted yet */
2821 if ((ZONE_IS_BRANDED(zone)) ||
2822 (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2823 mutex_exit(&zone_status_lock);
2824 brand_unregister_zone(bp);
2825 return (EINVAL);
2826 }
2827
2828 /*
2829 * Set up the brand specific data.
2830 * Note that it's possible that the hook has to drop the
	 * zone_status_lock and reacquire it before returning so we can't
2832 * assume the lock has been held the entire time.
2833 */
2834 zone->zone_brand = bp;
2835 ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
2836
2837 mutex_exit(&zone_status_lock);
2838 return (0);
2839 }
2840
2841 static int
2842 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2843 {
2844 char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2845 int err = 0;
2846
2847 ASSERT(zone != global_zone);
2848 if ((err = copyinstr(zone_fs_allowed, buf,
2849 ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2850 goto done;
2851
2852 if (zone->zone_fs_allowed != NULL)
2853 strfree(zone->zone_fs_allowed);
2854
2855 zone->zone_fs_allowed = strdup(buf);
2861
2862 static int
2863 zone_set_initname(zone_t *zone, const char *zone_initname)
2864 {
2865 char initname[INITNAME_SZ];
2866 size_t len;
2867 int err = 0;
2868
2869 ASSERT(zone != global_zone);
2870 if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2871 return (err); /* EFAULT or ENAMETOOLONG */
2872
2873 if (zone->zone_initname != NULL)
2874 strfree(zone->zone_initname);
2875
2876 zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2877 (void) strcpy(zone->zone_initname, initname);
2878 return (0);
2879 }
2880
2881 /*
2882 * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
2883 * to provide the physical memory capping kstats. Since physical memory
2884 * capping is currently implemented in userland, that code uses the setattr
2885 * entry point to increment the kstats. We always simply increment nover
2886 * every time that setattr is called and we always add in the input value
2887 * to zone_mcap_pagedout every time that is called.
2888 */
2889 /*ARGSUSED*/
2890 static int
2891 zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
2892 {
2893 zone->zone_mcap_nover++;
2894
2895 return (0);
2896 }
2897
2898 static int
2899 zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
2900 {
2901 uint64_t pageout;
2902 int err;
2903
2904 if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
2905 zone->zone_mcap_pagedout += pageout;
2906
2907 return (err);
2908 }
2909
2910 /*
2911 * The zone_set_page_fault_delay function is used to set the number of usecs
2912 * to throttle page faults. This is normally 0 but can be set to a non-0 value
2913 * by the user-land memory capping code when the zone is over its physcial
2914 * memory cap.
2915 */
2916 static int
2917 zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
2918 {
2919 uint32_t dusec;
2920 int err;
2921
2922 if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
2923 zone->zone_pg_flt_delay = dusec;
2924
2925 return (err);
2926 }
2927
2928 /*
2929 * The zone_set_rss function is used to set the zone's RSS when we do the
2930 * fast, approximate calculation in user-land.
2931 */
2932 static int
2933 zone_set_rss(zone_t *zone, const uint64_t *prss)
2934 {
2935 uint64_t rss;
2936 int err;
2937
2938 if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
2939 zone->zone_phys_mem = rss;
2940
2941 return (err);
2942 }
2943
2944 static int
2945 zone_set_sched_class(zone_t *zone, const char *new_class)
2946 {
2947 char sched_class[PC_CLNMSZ];
2948 id_t classid;
2949 int err;
2950
2951 ASSERT(zone != global_zone);
2952 if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2953 return (err); /* EFAULT or ENAMETOOLONG */
2954
2955 if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2956 return (set_errno(EINVAL));
2957 zone->zone_defaultcid = classid;
2958 ASSERT(zone->zone_defaultcid > 0 &&
2959 zone->zone_defaultcid < loaded_classes);
2960
2961 return (0);
2962 }
2963
2964 /*
4074 return (0);
4075 }
4076
/*
 * Non-global zone version of start_init.
 *
 * Runs in the context of the zone's nascent init process: records init's
 * pid in the zone, optionally makes contract-member exits fatal to init,
 * runs start_init_common(), and then either transitions the zone to
 * ZONE_IS_RUNNING or -- if boot failed or the global zone is shutting
 * down -- moves the zone toward shutdown and disposes of this process.
 */
void
zone_start_init(void)
{
	proc_t *p = ttoproc(curthread);
	zone_t *z = p->p_zone;

	ASSERT(!INGLOBALZONE(curproc));

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	z->zone_proc_initpid = p->p_pid;

	if (z->zone_setup_app_contract == B_TRUE) {
		/*
		 * Normally a process cannot modify its own contract, but we're
		 * just starting the zone's init process and its contract is
		 * always initialized from the sys_process_tmpl template, so
		 * this is the simplest way to setup init's contract to kill
		 * the process if any other process in the contract exits.
		 */
		p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
	}

	/*
	 * We maintain zone_boot_err so that we can return the cause of the
	 * failure back to the caller of the zone_boot syscall.
	 */
	p->p_zone->zone_boot_err = start_init_common();

	/*
	 * We will prevent booting zones from becoming running zones if the
	 * global zone is shutting down.
	 */
	mutex_enter(&zone_status_lock);
	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
	    ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(z) == ZONE_IS_BOOTING) {
			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
			/*
			 * proc_exit() failed; fall back to exiting just
			 * this lwp (the exit-lwps flag must already be set).
			 */
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		id_t cid = curthread->t_cid;

		if (zone_status_get(z) == ZONE_IS_BOOTING)
			zone_status_set(z, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);

		/* Apply scheduling-class specific fixups to init's thread. */
		mutex_enter(&class_lock);
		ASSERT(cid < loaded_classes);
		if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
		    z->zone_fixed_hipri) {
			/*
			 * If the zone is using FX then by default all
			 * processes start at the lowest priority and stay
			 * there. We provide a mechanism for the zone to
			 * indicate that it should run at "high priority". In
			 * this case we setup init to run at the highest FX
			 * priority (which is one level higher than the
			 * non-fixed scheduling classes can use).
			 */
			pcparms_t pcparms;

			pcparms.pc_cid = cid;
			((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
			((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
			    FXMAXUPRI;
			((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
			    FX_DOUPRILIM | FX_DOUPRI;

			/* parmsset() requires pidlock and our p_lock. */
			mutex_enter(&pidlock);
			mutex_enter(&curproc->p_lock);

			(void) parmsset(&pcparms, curthread);

			mutex_exit(&curproc->p_lock);
			mutex_exit(&pidlock);
		} else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
			/*
			 * zsched always starts the init lwp at priority
			 * minclsyspri - 1. This priority gets set in t_pri and
			 * is invalid for RT, but RT never uses t_pri. However
			 * t_pri is used by procfs, so we always see processes
			 * within an RT zone with an invalid priority value.
			 * We fix that up now.
			 */
			curthread->t_pri = RTGPPRIO0;
		}
		mutex_exit(&class_lock);

		/* cause the process to return to userland. */
		lwp_rtt();
	}
}
4185
/*
 * Argument bundle passed from zone_create() to the zsched() kernel
 * process via newproc().
 */
struct zsched_arg {
	zone_t *zone;		/* the newly created zone */
	nvlist_t *nvlist;	/* parsed rctl settings (see zone_create()) */
};
4190
4191 /*
4192 * Per-zone "sched" workalike. The similarity to "sched" doesn't have
4193 * anything to do with scheduling, but rather with the fact that
4194 * per-zone kernel threads are parented to zsched, just like regular
4195 * kernel threads are parented to sched (p0).
4196 *
4197 * zsched is also responsible for launching init for the zone.
4198 */
4199 static void
4200 zsched(void *arg)
4202 struct zsched_arg *za = arg;
4203 proc_t *pp = curproc;
4204 proc_t *initp = proc_init;
4205 zone_t *zone = za->zone;
4206 cred_t *cr, *oldcred;
4207 rctl_set_t *set;
4208 rctl_alloc_gp_t *gp;
4209 contract_t *ct = NULL;
4210 task_t *tk, *oldtk;
4211 rctl_entity_p_t e;
4212 kproject_t *pj;
4213
4214 nvlist_t *nvl = za->nvlist;
4215 nvpair_t *nvp = NULL;
4216
4217 bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
4218 bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
4219 PTOU(pp)->u_argc = 0;
4220 PTOU(pp)->u_argv = NULL;
4221 PTOU(pp)->u_envp = NULL;
4222 PTOU(pp)->u_commpagep = NULL;
4223 closeall(P_FINFO(pp));
4224
4225 /*
4226 * We are this zone's "zsched" process. As the zone isn't generally
4227 * visible yet we don't need to grab any locks before initializing its
4228 * zone_proc pointer.
4229 */
4230 zone_hold(zone); /* this hold is released by zone_destroy() */
4231 zone->zone_zsched = pp;
4232 mutex_enter(&pp->p_lock);
4233 pp->p_zone = zone;
4234 mutex_exit(&pp->p_lock);
4235
4236 /*
4237 * Disassociate process from its 'parent'; parent ourselves to init
4238 * (pid 1) and change other values as needed.
4239 */
4240 sess_create();
4241
4242 mutex_enter(&pidlock);
4645 goto out;
4646 }
4647 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
4648 /*
4649 * nvl may have been allocated/free'd, but the value set to
4650 * non-NULL, so we reset it here.
4651 */
4652 nvl = NULL;
4653 error = EINVAL;
4654 goto out;
4655 }
4656 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4657 rctl_dict_entry_t *rde;
4658 rctl_hndl_t hndl;
4659 nvlist_t **nvlarray;
4660 uint_t i, nelem;
4661 char *name;
4662
4663 error = EINVAL;
4664 name = nvpair_name(nvp);
4665 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
4666 strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
4667 nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4668 goto out;
4669 }
4670 if ((hndl = rctl_hndl_lookup(name)) == -1) {
4671 goto out;
4672 }
4673 rde = rctl_dict_lookup_hndl(hndl);
4674 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4675 ASSERT(error == 0);
4676 for (i = 0; i < nelem; i++) {
4677 if (error = nvlist2rctlval(nvlarray[i], &rv))
4678 goto out;
4679 }
4680 if (rctl_invalid_value(rde, &rv)) {
4681 error = EINVAL;
4682 goto out;
4683 }
4684 }
4685 error = 0;
4686 *nvlp = nvl;
4687 out:
4795 cred_t *zkcr;
4796 boolean_t insert_label_hash;
4797
4798 if (secpolicy_zone_config(CRED()) != 0)
4799 return (set_errno(EPERM));
4800
4801 /* can't boot zone from within chroot environment */
4802 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4803 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4804 extended_error));
4805
4806 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4807 zoneid = zone->zone_id = id_alloc(zoneid_space);
4808 zone->zone_status = ZONE_IS_UNINITIALIZED;
4809 zone->zone_pool = pool_default;
4810 zone->zone_pool_mod = gethrtime();
4811 zone->zone_psetid = ZONE_PS_INVAL;
4812 zone->zone_ncpus = 0;
4813 zone->zone_ncpus_online = 0;
4814 zone->zone_restart_init = B_TRUE;
4815 zone->zone_reboot_on_init_exit = B_FALSE;
4816 zone->zone_init_status = -1;
4817 zone->zone_brand = &native_brand;
4818 zone->zone_initname = NULL;
4819 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4820 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4821 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4822 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4823 list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4824 offsetof(zone_ref_t, zref_linkage));
4825 list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4826 offsetof(struct zsd_entry, zsd_linkage));
4827 list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
4828 offsetof(zone_dataset_t, zd_linkage));
4829 list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
4830 offsetof(zone_dl_t, zdl_linkage));
4831 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
4832 rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
4833
4834 if (flags & ZCF_NET_EXCL) {
4835 zone->zone_flags |= ZF_NET_EXCL;
4836 }
4858 zone->zone_domain[0] = '\0';
4859 zone->zone_hostid = HW_INVALID_HOSTID;
4860 zone->zone_shares = 1;
4861 zone->zone_shmmax = 0;
4862 zone->zone_ipc.ipcq_shmmni = 0;
4863 zone->zone_ipc.ipcq_semmni = 0;
4864 zone->zone_ipc.ipcq_msgmni = 0;
4865 zone->zone_bootargs = NULL;
4866 zone->zone_fs_allowed = NULL;
4867 zone->zone_initname =
4868 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4869 (void) strcpy(zone->zone_initname, zone_default_initname);
4870 zone->zone_nlwps = 0;
4871 zone->zone_nlwps_ctl = INT_MAX;
4872 zone->zone_nprocs = 0;
4873 zone->zone_nprocs_ctl = INT_MAX;
4874 zone->zone_locked_mem = 0;
4875 zone->zone_locked_mem_ctl = UINT64_MAX;
4876 zone->zone_max_swap = 0;
4877 zone->zone_max_swap_ctl = UINT64_MAX;
4878 zone->zone_phys_mem = 0;
4879 zone->zone_phys_mem_ctl = UINT64_MAX;
4880 zone->zone_max_lofi = 0;
4881 zone->zone_max_lofi_ctl = UINT64_MAX;
4882 zone->zone_lockedmem_kstat = NULL;
4883 zone->zone_swapresv_kstat = NULL;
4884 zone->zone_physmem_kstat = NULL;
4885 zone->zone_zfs_io_pri = 1;
4886
4887 /*
4888 * Zsched initializes the rctls.
4889 */
4890 zone->zone_rctls = NULL;
4891
4892 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4893 zone_free(zone);
4894 return (zone_create_error(error, 0, extended_error));
4895 }
4896
4897 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4898 zone_free(zone);
4899 return (set_errno(error));
4900 }
4901
4902 /*
4903 * Read in the trusted system parameters:
4904 * match flag and sensitivity label.
5020 if (insert_label_hash) {
5021 (void) mod_hash_insert(zonehashbylabel,
5022 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
5023 zone->zone_flags |= ZF_HASHED_LABEL;
5024 }
5025
5026 /*
5027 * Insert into active list. At this point there are no 'hold's
5028 * on the zone, but everyone else knows not to use it, so we can
5029 * continue to use it. zsched() will do a zone_hold() if the
5030 * newproc() is successful.
5031 */
5032 list_insert_tail(&zone_active, zone);
5033 mutex_exit(&zonehash_lock);
5034
5035 zarg.zone = zone;
5036 zarg.nvlist = rctls;
5037 /*
5038 * The process, task, and project rctls are probably wrong;
5039 * we need an interface to get the default values of all rctls,
5040 * and initialize zsched appropriately. However, we allow zoneadmd
5041 * to pass down both zone and project rctls for the zone's init.
5042 */
5043 error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
5044 if (error != 0) {
5045 /*
5046 * We need to undo all globally visible state.
5047 */
5048 mutex_enter(&zonehash_lock);
5049 list_remove(&zone_active, zone);
5050 if (zone->zone_flags & ZF_HASHED_LABEL) {
5051 ASSERT(zone->zone_slabel != NULL);
5052 (void) mod_hash_destroy(zonehashbylabel,
5053 (mod_hash_key_t)zone->zone_slabel);
5054 }
5055 (void) mod_hash_destroy(zonehashbyname,
5056 (mod_hash_key_t)(uintptr_t)zone->zone_name);
5057 (void) mod_hash_destroy(zonehashbyid,
5058 (mod_hash_key_t)(uintptr_t)zone->zone_id);
5059 ASSERT(zonecount > 1);
5060 zonecount--;
5061 goto errout;
5921 err = copyoutstr(zone->zone_initname, buf, bufsize,
5922 NULL);
5923 if (err != 0 && err != ENAMETOOLONG)
5924 error = EFAULT;
5925 }
5926 break;
5927 case ZONE_ATTR_BOOTARGS:
5928 if (zone->zone_bootargs == NULL)
5929 outstr = "";
5930 else
5931 outstr = zone->zone_bootargs;
5932 size = strlen(outstr) + 1;
5933 if (bufsize > size)
5934 bufsize = size;
5935 if (buf != NULL) {
5936 err = copyoutstr(outstr, buf, bufsize, NULL);
5937 if (err != 0 && err != ENAMETOOLONG)
5938 error = EFAULT;
5939 }
5940 break;
5941 case ZONE_ATTR_SCHED_CLASS:
5942 mutex_enter(&class_lock);
5943
5944 if (zone->zone_defaultcid >= loaded_classes)
5945 outstr = "";
5946 else
5947 outstr = sclass[zone->zone_defaultcid].cl_name;
5948 size = strlen(outstr) + 1;
5949 if (bufsize > size)
5950 bufsize = size;
5951 if (buf != NULL) {
5952 err = copyoutstr(outstr, buf, bufsize, NULL);
5953 if (err != 0 && err != ENAMETOOLONG)
5954 error = EFAULT;
5955 }
5956
5957 mutex_exit(&class_lock);
5958 break;
5959 case ZONE_ATTR_HOSTID:
5960 if (zone->zone_hostid != HW_INVALID_HOSTID &&
5975 size = strlen(outstr) + 1;
5976 if (bufsize > size)
5977 bufsize = size;
5978 if (buf != NULL) {
5979 err = copyoutstr(outstr, buf, bufsize, NULL);
5980 if (err != 0 && err != ENAMETOOLONG)
5981 error = EFAULT;
5982 }
5983 break;
5984 case ZONE_ATTR_NETWORK:
5985 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5986 if (copyin(buf, zbuf, bufsize) != 0) {
5987 error = EFAULT;
5988 } else {
5989 error = zone_get_network(zoneid, zbuf);
5990 if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5991 error = EFAULT;
5992 }
5993 kmem_free(zbuf, bufsize);
5994 break;
5995 case ZONE_ATTR_SCHED_FIXEDHI:
5996 size = sizeof (boolean_t);
5997 if (bufsize > size)
5998 bufsize = size;
5999
6000 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6001 bufsize) != 0)
6002 error = EFAULT;
6003 break;
6004 default:
6005 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6006 size = bufsize;
6007 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6008 } else {
6009 error = EINVAL;
6010 }
6011 }
6012 zone_rele(zone);
6013
6014 if (error)
6015 return (set_errno(error));
6016 return ((ssize_t)size);
6017 }
6018
/*
 * System call entry point for zone_setattr(2).
 *
 * Checks the caller's privilege, looks up and holds the target zone, and
 * dispatches to the attribute-specific setter.  On the global zone only
 * the physical-memory-cap kstat attributes may be set; most other
 * attributes additionally require the zone to be no further along than
 * ZONE_IS_READY.  Setters return plain errno values (never -1); we apply
 * set_errno() exactly once, on the way out.
 */
/*ARGSUSED*/
static int
zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
{
	zone_t *zone;
	zone_status_t zone_status;
	int err = -1;	/* sentinel: every dispatch arm must overwrite this */
	zone_net_data_t *zbuf;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));

	/*
	 * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
	 * attributes can be set on the global zone.
	 */
	if (zoneid == GLOBAL_ZONEID &&
	    attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
		return (set_errno(EINVAL));
	}

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}
	/* Hold keeps the zone from disappearing once we drop zonehash_lock. */
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * At present most attributes can only be set on non-running,
	 * non-global zones.
	 */
	zone_status = zone_status_get(zone);
	if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
	    attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
	    zone_status > ZONE_IS_READY) {
		err = EINVAL;
		goto done;
	}

	switch (attr) {
	case ZONE_ATTR_INITNAME:
		err = zone_set_initname(zone, (const char *)buf);
		break;
	case ZONE_ATTR_INITNORESTART:
		/* No payload: the attribute's presence is the setting. */
		zone->zone_restart_init = B_FALSE;
		err = 0;
		break;
	case ZONE_ATTR_BOOTARGS:
		err = zone_set_bootargs(zone, (const char *)buf);
		break;
	case ZONE_ATTR_BRAND:
		err = zone_set_brand(zone, (const char *)buf);
		break;
	case ZONE_ATTR_FS_ALLOWED:
		err = zone_set_fs_allowed(zone, (const char *)buf);
		break;
	case ZONE_ATTR_PMCAP_NOVER:
		err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
		break;
	case ZONE_ATTR_PMCAP_PAGEOUT:
		err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
		break;
	case ZONE_ATTR_PG_FLT_DELAY:
		err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
		break;
	case ZONE_ATTR_RSS:
		err = zone_set_rss(zone, (const uint64_t *)buf);
		break;
	case ZONE_ATTR_SCHED_CLASS:
		err = zone_set_sched_class(zone, (const char *)buf);
		break;
	case ZONE_ATTR_HOSTID:
		/* Fixed-size attribute: reject any other buffer size. */
		if (bufsize == sizeof (zone->zone_hostid)) {
			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
				err = 0;
			else
				err = EFAULT;
		} else {
			err = EINVAL;
		}
		break;
	case ZONE_ATTR_NETWORK:
		/* Bound the kernel allocation before trusting bufsize. */
		if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
			err = EINVAL;
			break;
		}
		zbuf = kmem_alloc(bufsize, KM_SLEEP);
		if (copyin(buf, zbuf, bufsize) != 0) {
			kmem_free(zbuf, bufsize);
			err = EFAULT;
			break;
		}
		err = zone_set_network(zoneid, zbuf);
		kmem_free(zbuf, bufsize);
		break;
	case ZONE_ATTR_APP_SVC_CT:
		/*
		 * The boolean is passed by value in the buf argument itself
		 * (cast, not copyin); bufsize is only a sanity check.
		 */
		if (bufsize != sizeof (boolean_t)) {
			err = EINVAL;
		} else {
			zone->zone_setup_app_contract = (boolean_t)buf;
			err = 0;
		}
		break;
	case ZONE_ATTR_SCHED_FIXEDHI:
		/* Boolean passed by value in buf, as for APP_SVC_CT above. */
		if (bufsize != sizeof (boolean_t)) {
			err = EINVAL;
		} else {
			zone->zone_fixed_hipri = (boolean_t)buf;
			err = 0;
		}
		break;
	default:
		/* Brand-specific attribute range: delegate to the brand ops. */
		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
		else
			err = EINVAL;
	}

done:
	zone_rele(zone);
	ASSERT(err != -1);
	return (err != 0 ? set_errno(err) : 0);
}
6147
6148 /*
6149 * Return zero if the process has at least one vnode mapped in to its
6150 * address space which shouldn't be allowed to change zones.
6151 *
6152 * Also return zero if the process has any shared mappings which reserve
6153 * swap. This is because the counting for zone.max-swap does not allow swap
6154 * reservation to be shared between zones. zone swap reservation is counted
6918 door_arg_t darg, save_arg;
6919 char *zone_name;
6920 size_t zone_namelen;
6921 zoneid_t zoneid;
6922 zone_t *zone;
6923 zone_cmd_arg_t arg;
6924 uint64_t uniqid;
6925 size_t size;
6926 int error;
6927 int retry;
6928
6929 zone = zargp->zone;
6930 arg = zargp->arg;
6931 kmem_free(zargp, sizeof (*zargp));
6932
6933 zone_namelen = strlen(zone->zone_name) + 1;
6934 zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6935 bcopy(zone->zone_name, zone_name, zone_namelen);
6936 zoneid = zone->zone_id;
6937 uniqid = zone->zone_uniqid;
6938 arg.status = zone->zone_init_status;
6939 /*
6940 * zoneadmd may be down, but at least we can empty out the zone.
6941 * We can ignore the return value of zone_empty() since we're called
6942 * from a kernel thread and know we won't be delivered any signals.
6943 */
6944 ASSERT(curproc == &p0);
6945 (void) zone_empty(zone);
6946 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6947 zone_rele(zone);
6948
6949 size = sizeof (arg);
6950 darg.rbuf = (char *)&arg;
6951 darg.data_ptr = (char *)&arg;
6952 darg.rsize = size;
6953 darg.data_size = size;
6954 darg.desc_ptr = NULL;
6955 darg.desc_num = 0;
6956
6957 save_arg = darg;
6958 /*
7159
7160 /*
7161 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
7162 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
7163 * could cause assertions to fail (e.g., assertions about a zone's
7164 * state during initialization, readying, or booting) or produce races.
7165 * We'll let threads continue to initialize and ready new zones: they'll
7166 * fail to boot the new zones when they see that the global zone is
7167 * shutting down.
7168 */
7169 for (current_zonep = list_head(&zone_active); current_zonep != NULL;
7170 current_zonep = list_next(&zone_active, current_zonep)) {
7171 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
7172 zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
7173 }
7174 mutex_exit(&zone_status_lock);
7175 mutex_exit(&zonehash_lock);
7176 }
7177
7178 /*
7179 * Returns true if the named dataset is visible in the specified zone.
7180 * The 'write' parameter is set to 1 if the dataset is also writable.
7181 */
7182 int
7183 zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
7184 {
7185 static int zfstype = -1;
7186 zone_dataset_t *zd;
7187 size_t len;
7188 const char *name = NULL;
7189 vfs_t *vfsp = NULL;
7190
7191 if (dataset[0] == '\0')
7192 return (0);
7193
7194 /*
7195 * Walk the list once, looking for datasets which match exactly, or
7196 * specify a dataset underneath an exported dataset. If found, return
7197 * true and note that it is writable.
7198 */
7199 for (zd = list_head(&zone->zone_datasets); zd != NULL;
7200 zd = list_next(&zone->zone_datasets, zd)) {
7201
7202 len = strlen(zd->zd_dataset);
7203 if (strlen(dataset) >= len &&
7204 bcmp(dataset, zd->zd_dataset, len) == 0 &&
7205 (dataset[len] == '\0' || dataset[len] == '/' ||
7206 dataset[len] == '@')) {
7207 if (write)
7235 /*
7236 * We reach here if the given dataset is not found in the zone_dataset
7237 * list. Check if this dataset was added as a filesystem (ie. "add fs")
7238 * instead of delegation. For this we search for the dataset in the
7239 * zone_vfslist of this zone. If found, return true and note that it is
7240 * not writable.
7241 */
7242
7243 /*
7244 * Initialize zfstype if it is not initialized yet.
7245 */
7246 if (zfstype == -1) {
7247 struct vfssw *vswp = vfs_getvfssw("zfs");
7248 zfstype = vswp - vfssw;
7249 vfs_unrefvfssw(vswp);
7250 }
7251
7252 vfs_list_read_lock();
7253 vfsp = zone->zone_vfslist;
7254 do {
7255 if (vfsp == NULL)
7256 break;
7257 if (vfsp->vfs_fstype == zfstype) {
7258 name = refstr_value(vfsp->vfs_resource);
7259
7260 /*
7261 * Check if we have an exact match.
7262 */
7263 if (strcmp(dataset, name) == 0) {
7264 vfs_list_unlock();
7265 if (write)
7266 *write = 0;
7267 return (1);
7268 }
7269 /*
7270 * We need to check if we are looking for parents of
7271 * a dataset. These should be visible, but read-only.
7272 */
7273 len = strlen(dataset);
7274 if (dataset[len - 1] == '/')
7275 len--;
7276
7277 if (len < strlen(name) &&
7278 bcmp(dataset, name, len) == 0 && name[len] == '/') {
7279 vfs_list_unlock();
7280 if (write)
7281 *write = 0;
7282 return (1);
7283 }
7284 }
7285 vfsp = vfsp->vfs_zone_next;
7286 } while (vfsp != zone->zone_vfslist);
7287
7288 vfs_list_unlock();
7289 return (0);
7290 }
7291
7292 /*
7293 * Returns true if the named dataset is visible in the current zone.
7294 * The 'write' parameter is set to 1 if the dataset is also writable.
7295 */
7296 int
7297 zone_dataset_visible(const char *dataset, int *write)
7298 {
7299 zone_t *zone = curproc->p_zone;
7300
7301 return (zone_dataset_visible_inzone(zone, dataset, write));
7302 }
7303
7304 /*
7305 * zone_find_by_any_path() -
7306 *
7307 * kernel-private routine similar to zone_find_by_path(), but which
7308 * effectively compares against zone paths rather than zonerootpath
 * (i.e., the last component of zonerootpaths, which should be "root/",
 * is not compared.)  This is done in order to accurately identify all
7311 * paths, whether zone-visible or not, including those which are parallel
7312 * to /root/, such as /dev/, /home/, etc...
7313 *
 * If the specified path does not fall under any zone path then the
 * global zone is returned.
7316 *
7317 * The treat_abs parameter indicates whether the path should be treated as
7318 * an absolute path although it does not begin with "/". (This supports
7319 * nfs mount syntax such as host:any/path.)
7320 *
7321 * The caller is responsible for zone_rele of the returned zone.
7322 */
7323 zone_t *
|