358  */
 359 static char *zone_ref_subsys_names[] = {
 360         "NFS",          /* ZONE_REF_NFS */
 361         "NFSv4",        /* ZONE_REF_NFSV4 */
 362         "SMBFS",        /* ZONE_REF_SMBFS */
 363         "MNTFS",        /* ZONE_REF_MNTFS */
 364         "LOFI",         /* ZONE_REF_LOFI */
 365         "VFS",          /* ZONE_REF_VFS */
 366         "IPC"           /* ZONE_REF_IPC */
 367 };
 368 
/*
 * Handles for the zone-wide resource controls.  Where a trailing comment
 * names an rctl, the handle is assigned from the rctl_register() call for
 * that name in this file.
 *
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;         /* zone.cpu-shares */
rctl_hndl_t rc_zone_locked_mem;         /* zone.max-locked-memory */
rctl_hndl_t rc_zone_max_swap;           /* zone.max-swap */
rctl_hndl_t rc_zone_phys_mem;           /* zone.max-physical-memory */
rctl_hndl_t rc_zone_max_lofi;           /* zone.max-lofi */
rctl_hndl_t rc_zone_cpu_cap;            /* zone.cpu-cap */
rctl_hndl_t rc_zone_cpu_baseline;       /* zone.cpu-baseline */
rctl_hndl_t rc_zone_cpu_burst_time;     /* zone.cpu-burst-time */
rctl_hndl_t rc_zone_zfs_io_pri;         /* zone.zfs-io-priority */
rctl_hndl_t rc_zone_nlwps;              /* zone.max-lwps */
rctl_hndl_t rc_zone_nprocs;             /* zone.max-processes */
rctl_hndl_t rc_zone_shmmax;             /* zone.max-shm-memory */
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;             /* zone.max-msg-ids */
 387 
 388 const char * const zone_default_initname = "/sbin/init";
 389 static char * const zone_prefix = "/zone/";
 390 static int zone_shutdown(zoneid_t zoneid);
 391 static int zone_add_datalink(zoneid_t, datalink_id_t);
 392 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 393 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 394 static int zone_set_network(zoneid_t, zone_net_data_t *);
 395 static int zone_get_network(zoneid_t, zone_net_data_t *);
 396 
 397 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 398 
 399 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 
 406     kmutex_t *);
 407 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 408     kmutex_t *);
 409 
 410 /*
 411  * Bump this number when you alter the zone syscall interfaces; this is
 412  * because we need to have support for previous API versions in libc
 413  * to support patching; libc calls into the kernel to determine this number.
 414  *
 415  * Version 1 of the API is the version originally shipped with Solaris 10
 416  * Version 2 alters the zone_create system call in order to support more
 417  *     arguments by moving the args into a structure; and to do better
 418  *     error reporting when zone_create() fails.
 419  * Version 3 alters the zone_create system call in order to support the
 420  *     import of ZFS datasets to zones.
 421  * Version 4 alters the zone_create system call in order to support
 422  *     Trusted Extensions.
 423  * Version 5 alters the zone_boot system call, and converts its old
 424  *     bootargs parameter to be set by the zone_setattr API instead.
 425  * Version 6 adds the flag argument to zone_create.
 426  * Version 7 adds the requested zoneid to zone_create.
 427  */
 428 static const int ZONE_SYSCALL_API_VERSION = 7;
 429 
 430 /*
 431  * Certain filesystems (such as NFS and autofs) need to know which zone
 432  * the mount is being placed in.  Because of this, we need to be able to
 433  * ensure that a zone isn't in the process of being created/destroyed such
 434  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 435  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 436  * mount list. Since a zone can't reside on an NFS file system, we don't
 437  * have to worry about the zonepath itself.
 438  *
 439  * The following functions: block_mounts()/resume_mounts() and
 440  * mount_in_progress()/mount_completed() are used by zones and the VFS
 441  * layer (respectively) to synchronize zone state transitions and new
 442  * mounts within a zone. This synchronization is on a per-zone basis, so
 443  * activity for one zone will not interfere with activity for another zone.
 444  *
 445  * The semantics are like a reader-reader lock such that there may
 446  * either be multiple mounts (or zone state transitions, if that weren't
 447  * serialized by zonehash_lock) in progress at the same time, but not
 448  * both.
 
 
1365 
1366         ASSERT(MUTEX_HELD(&p->p_lock));
1367         ASSERT(e->rcep_t == RCENTITY_ZONE);
1368 
1369         if (zone == NULL)
1370                 return (0);
1371 
1372         /*
1373          * set cap to the new value.
1374          */
1375         return (cpucaps_zone_set(zone, nv));
1376 }
1377 
/*
 * Operations vector passed to rctl_register() for the zone.cpu-cap rctl.
 */
static rctl_ops_t zone_cpu_cap_ops = {
        rcop_no_action,
        zone_cpu_cap_get,
        zone_cpu_cap_set,       /* hands the new value to cpucaps_zone_set() */
        rcop_no_test
};
1384 
1385 /*ARGSUSED*/
1386 static rctl_qty_t
1387 zone_cpu_base_get(rctl_t *rctl, struct proc *p)
1388 {
1389         ASSERT(MUTEX_HELD(&p->p_lock));
1390         return (cpucaps_zone_get_base(p->p_zone));
1391 }
1392 
1393 /*
1394  * The zone cpu base is used to set the baseline CPU for the zone
1395  * so we can track when the zone is bursting.
1396  */
1397 /*ARGSUSED*/
1398 static int
1399 zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1400     rctl_qty_t nv)
1401 {
1402         zone_t *zone = e->rcep_p.zone;
1403 
1404         ASSERT(MUTEX_HELD(&p->p_lock));
1405         ASSERT(e->rcep_t == RCENTITY_ZONE);
1406 
1407         if (zone == NULL)
1408                 return (0);
1409 
1410         return (cpucaps_zone_set_base(zone, nv));
1411 }
1412 
/*
 * Operations vector passed to rctl_register() for the zone.cpu-baseline
 * rctl.
 */
static rctl_ops_t zone_cpu_base_ops = {
        rcop_no_action,
        zone_cpu_base_get,      /* reads cpucaps_zone_get_base() */
        zone_cpu_base_set,      /* hands the value to cpucaps_zone_set_base() */
        rcop_no_test
};
1419 
1420 /*ARGSUSED*/
1421 static rctl_qty_t
1422 zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
1423 {
1424         ASSERT(MUTEX_HELD(&p->p_lock));
1425         return (cpucaps_zone_get_burst_time(p->p_zone));
1426 }
1427 
1428 /*
1429  * The zone cpu burst time is used to set the amount of time CPU(s) can be
1430  * bursting for the zone.
1431  */
1432 /*ARGSUSED*/
1433 static int
1434 zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1435     rctl_qty_t nv)
1436 {
1437         zone_t *zone = e->rcep_p.zone;
1438 
1439         ASSERT(MUTEX_HELD(&p->p_lock));
1440         ASSERT(e->rcep_t == RCENTITY_ZONE);
1441 
1442         if (zone == NULL)
1443                 return (0);
1444 
1445         return (cpucaps_zone_set_burst_time(zone, nv));
1446 }
1447 
/*
 * Operations vector passed to rctl_register() for the zone.cpu-burst-time
 * rctl.
 */
static rctl_ops_t zone_cpu_burst_time_ops = {
        rcop_no_action,
        zone_cpu_burst_time_get, /* reads cpucaps_zone_get_burst_time() */
        zone_cpu_burst_time_set, /* calls cpucaps_zone_set_burst_time() */
        rcop_no_test
};
1454 
1455 /*
1456  * zone.zfs-io-pri resource control support (IO priority).
1457  */
1458 /*ARGSUSED*/
1459 static rctl_qty_t
1460 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1461 {
1462         ASSERT(MUTEX_HELD(&p->p_lock));
1463         return (p->p_zone->zone_zfs_io_pri);
1464 }
1465 
1466 /*ARGSUSED*/
1467 static int
1468 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1469     rctl_qty_t nv)
1470 {
1471         zone_t *zone = e->rcep_p.zone;
1472 
1473         ASSERT(MUTEX_HELD(&p->p_lock));
1474         ASSERT(e->rcep_t == RCENTITY_ZONE);
1475 
 
1969         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1970         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1971         return (0);
1972 }
1973 
1974 static int
1975 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1976 {
1977         zone_t *zone = ksp->ks_private;
1978         zone_kstat_t *zk = ksp->ks_data;
1979 
1980         if (rw == KSTAT_WRITE)
1981                 return (EACCES);
1982 
1983         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1984         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1985         return (0);
1986 }
1987 
1988 static kstat_t *
1989 zone_rctl_kstat_create_common(zone_t *zone, char *name,
1990     int (*updatefunc) (kstat_t *, int))
1991 {
1992         kstat_t *ksp;
1993         zone_kstat_t *zk;
1994 
1995         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1996             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1997             KSTAT_FLAG_VIRTUAL);
1998 
1999         if (ksp == NULL)
2000                 return (NULL);
2001 
2002         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
2003         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2004         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
2005         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
2006         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
2007         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
2008         ksp->ks_update = updatefunc;
2009         ksp->ks_private = zone;
 
2198         kstat_t *ksp;
2199         zone_mcap_kstat_t *zmp;
2200 
2201         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2202             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2203             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2204             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2205                 return (NULL);
2206 
2207         if (zone->zone_id != GLOBAL_ZONEID)
2208                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2209 
2210         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2211         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2212         ksp->ks_lock = &zone->zone_mcap_lock;
2213         zone->zone_mcap_stats = zmp;
2214 
2215         /* The kstat "name" field is not large enough for a full zonename */
2216         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2217         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2218         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2219         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2220         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2221         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2222         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2223         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2224         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2225         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2226         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2227         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2228         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2229             KSTAT_DATA_UINT64);
2230         kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2231             KSTAT_DATA_UINT64);
2232         kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2233             KSTAT_DATA_UINT64);
2234 
2235         ksp->ks_update = zone_mcap_kstat_update;
2236         ksp->ks_private = zone;
2237 
 
2251 
2252         tmp = zone->zone_utime;
2253         scalehrtime(&tmp);
2254         zmp->zm_utime.value.ui64 = tmp;
2255         tmp = zone->zone_stime;
2256         scalehrtime(&tmp);
2257         zmp->zm_stime.value.ui64 = tmp;
2258         tmp = zone->zone_wtime;
2259         scalehrtime(&tmp);
2260         zmp->zm_wtime.value.ui64 = tmp;
2261 
2262         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2263         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2264         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2265 
2266         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2267         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2268         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2269         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2270 
2271         zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
2272 
2273         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2274 
2275         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2276         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2277 
2278         return (0);
2279 }
2280 
2281 static kstat_t *
2282 zone_misc_kstat_create(zone_t *zone)
2283 {
2284         kstat_t *ksp;
2285         zone_misc_kstat_t *zmp;
2286 
2287         if ((ksp = kstat_create_zone("zones", zone->zone_id,
2288             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2289             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2290             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2291                 return (NULL);
2292 
 
2296         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2297         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2298         ksp->ks_lock = &zone->zone_misc_lock;
2299         zone->zone_misc_stats = zmp;
2300 
2301         /* The kstat "name" field is not large enough for a full zonename */
2302         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2303         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2304         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2305         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2306         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2307         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2308         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2309         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2310             KSTAT_DATA_UINT32);
2311         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2312         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2313             KSTAT_DATA_UINT32);
2314         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2315         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2316         kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
2317             KSTAT_DATA_UINT32);
2318         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2319             KSTAT_DATA_UINT32);
2320         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2321         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2322 
2323         ksp->ks_update = zone_misc_kstat_update;
2324         ksp->ks_private = zone;
2325 
2326         kstat_install(ksp);
2327         return (ksp);
2328 }
2329 
2330 static void
2331 zone_kstat_create(zone_t *zone)
2332 {
2333         zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
2334             "lockedmem", zone_lockedmem_kstat_update);
2335         zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
2336             "swapresv", zone_swapresv_kstat_update);
2337         zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
2338             "physicalmem", zone_physmem_kstat_update);
2339         zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
2340             "nprocs", zone_nprocs_kstat_update);
2341 
2342         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2343                 zone->zone_vfs_stats = kmem_zalloc(
2344                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
2345         }
2346 
2347         if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
2348                 zone->zone_zfs_stats = kmem_zalloc(
2349                     sizeof (zone_zfs_kstat_t), KM_SLEEP);
2350         }
2351 
2352         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2353                 zone->zone_mcap_stats = kmem_zalloc(
2354                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2355         }
2356 
2357         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2358                 zone->zone_misc_stats = kmem_zalloc(
2359                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2360         }
2361 }
2362 
2363 static void
2364 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2365 {
2366         void *data;
2367 
2368         if (*pkstat != NULL) {
2369                 data = (*pkstat)->ks_data;
2370                 kstat_delete(*pkstat);
2371                 kmem_free(data, datasz);
2372                 *pkstat = NULL;
2373         }
2374 }
2375 
2376 static void
2377 zone_kstat_delete(zone_t *zone)
2378 {
2379         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2380             sizeof (zone_kstat_t));
2381         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2382             sizeof (zone_kstat_t));
2383         zone_kstat_delete_common(&zone->zone_physmem_kstat,
2384             sizeof (zone_kstat_t));
2385         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2386             sizeof (zone_kstat_t));
2387 
2388         zone_kstat_delete_common(&zone->zone_vfs_ksp,
2389             sizeof (zone_vfs_kstat_t));
2390         zone_kstat_delete_common(&zone->zone_zfs_ksp,
2391             sizeof (zone_zfs_kstat_t));
2392         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2393             sizeof (zone_mcap_kstat_t));
2394         zone_kstat_delete_common(&zone->zone_misc_ksp,
2395             sizeof (zone_misc_kstat_t));
2396 }
2397 
2398 /*
2399  * Called very early on in boot to initialize the ZSD list so that
2400  * zone_key_create() can be called before zone_init().  It also initializes
2401  * portions of zone0 which may be used before zone_init() is called.  The
2402  * variable "global_zone" will be set when zone0 is fully initialized by
2403  * zone_init().
2404  */
2405 void
2406 zone_zsd_init(void)
2407 {
2408         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2409         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2410         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2411             offsetof(struct zsd_entry, zsd_linkage));
2412         list_create(&zone_active, sizeof (zone_t),
2413             offsetof(zone_t, zone_linkage));
2414         list_create(&zone_deathrow, sizeof (zone_t),
2415             offsetof(zone_t, zone_linkage));
 
2437         zone0.zone_name = GLOBAL_ZONENAME;
2438         zone0.zone_nodename = utsname.nodename;
2439         zone0.zone_domain = srpc_domain;
2440         zone0.zone_hostid = HW_INVALID_HOSTID;
2441         zone0.zone_fs_allowed = NULL;
2442         zone0.zone_ref = 1;
2443         zone0.zone_id = GLOBAL_ZONEID;
2444         zone0.zone_status = ZONE_IS_RUNNING;
2445         zone0.zone_rootpath = "/";
2446         zone0.zone_rootpathlen = 2;
2447         zone0.zone_psetid = ZONE_PS_INVAL;
2448         zone0.zone_ncpus = 0;
2449         zone0.zone_ncpus_online = 0;
2450         zone0.zone_proc_initpid = 1;
2451         zone0.zone_initname = initname;
2452         zone0.zone_lockedmem_kstat = NULL;
2453         zone0.zone_swapresv_kstat = NULL;
2454         zone0.zone_physmem_kstat = NULL;
2455         zone0.zone_nprocs_kstat = NULL;
2456         zone0.zone_zfs_io_pri = 1;
2457         zone0.zone_stime = 0;
2458         zone0.zone_utime = 0;
2459         zone0.zone_wtime = 0;
2460 
2461         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2462             offsetof(zone_ref_t, zref_linkage));
2463         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2464             offsetof(struct zsd_entry, zsd_linkage));
2465         list_insert_head(&zone_active, &zone0);
2466 
2467         /*
2468          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2469          * to anything meaningful.  It is assigned to be 'rootdir' in
2470          * vfs_mountroot().
2471          */
2472         zone0.zone_rootvp = NULL;
2473         zone0.zone_vfslist = NULL;
2474         zone0.zone_bootargs = initargs;
2475         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2476         /*
 
 
2547         /*
2548          * Create ID space for zone IDs.  ID 0 is reserved for the
2549          * global zone.
2550          */
2551         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2552 
2553         /*
2554          * Initialize generic zone resource controls, if any.
2555          */
2556         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2557             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2558             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2559             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2560 
2561         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2562             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2563             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2564             RCTL_GLOBAL_INFINITE,
2565             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2566 
2567         rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
2568             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2569             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2570             MAXCAP, MAXCAP, &zone_cpu_base_ops);
2571 
2572         rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
2573             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2574             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2575             INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
2576 
2577         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2578             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2579             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2580             16384, 16384, &zone_zfs_io_pri_ops);
2581 
2582         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2583             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2584             INT_MAX, INT_MAX, &zone_lwps_ops);
2585 
2586         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2587             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2588             INT_MAX, INT_MAX, &zone_procs_ops);
2589 
2590         /*
2591          * System V IPC resource controls
2592          */
2593         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2594             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2595             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2596 
 
2603             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2604 
2605         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2606             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2607             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2608 
2609         /*
2610          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2611          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2612          */
2613         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2614         bzero(dval, sizeof (rctl_val_t));
2615         dval->rcv_value = 1;
2616         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2617         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2618         dval->rcv_action_recip_pid = -1;
2619 
2620         rde = rctl_dict_lookup("zone.cpu-shares");
2621         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2622 
2623         /*
2624          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2625          * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
2626          */
2627         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2628         bzero(dval, sizeof (rctl_val_t));
2629         dval->rcv_value = 1;
2630         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2631         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2632         dval->rcv_action_recip_pid = -1;
2633 
2634         rde = rctl_dict_lookup("zone.zfs-io-priority");
2635         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2636 
2637         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2638             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2639             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2640             &zone_locked_mem_ops);
2641 
2642         rc_zone_max_swap = rctl_register("zone.max-swap",
2643             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2644             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2645             &zone_max_swap_ops);
2646 
2647         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2648             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2649             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2650             &zone_phys_mem_ops);
2651 
2652         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2653             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2654             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2655             &zone_max_lofi_ops);
2656 
 
2821         if (zone->zone_pfexecd != NULL)
2822                 klpd_freelist(&zone->zone_pfexecd);
2823         id_free(zoneid_space, zone->zone_id);
2824         mutex_destroy(&zone->zone_lock);
2825         cv_destroy(&zone->zone_cv);
2826         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2827         rw_destroy(&zone->zone_mntfs_db_lock);
2828         kmem_free(zone, sizeof (zone_t));
2829 }
2830 
2831 /*
2832  * See block comment at the top of this file for information about zone
2833  * status values.
2834  */
2835 /*
2836  * Convenience function for setting zone status.
2837  */
2838 static void
2839 zone_status_set(zone_t *zone, zone_status_t status)
2840 {
2841         timestruc_t now;
2842         uint64_t t;
2843 
2844         nvlist_t *nvl = NULL;
2845         ASSERT(MUTEX_HELD(&zone_status_lock));
2846         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2847             status >= zone_status_get(zone));
2848 
2849         /* Current time since Jan 1 1970 but consumers expect NS */
2850         gethrestime(&now);
2851         t = (now.tv_sec * NANOSEC) + now.tv_nsec;
2852 
2853         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2854             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2855             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2856             zone_status_table[status]) ||
2857             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2858             zone_status_table[zone->zone_status]) ||
2859             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2860             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
2861             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2862             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2863 #ifdef DEBUG
2864                 (void) printf(
2865                     "Failed to allocate and send zone state change event.\n");
2866 #endif
2867         }
2868         nvlist_free(nvl);
2869 
2870         zone->zone_status = status;
2871 
2872         cv_broadcast(&zone->zone_cv);
2873 }
2874 
2875 /*
2876  * Public function to retrieve the zone status.  The zone status may
2877  * change after it is retrieved.
2878  */
2879 zone_status_t
2880 zone_status_get(zone_t *zone)
 
3441         }
3442 
3443         ASSERT(refcnt == 0);
3444         /*
3445          * zsched has exited; the zone is dead.
3446          */
3447         zone->zone_zsched = NULL;            /* paranoia */
3448         mutex_enter(&zone_status_lock);
3449         zone_status_set(zone, ZONE_IS_DEAD);
3450 out:
3451         mutex_exit(&zone_status_lock);
3452         zone_rele(zone);
3453 }
3454 
3455 zoneid_t
3456 getzoneid(void)
3457 {
3458         return (curproc->p_zone->zone_id);
3459 }
3460 
3461 zoneid_t
3462 getzonedid(void)
3463 {
3464         return (curproc->p_zone->zone_did);
3465 }
3466 
3467 /*
3468  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3469  * check the validity of a zone's state.
3470  */
3471 static zone_t *
3472 zone_find_all_by_id(zoneid_t zoneid)
3473 {
3474         mod_hash_val_t hv;
3475         zone_t *zone = NULL;
3476 
3477         ASSERT(MUTEX_HELD(&zonehash_lock));
3478 
3479         if (mod_hash_find(zonehashbyid,
3480             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3481                 zone = (zone_t *)hv;
3482         return (zone);
3483 }
3484 
3485 static zone_t *
3486 zone_find_all_by_label(const ts_label_t *label)
 
4791                 ASSERT(error == 0);
4792                 for (i = 0; i < nelem; i++) {
4793                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4794                                 goto out;
4795                 }
4796                 if (rctl_invalid_value(rde, &rv)) {
4797                         error = EINVAL;
4798                         goto out;
4799                 }
4800         }
4801         error = 0;
4802         *nvlp = nvl;
4803 out:
4804         kmem_free(kbuf, buflen);
4805         if (error && nvl != NULL)
4806                 nvlist_free(nvl);
4807         return (error);
4808 }
4809 
4810 int
4811 zone_create_error(int er_error, int er_ext, int *er_out)
4812 {
4813         if (er_out != NULL) {
4814                 if (copyout(&er_ext, er_out, sizeof (int))) {
4815                         return (set_errno(EFAULT));
4816                 }
4817         }
4818         return (set_errno(er_error));
4819 }
4820 
4821 static int
4822 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4823 {
4824         ts_label_t *tsl;
4825         bslabel_t blab;
4826 
4827         /* Get label from user */
4828         if (copyin(lab, &blab, sizeof (blab)) != 0)
4829                 return (EFAULT);
4830         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4831         if (tsl == NULL)
4832                 return (ENOMEM);
 
4882 
4883         kmem_free(kbuf, buflen);
4884         return (0);
4885 }
4886 
4887 /*
4888  * System call to create/initialize a new zone named 'zone_name', rooted
4889  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4890  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4891  * with labeling set by 'match', 'doi', and 'label'.
4892  *
4893  * If extended error is non-null, we may use it to return more detailed
4894  * error information.
4895  */
4896 static zoneid_t
4897 zone_create(const char *zone_name, const char *zone_root,
4898     const priv_set_t *zone_privs, size_t zone_privssz,
4899     caddr_t rctlbuf, size_t rctlbufsz,
4900     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4901     int match, uint32_t doi, const bslabel_t *label,
4902     int flags, zoneid_t zone_did)
4903 {
4904         struct zsched_arg zarg;
4905         nvlist_t *rctls = NULL;
4906         proc_t *pp = curproc;
4907         zone_t *zone, *ztmp;
4908         zoneid_t zoneid, start = GLOBAL_ZONEID;
4909         int error;
4910         int error2 = 0;
4911         char *str;
4912         cred_t *zkcr;
4913         boolean_t insert_label_hash;
4914 
4915         if (secpolicy_zone_config(CRED()) != 0)
4916                 return (set_errno(EPERM));
4917 
4918         /* can't boot zone from within chroot environment */
4919         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4920                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4921                     extended_error));
4922 
4923         /*
4924          * As the first step of zone creation, we want to allocate a zoneid.
4925          * This allocation is complicated by the fact that netstacks use the
4926          * zoneid to determine their stackid, but netstacks themselves are
4927          * freed asynchronously with respect to zone destruction.  This means
4928          * that a netstack reference leak (or in principle, an extraordinarily
4929          * long netstack reference hold) could result in a zoneid being
4930          * allocated that in fact corresponds to a stackid from an active
4931          * (referenced) netstack -- unleashing all sorts of havoc when that
4932          * netstack is actually (re)used.  (In the abstract, we might wish a
4933          * zoneid to not be deallocated until its last referencing netstack
4934          * has been released, but netstacks lack a backpointer into their
4935          * referencing zone -- and changing them to have such a pointer would
4936          * be substantial, to put it euphemistically.)  To avoid this, we
4937          * detect this condition on allocation: if we have allocated a zoneid
4938          * that corresponds to a netstack that's still in use, we warn about
4939          * it (as it is much more likely to be a reference leak than an actual
4940          * netstack reference), free it, and allocate another.  That these
4941          * identifiers are allocated out of an ID space assures that we won't
4942          * see the identifier we just allocated.
4943          */
4944         for (;;) {
4945                 zoneid = id_alloc(zoneid_space);
4946 
4947                 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4948                         break;
4949 
4950                 id_free(zoneid_space, zoneid);
4951 
4952                 if (start == GLOBAL_ZONEID) {
4953                         start = zoneid;
4954                 } else if (zoneid == start) {
4955                         /*
4956                          * We have managed to iterate over the entire available
4957                          * zoneid space -- there are no identifiers available,
4958                          * presumably due to some number of leaked netstack
4959                          * references.  While it's in principle possible for us
4960                          * to continue to try, it seems wiser to give up at
4961                          * this point to warn and fail explicitly with a
4962                          * distinctive error.
4963                          */
4964                         cmn_err(CE_WARN, "zone_create() failed: all available "
4965                             "zone IDs have netstacks still in use");
4966                         return (set_errno(ENFILE));
4967                 }
4968 
4969                 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4970                     "netstack still in use", zoneid);
4971         }
4972 
4973         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4974         zone->zone_id = zoneid;
4975         zone->zone_did = zone_did;
4976         zone->zone_status = ZONE_IS_UNINITIALIZED;
4977         zone->zone_pool = pool_default;
4978         zone->zone_pool_mod = gethrtime();
4979         zone->zone_psetid = ZONE_PS_INVAL;
4980         zone->zone_ncpus = 0;
4981         zone->zone_ncpus_online = 0;
4982         zone->zone_restart_init = B_TRUE;
4983         zone->zone_reboot_on_init_exit = B_FALSE;
4984         zone->zone_init_status = -1;
4985         zone->zone_brand = &native_brand;
4986         zone->zone_initname = NULL;
4987         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4988         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4989         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4990         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4991         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4992             offsetof(zone_ref_t, zref_linkage));
4993         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4994             offsetof(struct zsd_entry, zsd_linkage));
4995         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
 
5328                 return (set_errno(EINTR));
5329         }
5330 
5331         /*
5332          * Boot (starting init) might have failed, in which case the zone
5333          * will go to the SHUTTING_DOWN state; an appropriate errno will
5334          * be placed in zone->zone_boot_err, and so we return that.
5335          */
5336         err = zone->zone_boot_err;
5337         zone_rele(zone);
5338         return (err ? set_errno(err) : 0);
5339 }
5340 
5341 /*
5342  * Kills all user processes in the zone, waiting for them all to exit
5343  * before returning.
5344  */
5345 static int
5346 zone_empty(zone_t *zone)
5347 {
5348         int cnt = 0;
5349         int waitstatus;
5350 
5351         /*
5352          * We need to drop zonehash_lock before killing all
5353          * processes, otherwise we'll deadlock with zone_find_*
5354          * which can be called from the exit path.
5355          */
5356         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5357         while ((waitstatus = zone_status_timedwait_sig(zone,
5358             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5359                 boolean_t force = B_FALSE;
5360 
5361                 /* Every 30 seconds, try harder */
5362                 if (cnt++ >= 30) {
5363                         cmn_err(CE_WARN, "attempt to force kill zone %d\n",
5364                             zone->zone_id);
5365                         force = B_TRUE;
5366                         cnt = 0;
5367                 }
5368                 killall(zone->zone_id, force);
5369         }
5370         /*
5371          * return EINTR if we were signaled
5372          */
5373         if (waitstatus == 0)
5374                 return (EINTR);
5375         return (0);
5376 }
5377 
5378 /*
5379  * This function implements the policy for zone visibility.
5380  *
5381  * In standard Solaris, a non-global zone can only see itself.
5382  *
5383  * In Trusted Extensions, a labeled zone can lookup any zone whose label
5384  * it dominates. For this test, the label of the global zone is treated as
5385  * admin_high so it is special-cased instead of being checked for dominance.
5386  *
5387  * Returns true if zone attributes are viewable, false otherwise.
5388  */
5389 static boolean_t
 
6153                 size = strlen(outstr) + 1;
6154                 if (bufsize > size)
6155                         bufsize = size;
6156                 if (buf != NULL) {
6157                         err = copyoutstr(outstr, buf, bufsize, NULL);
6158                         if (err != 0 && err != ENAMETOOLONG)
6159                                 error = EFAULT;
6160                 }
6161                 break;
6162         case ZONE_ATTR_NETWORK:
6163                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6164                 if (copyin(buf, zbuf, bufsize) != 0) {
6165                         error = EFAULT;
6166                 } else {
6167                         error = zone_get_network(zoneid, zbuf);
6168                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
6169                                 error = EFAULT;
6170                 }
6171                 kmem_free(zbuf, bufsize);
6172                 break;
6173         case ZONE_ATTR_DID:
6174                 size = sizeof (zoneid_t);
6175                 if (bufsize > size)
6176                         bufsize = size;
6177 
6178                 if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
6179                         error = EFAULT;
6180                 break;
6181         case ZONE_ATTR_SCHED_FIXEDHI:
6182                 size = sizeof (boolean_t);
6183                 if (bufsize > size)
6184                         bufsize = size;
6185 
6186                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6187                     bufsize) != 0)
6188                         error = EFAULT;
6189                 break;
6190         default:
6191                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6192                         size = bufsize;
6193                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6194                 } else {
6195                         error = EINVAL;
6196                 }
6197         }
6198         zone_rele(zone);
6199 
6200         if (error)
 
6993                                 return (set_errno(EFAULT));
6994                         }
6995                         zs.zone_name =
6996                             (const char *)(unsigned long)zs32.zone_name;
6997                         zs.zone_root =
6998                             (const char *)(unsigned long)zs32.zone_root;
6999                         zs.zone_privs =
7000                             (const struct priv_set *)
7001                             (unsigned long)zs32.zone_privs;
7002                         zs.zone_privssz = zs32.zone_privssz;
7003                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
7004                         zs.rctlbufsz = zs32.rctlbufsz;
7005                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
7006                         zs.zfsbufsz = zs32.zfsbufsz;
7007                         zs.extended_error =
7008                             (int *)(unsigned long)zs32.extended_error;
7009                         zs.match = zs32.match;
7010                         zs.doi = zs32.doi;
7011                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
7012                         zs.flags = zs32.flags;
7013                         zs.zoneid = zs32.zoneid;
7014 #else
7015                         panic("get_udatamodel() returned bogus result\n");
7016 #endif
7017                 }
7018 
7019                 return (zone_create(zs.zone_name, zs.zone_root,
7020                     zs.zone_privs, zs.zone_privssz,
7021                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
7022                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
7023                     zs.extended_error, zs.match, zs.doi,
7024                     zs.label, zs.flags, zs.zoneid));
7025         case ZONE_BOOT:
7026                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
7027         case ZONE_DESTROY:
7028                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
7029         case ZONE_GETATTR:
7030                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
7031                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7032         case ZONE_SETATTR:
7033                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
7034                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7035         case ZONE_ENTER:
7036                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
7037         case ZONE_LIST:
7038                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
7039         case ZONE_SHUTDOWN:
7040                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
7041         case ZONE_LOOKUP:
7042                 return (zone_lookup((const char *)arg1));
7043         case ZONE_VERSION:
7044                 return (zone_version((int *)arg1));
 
7283          */
7284         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7285         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7286                 /*
7287                  * This zone is already on its way down.
7288                  */
7289                 mutex_exit(&zone_status_lock);
7290                 return (0);
7291         }
7292         /*
7293          * Prevent future zone_enter()s
7294          */
7295         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7296         mutex_exit(&zone_status_lock);
7297 
7298         /*
7299          * Kill everyone now and call zoneadmd later.
7300          * zone_ki_call_zoneadmd() will do a more thorough job of this
7301          * later.
7302          */
7303         killall(zone->zone_id, B_FALSE);
7304         /*
7305          * Now, create the thread to contact zoneadmd and do the rest of the
7306          * work.  This thread can't be created in our zone otherwise
7307          * zone_destroy() would deadlock.
7308          */
7309         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7310         zargp->arg.cmd = zcmd;
7311         zargp->arg.uniqid = zone->zone_uniqid;
7312         zargp->zone = zone;
7313         (void) strcpy(zargp->arg.locale, "C");
7314         /* mdep was already copied in for us by uadmin */
7315         if (mdep != NULL)
7316                 (void) strlcpy(zargp->arg.bootbuf, mdep,
7317                     sizeof (zargp->arg.bootbuf));
7318         zone_hold(zone);
7319 
7320         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7321             TS_RUN, minclsyspri);
7322         exit(CLD_EXITED, 0);
7323 
 
 | 
 
 
 358  */
 359 static char *zone_ref_subsys_names[] = {
 360         "NFS",          /* ZONE_REF_NFS */
 361         "NFSv4",        /* ZONE_REF_NFSV4 */
 362         "SMBFS",        /* ZONE_REF_SMBFS */
 363         "MNTFS",        /* ZONE_REF_MNTFS */
 364         "LOFI",         /* ZONE_REF_LOFI */
 365         "VFS",          /* ZONE_REF_VFS */
 366         "IPC"           /* ZONE_REF_IPC */
 367 };
 368 
 369 /*
 370  * This isn't static so lint doesn't complain.
 371  */
 372 rctl_hndl_t rc_zone_cpu_shares;
 373 rctl_hndl_t rc_zone_locked_mem;
 374 rctl_hndl_t rc_zone_max_swap;
 375 rctl_hndl_t rc_zone_phys_mem;
 376 rctl_hndl_t rc_zone_max_lofi;
 377 rctl_hndl_t rc_zone_cpu_cap;
 378 rctl_hndl_t rc_zone_zfs_io_pri;
 379 rctl_hndl_t rc_zone_nlwps;
 380 rctl_hndl_t rc_zone_nprocs;
 381 rctl_hndl_t rc_zone_shmmax;
 382 rctl_hndl_t rc_zone_shmmni;
 383 rctl_hndl_t rc_zone_semmni;
 384 rctl_hndl_t rc_zone_msgmni;
 385 
 386 const char * const zone_default_initname = "/sbin/init";
 387 static char * const zone_prefix = "/zone/";
 388 static int zone_shutdown(zoneid_t zoneid);
 389 static int zone_add_datalink(zoneid_t, datalink_id_t);
 390 static int zone_remove_datalink(zoneid_t, datalink_id_t);
 391 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
 392 static int zone_set_network(zoneid_t, zone_net_data_t *);
 393 static int zone_get_network(zoneid_t, zone_net_data_t *);
 394 
 395 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
 396 
 397 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
 
 404     kmutex_t *);
 405 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
 406     kmutex_t *);
 407 
 408 /*
 409  * Bump this number when you alter the zone syscall interfaces; this is
 410  * because we need to have support for previous API versions in libc
 411  * to support patching; libc calls into the kernel to determine this number.
 412  *
 413  * Version 1 of the API is the version originally shipped with Solaris 10
 414  * Version 2 alters the zone_create system call in order to support more
 415  *     arguments by moving the args into a structure; and to do better
 416  *     error reporting when zone_create() fails.
 417  * Version 3 alters the zone_create system call in order to support the
 418  *     import of ZFS datasets to zones.
 419  * Version 4 alters the zone_create system call in order to support
 420  *     Trusted Extensions.
 421  * Version 5 alters the zone_boot system call, and converts its old
 422  *     bootargs parameter to be set by the zone_setattr API instead.
 423  * Version 6 adds the flag argument to zone_create.
 424  */
 425 static const int ZONE_SYSCALL_API_VERSION = 6;
 426 
 427 /*
 428  * Certain filesystems (such as NFS and autofs) need to know which zone
 429  * the mount is being placed in.  Because of this, we need to be able to
 430  * ensure that a zone isn't in the process of being created/destroyed such
 431  * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
 432  * it gets added to the list of mounted zones, it ends up on the wrong zone's
 433  * mount list. Since a zone can't reside on an NFS file system, we don't
 434  * have to worry about the zonepath itself.
 435  *
 436  * The following functions: block_mounts()/resume_mounts() and
 437  * mount_in_progress()/mount_completed() are used by zones and the VFS
 438  * layer (respectively) to synchronize zone state transitions and new
 439  * mounts within a zone. This synchronization is on a per-zone basis, so
 440  * activity for one zone will not interfere with activity for another zone.
 441  *
 442  * The semantics are like a reader-reader lock such that there may
 443  * either be multiple mounts (or zone state transitions, if that weren't
 444  * serialized by zonehash_lock) in progress at the same time, but not
 445  * both.
 
 
1362 
1363         ASSERT(MUTEX_HELD(&p->p_lock));
1364         ASSERT(e->rcep_t == RCENTITY_ZONE);
1365 
1366         if (zone == NULL)
1367                 return (0);
1368 
1369         /*
1370          * set cap to the new value.
1371          */
1372         return (cpucaps_zone_set(zone, nv));
1373 }
1374 
1375 static rctl_ops_t zone_cpu_cap_ops = {
1376         rcop_no_action,
1377         zone_cpu_cap_get,
1378         zone_cpu_cap_set,
1379         rcop_no_test
1380 };
1381 
1382 /*
1383  * zone.zfs-io-pri resource control support (IO priority).
1384  */
1385 /*ARGSUSED*/
1386 static rctl_qty_t
1387 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1388 {
1389         ASSERT(MUTEX_HELD(&p->p_lock));
1390         return (p->p_zone->zone_zfs_io_pri);
1391 }
1392 
1393 /*ARGSUSED*/
1394 static int
1395 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1396     rctl_qty_t nv)
1397 {
1398         zone_t *zone = e->rcep_p.zone;
1399 
1400         ASSERT(MUTEX_HELD(&p->p_lock));
1401         ASSERT(e->rcep_t == RCENTITY_ZONE);
1402 
 
1896         zk->zk_usage.value.ui64 = zone->zone_nprocs;
1897         zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1898         return (0);
1899 }
1900 
1901 static int
1902 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1903 {
1904         zone_t *zone = ksp->ks_private;
1905         zone_kstat_t *zk = ksp->ks_data;
1906 
1907         if (rw == KSTAT_WRITE)
1908                 return (EACCES);
1909 
1910         zk->zk_usage.value.ui64 = zone->zone_max_swap;
1911         zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1912         return (0);
1913 }
1914 
1915 static kstat_t *
1916 zone_kstat_create_common(zone_t *zone, char *name,
1917     int (*updatefunc) (kstat_t *, int))
1918 {
1919         kstat_t *ksp;
1920         zone_kstat_t *zk;
1921 
1922         ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1923             sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1924             KSTAT_FLAG_VIRTUAL);
1925 
1926         if (ksp == NULL)
1927                 return (NULL);
1928 
1929         zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1930         ksp->ks_data_size += strlen(zone->zone_name) + 1;
1931         kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1932         kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1933         kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1934         kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1935         ksp->ks_update = updatefunc;
1936         ksp->ks_private = zone;
 
2125         kstat_t *ksp;
2126         zone_mcap_kstat_t *zmp;
2127 
2128         if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2129             zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2130             sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2131             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2132                 return (NULL);
2133 
2134         if (zone->zone_id != GLOBAL_ZONEID)
2135                 kstat_zone_add(ksp, GLOBAL_ZONEID);
2136 
2137         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2138         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2139         ksp->ks_lock = &zone->zone_mcap_lock;
2140         zone->zone_mcap_stats = zmp;
2141 
2142         /* The kstat "name" field is not large enough for a full zonename */
2143         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2144         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2145         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2146         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2147         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2148         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2149         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2150         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2151         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2152         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2153         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2154         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2155         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2156         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2157             KSTAT_DATA_UINT64);
2158         kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2159             KSTAT_DATA_UINT64);
2160         kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2161             KSTAT_DATA_UINT64);
2162 
2163         ksp->ks_update = zone_mcap_kstat_update;
2164         ksp->ks_private = zone;
2165 
 
2179 
2180         tmp = zone->zone_utime;
2181         scalehrtime(&tmp);
2182         zmp->zm_utime.value.ui64 = tmp;
2183         tmp = zone->zone_stime;
2184         scalehrtime(&tmp);
2185         zmp->zm_stime.value.ui64 = tmp;
2186         tmp = zone->zone_wtime;
2187         scalehrtime(&tmp);
2188         zmp->zm_wtime.value.ui64 = tmp;
2189 
2190         zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2191         zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2192         zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2193 
2194         zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2195         zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2196         zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2197         zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2198 
2199         zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2200 
2201         zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2202         zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2203 
2204         return (0);
2205 }
2206 
2207 static kstat_t *
2208 zone_misc_kstat_create(zone_t *zone)
2209 {
2210         kstat_t *ksp;
2211         zone_misc_kstat_t *zmp;
2212 
2213         if ((ksp = kstat_create_zone("zones", zone->zone_id,
2214             zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2215             sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2216             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2217                 return (NULL);
2218 
 
2222         zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2223         ksp->ks_data_size += strlen(zone->zone_name) + 1;
2224         ksp->ks_lock = &zone->zone_misc_lock;
2225         zone->zone_misc_stats = zmp;
2226 
2227         /* The kstat "name" field is not large enough for a full zonename */
2228         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2229         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2230         kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2231         kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2232         kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2233         kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2234         kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2235         kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2236             KSTAT_DATA_UINT32);
2237         kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2238         kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2239             KSTAT_DATA_UINT32);
2240         kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2241         kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2242         kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2243             KSTAT_DATA_UINT32);
2244         kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2245         kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2246 
2247         ksp->ks_update = zone_misc_kstat_update;
2248         ksp->ks_private = zone;
2249 
2250         kstat_install(ksp);
2251         return (ksp);
2252 }
2253 
2254 static void
2255 zone_kstat_create(zone_t *zone)
2256 {
2257         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2258             "lockedmem", zone_lockedmem_kstat_update);
2259         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2260             "swapresv", zone_swapresv_kstat_update);
2261         zone->zone_physmem_kstat = zone_kstat_create_common(zone,
2262             "physicalmem", zone_physmem_kstat_update);
2263         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2264             "nprocs", zone_nprocs_kstat_update);
2265 
2266         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2267                 zone->zone_vfs_stats = kmem_zalloc(
2268                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
2269         }
2270 
2271         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2272                 zone->zone_mcap_stats = kmem_zalloc(
2273                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
2274         }
2275 
2276         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2277                 zone->zone_misc_stats = kmem_zalloc(
2278                     sizeof (zone_misc_kstat_t), KM_SLEEP);
2279         }
2280 
2281 }
2282 
2283 static void
2284 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2285 {
2286         void *data;
2287 
2288         if (*pkstat != NULL) {
2289                 data = (*pkstat)->ks_data;
2290                 kstat_delete(*pkstat);
2291                 kmem_free(data, datasz);
2292                 *pkstat = NULL;
2293         }
2294 }
2295 
2296 static void
2297 zone_kstat_delete(zone_t *zone)
2298 {
2299         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2300             sizeof (zone_kstat_t));
2301         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2302             sizeof (zone_kstat_t));
2303         zone_kstat_delete_common(&zone->zone_physmem_kstat,
2304             sizeof (zone_kstat_t));
2305         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2306             sizeof (zone_kstat_t));
2307 
2308         zone_kstat_delete_common(&zone->zone_vfs_ksp,
2309             sizeof (zone_vfs_kstat_t));
2310         zone_kstat_delete_common(&zone->zone_mcap_ksp,
2311             sizeof (zone_mcap_kstat_t));
2312         zone_kstat_delete_common(&zone->zone_misc_ksp,
2313             sizeof (zone_misc_kstat_t));
2314 
2315 }
2316 
2317 /*
2318  * Called very early on in boot to initialize the ZSD list so that
2319  * zone_key_create() can be called before zone_init().  It also initializes
2320  * portions of zone0 which may be used before zone_init() is called.  The
2321  * variable "global_zone" will be set when zone0 is fully initialized by
2322  * zone_init().
2323  */
2324 void
2325 zone_zsd_init(void)
2326 {
2327         mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2328         mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2329         list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2330             offsetof(struct zsd_entry, zsd_linkage));
2331         list_create(&zone_active, sizeof (zone_t),
2332             offsetof(zone_t, zone_linkage));
2333         list_create(&zone_deathrow, sizeof (zone_t),
2334             offsetof(zone_t, zone_linkage));
 
2356         zone0.zone_name = GLOBAL_ZONENAME;
2357         zone0.zone_nodename = utsname.nodename;
2358         zone0.zone_domain = srpc_domain;
2359         zone0.zone_hostid = HW_INVALID_HOSTID;
2360         zone0.zone_fs_allowed = NULL;
2361         zone0.zone_ref = 1;
2362         zone0.zone_id = GLOBAL_ZONEID;
2363         zone0.zone_status = ZONE_IS_RUNNING;
2364         zone0.zone_rootpath = "/";
2365         zone0.zone_rootpathlen = 2;
2366         zone0.zone_psetid = ZONE_PS_INVAL;
2367         zone0.zone_ncpus = 0;
2368         zone0.zone_ncpus_online = 0;
2369         zone0.zone_proc_initpid = 1;
2370         zone0.zone_initname = initname;
2371         zone0.zone_lockedmem_kstat = NULL;
2372         zone0.zone_swapresv_kstat = NULL;
2373         zone0.zone_physmem_kstat = NULL;
2374         zone0.zone_nprocs_kstat = NULL;
2375         zone0.zone_zfs_io_pri = 1;
2376 
2377         zone0.zone_stime = 0;
2378         zone0.zone_utime = 0;
2379         zone0.zone_wtime = 0;
2380 
2381         list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2382             offsetof(zone_ref_t, zref_linkage));
2383         list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2384             offsetof(struct zsd_entry, zsd_linkage));
2385         list_insert_head(&zone_active, &zone0);
2386 
2387         /*
2388          * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2389          * to anything meaningful.  It is assigned to be 'rootdir' in
2390          * vfs_mountroot().
2391          */
2392         zone0.zone_rootvp = NULL;
2393         zone0.zone_vfslist = NULL;
2394         zone0.zone_bootargs = initargs;
2395         zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2396         /*
 
 
2467         /*
2468          * Create ID space for zone IDs.  ID 0 is reserved for the
2469          * global zone.
2470          */
2471         zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2472 
2473         /*
2474          * Initialize generic zone resource controls, if any.
2475          */
2476         rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2477             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2478             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2479             FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2480 
2481         rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2482             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2483             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2484             RCTL_GLOBAL_INFINITE,
2485             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2486 
2487         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2488             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2489             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2490             16384, 16384, &zone_zfs_io_pri_ops);
2491 
2492         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2493             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2494             INT_MAX, INT_MAX, &zone_lwps_ops);
2495 
2496         rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2497             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2498             INT_MAX, INT_MAX, &zone_procs_ops);
2499 
2500         /*
2501          * System V IPC resource controls
2502          */
2503         rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2504             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2505             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2506 
 
2513             RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2514 
2515         rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2516             RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2517             RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2518 
2519         /*
2520          * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
2521          * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2522          */
2523         dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2524         bzero(dval, sizeof (rctl_val_t));
2525         dval->rcv_value = 1;
2526         dval->rcv_privilege = RCPRIV_PRIVILEGED;
2527         dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2528         dval->rcv_action_recip_pid = -1;
2529 
2530         rde = rctl_dict_lookup("zone.cpu-shares");
2531         (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2532 
2533         rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2534             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2535             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2536             &zone_locked_mem_ops);
2537 
2538         rc_zone_max_swap = rctl_register("zone.max-swap",
2539             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2540             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2541             &zone_max_swap_ops);
2542 
2543         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2544             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2545             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2546             &zone_phys_mem_ops);
2547 
2548         rc_zone_max_lofi = rctl_register("zone.max-lofi",
2549             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2550             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2551             &zone_max_lofi_ops);
2552 
 
2717         if (zone->zone_pfexecd != NULL)
2718                 klpd_freelist(&zone->zone_pfexecd);
2719         id_free(zoneid_space, zone->zone_id);
2720         mutex_destroy(&zone->zone_lock);
2721         cv_destroy(&zone->zone_cv);
2722         rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2723         rw_destroy(&zone->zone_mntfs_db_lock);
2724         kmem_free(zone, sizeof (zone_t));
2725 }
2726 
2727 /*
2728  * See block comment at the top of this file for information about zone
2729  * status values.
2730  */
2731 /*
2732  * Convenience function for setting zone status.
2733  */
2734 static void
2735 zone_status_set(zone_t *zone, zone_status_t status)
2736 {
2737 
2738         nvlist_t *nvl = NULL;
2739         ASSERT(MUTEX_HELD(&zone_status_lock));
2740         ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2741             status >= zone_status_get(zone));
2742 
2743         if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2744             nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2745             nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2746             zone_status_table[status]) ||
2747             nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2748             zone_status_table[zone->zone_status]) ||
2749             nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2750             nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2751             sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2752             ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2753 #ifdef DEBUG
2754                 (void) printf(
2755                     "Failed to allocate and send zone state change event.\n");
2756 #endif
2757         }
2758         nvlist_free(nvl);
2759 
2760         zone->zone_status = status;
2761 
2762         cv_broadcast(&zone->zone_cv);
2763 }
2764 
2765 /*
2766  * Public function to retrieve the zone status.  The zone status may
2767  * change after it is retrieved.
2768  */
2769 zone_status_t
2770 zone_status_get(zone_t *zone)
 
3331         }
3332 
3333         ASSERT(refcnt == 0);
3334         /*
3335          * zsched has exited; the zone is dead.
3336          */
3337         zone->zone_zsched = NULL;            /* paranoia */
3338         mutex_enter(&zone_status_lock);
3339         zone_status_set(zone, ZONE_IS_DEAD);
3340 out:
3341         mutex_exit(&zone_status_lock);
3342         zone_rele(zone);
3343 }
3344 
3345 zoneid_t
3346 getzoneid(void)
3347 {
3348         return (curproc->p_zone->zone_id);
3349 }
3350 
3351 /*
3352  * Internal versions of zone_find_by_*().  These don't zone_hold() or
3353  * check the validity of a zone's state.
3354  */
3355 static zone_t *
3356 zone_find_all_by_id(zoneid_t zoneid)
3357 {
3358         mod_hash_val_t hv;
3359         zone_t *zone = NULL;
3360 
3361         ASSERT(MUTEX_HELD(&zonehash_lock));
3362 
3363         if (mod_hash_find(zonehashbyid,
3364             (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3365                 zone = (zone_t *)hv;
3366         return (zone);
3367 }
3368 
3369 static zone_t *
3370 zone_find_all_by_label(const ts_label_t *label)
 
4675                 ASSERT(error == 0);
4676                 for (i = 0; i < nelem; i++) {
4677                         if (error = nvlist2rctlval(nvlarray[i], &rv))
4678                                 goto out;
4679                 }
4680                 if (rctl_invalid_value(rde, &rv)) {
4681                         error = EINVAL;
4682                         goto out;
4683                 }
4684         }
4685         error = 0;
4686         *nvlp = nvl;
4687 out:
4688         kmem_free(kbuf, buflen);
4689         if (error && nvl != NULL)
4690                 nvlist_free(nvl);
4691         return (error);
4692 }
4693 
4694 int
4695 zone_create_error(int er_error, int er_ext, int *er_out) {
4696         if (er_out != NULL) {
4697                 if (copyout(&er_ext, er_out, sizeof (int))) {
4698                         return (set_errno(EFAULT));
4699                 }
4700         }
4701         return (set_errno(er_error));
4702 }
4703 
4704 static int
4705 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4706 {
4707         ts_label_t *tsl;
4708         bslabel_t blab;
4709 
4710         /* Get label from user */
4711         if (copyin(lab, &blab, sizeof (blab)) != 0)
4712                 return (EFAULT);
4713         tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4714         if (tsl == NULL)
4715                 return (ENOMEM);
 
4765 
4766         kmem_free(kbuf, buflen);
4767         return (0);
4768 }
4769 
4770 /*
4771  * System call to create/initialize a new zone named 'zone_name', rooted
4772  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4773  * and initialized with the zone-wide rctls described in 'rctlbuf', and
4774  * with labeling set by 'match', 'doi', and 'label'.
4775  *
4776  * If extended error is non-null, we may use it to return more detailed
4777  * error information.
4778  */
4779 static zoneid_t
4780 zone_create(const char *zone_name, const char *zone_root,
4781     const priv_set_t *zone_privs, size_t zone_privssz,
4782     caddr_t rctlbuf, size_t rctlbufsz,
4783     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4784     int match, uint32_t doi, const bslabel_t *label,
4785     int flags)
4786 {
4787         struct zsched_arg zarg;
4788         nvlist_t *rctls = NULL;
4789         proc_t *pp = curproc;
4790         zone_t *zone, *ztmp;
4791         zoneid_t zoneid;
4792         int error;
4793         int error2 = 0;
4794         char *str;
4795         cred_t *zkcr;
4796         boolean_t insert_label_hash;
4797 
4798         if (secpolicy_zone_config(CRED()) != 0)
4799                 return (set_errno(EPERM));
4800 
4801         /* can't boot zone from within chroot environment */
4802         if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4803                 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4804                     extended_error));
4805 
4806         zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4807         zoneid = zone->zone_id = id_alloc(zoneid_space);
4808         zone->zone_status = ZONE_IS_UNINITIALIZED;
4809         zone->zone_pool = pool_default;
4810         zone->zone_pool_mod = gethrtime();
4811         zone->zone_psetid = ZONE_PS_INVAL;
4812         zone->zone_ncpus = 0;
4813         zone->zone_ncpus_online = 0;
4814         zone->zone_restart_init = B_TRUE;
4815         zone->zone_reboot_on_init_exit = B_FALSE;
4816         zone->zone_init_status = -1;
4817         zone->zone_brand = &native_brand;
4818         zone->zone_initname = NULL;
4819         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4820         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4821         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4822         cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4823         list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4824             offsetof(zone_ref_t, zref_linkage));
4825         list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4826             offsetof(struct zsd_entry, zsd_linkage));
4827         list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
 
5160                 return (set_errno(EINTR));
5161         }
5162 
5163         /*
5164          * Boot (starting init) might have failed, in which case the zone
5165          * will go to the SHUTTING_DOWN state; an appropriate errno will
5166          * be placed in zone->zone_boot_err, and so we return that.
5167          */
5168         err = zone->zone_boot_err;
5169         zone_rele(zone);
5170         return (err ? set_errno(err) : 0);
5171 }
5172 
5173 /*
5174  * Kills all user processes in the zone, waiting for them all to exit
5175  * before returning.
5176  */
5177 static int
5178 zone_empty(zone_t *zone)
5179 {
5180         int waitstatus;
5181 
5182         /*
5183          * We need to drop zonehash_lock before killing all
5184          * processes, otherwise we'll deadlock with zone_find_*
5185          * which can be called from the exit path.
5186          */
5187         ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5188         while ((waitstatus = zone_status_timedwait_sig(zone,
5189             ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5190                 killall(zone->zone_id);
5191         }
5192         /*
5193          * return EINTR if we were signaled
5194          */
5195         if (waitstatus == 0)
5196                 return (EINTR);
5197         return (0);
5198 }
5199 
5200 /*
5201  * This function implements the policy for zone visibility.
5202  *
5203  * In standard Solaris, a non-global zone can only see itself.
5204  *
5205  * In Trusted Extensions, a labeled zone can lookup any zone whose label
5206  * it dominates. For this test, the label of the global zone is treated as
5207  * admin_high so it is special-cased instead of being checked for dominance.
5208  *
5209  * Returns true if zone attributes are viewable, false otherwise.
5210  */
5211 static boolean_t
 
5975                 size = strlen(outstr) + 1;
5976                 if (bufsize > size)
5977                         bufsize = size;
5978                 if (buf != NULL) {
5979                         err = copyoutstr(outstr, buf, bufsize, NULL);
5980                         if (err != 0 && err != ENAMETOOLONG)
5981                                 error = EFAULT;
5982                 }
5983                 break;
5984         case ZONE_ATTR_NETWORK:
5985                 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5986                 if (copyin(buf, zbuf, bufsize) != 0) {
5987                         error = EFAULT;
5988                 } else {
5989                         error = zone_get_network(zoneid, zbuf);
5990                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5991                                 error = EFAULT;
5992                 }
5993                 kmem_free(zbuf, bufsize);
5994                 break;
5995         case ZONE_ATTR_SCHED_FIXEDHI:
5996                 size = sizeof (boolean_t);
5997                 if (bufsize > size)
5998                         bufsize = size;
5999 
6000                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6001                     bufsize) != 0)
6002                         error = EFAULT;
6003                 break;
6004         default:
6005                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6006                         size = bufsize;
6007                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6008                 } else {
6009                         error = EINVAL;
6010                 }
6011         }
6012         zone_rele(zone);
6013 
6014         if (error)
 
6807                                 return (set_errno(EFAULT));
6808                         }
6809                         zs.zone_name =
6810                             (const char *)(unsigned long)zs32.zone_name;
6811                         zs.zone_root =
6812                             (const char *)(unsigned long)zs32.zone_root;
6813                         zs.zone_privs =
6814                             (const struct priv_set *)
6815                             (unsigned long)zs32.zone_privs;
6816                         zs.zone_privssz = zs32.zone_privssz;
6817                         zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6818                         zs.rctlbufsz = zs32.rctlbufsz;
6819                         zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6820                         zs.zfsbufsz = zs32.zfsbufsz;
6821                         zs.extended_error =
6822                             (int *)(unsigned long)zs32.extended_error;
6823                         zs.match = zs32.match;
6824                         zs.doi = zs32.doi;
6825                         zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6826                         zs.flags = zs32.flags;
6827 #else
6828                         panic("get_udatamodel() returned bogus result\n");
6829 #endif
6830                 }
6831 
6832                 return (zone_create(zs.zone_name, zs.zone_root,
6833                     zs.zone_privs, zs.zone_privssz,
6834                     (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6835                     (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6836                     zs.extended_error, zs.match, zs.doi,
6837                     zs.label, zs.flags));
6838         case ZONE_BOOT:
6839                 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6840         case ZONE_DESTROY:
6841                 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6842         case ZONE_GETATTR:
6843                 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6844                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6845         case ZONE_SETATTR:
6846                 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6847                     (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6848         case ZONE_ENTER:
6849                 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6850         case ZONE_LIST:
6851                 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6852         case ZONE_SHUTDOWN:
6853                 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6854         case ZONE_LOOKUP:
6855                 return (zone_lookup((const char *)arg1));
6856         case ZONE_VERSION:
6857                 return (zone_version((int *)arg1));
 
7096          */
7097         ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7098         if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7099                 /*
7100                  * This zone is already on its way down.
7101                  */
7102                 mutex_exit(&zone_status_lock);
7103                 return (0);
7104         }
7105         /*
7106          * Prevent future zone_enter()s
7107          */
7108         zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7109         mutex_exit(&zone_status_lock);
7110 
7111         /*
7112          * Kill everyone now and call zoneadmd later.
7113          * zone_ki_call_zoneadmd() will do a more thorough job of this
7114          * later.
7115          */
7116         killall(zone->zone_id);
7117         /*
7118          * Now, create the thread to contact zoneadmd and do the rest of the
7119          * work.  This thread can't be created in our zone otherwise
7120          * zone_destroy() would deadlock.
7121          */
7122         zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7123         zargp->arg.cmd = zcmd;
7124         zargp->arg.uniqid = zone->zone_uniqid;
7125         zargp->zone = zone;
7126         (void) strcpy(zargp->arg.locale, "C");
7127         /* mdep was already copied in for us by uadmin */
7128         if (mdep != NULL)
7129                 (void) strlcpy(zargp->arg.bootbuf, mdep,
7130                     sizeof (zargp->arg.bootbuf));
7131         zone_hold(zone);
7132 
7133         (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7134             TS_RUN, minclsyspri);
7135         exit(CLD_EXITED, 0);
7136 
 
 |