358 */
359 static char *zone_ref_subsys_names[] = {
360 "NFS", /* ZONE_REF_NFS */
361 "NFSv4", /* ZONE_REF_NFSV4 */
362 "SMBFS", /* ZONE_REF_SMBFS */
363 "MNTFS", /* ZONE_REF_MNTFS */
364 "LOFI", /* ZONE_REF_LOFI */
365 "VFS", /* ZONE_REF_VFS */
366 "IPC" /* ZONE_REF_IPC */
367 };
368
369 /*
370 * This isn't static so lint doesn't complain.
371 */
372 rctl_hndl_t rc_zone_cpu_shares;
373 rctl_hndl_t rc_zone_locked_mem;
374 rctl_hndl_t rc_zone_max_swap;
375 rctl_hndl_t rc_zone_phys_mem;
376 rctl_hndl_t rc_zone_max_lofi;
377 rctl_hndl_t rc_zone_cpu_cap;
378 rctl_hndl_t rc_zone_cpu_baseline;
379 rctl_hndl_t rc_zone_cpu_burst_time;
380 rctl_hndl_t rc_zone_zfs_io_pri;
381 rctl_hndl_t rc_zone_nlwps;
382 rctl_hndl_t rc_zone_nprocs;
383 rctl_hndl_t rc_zone_shmmax;
384 rctl_hndl_t rc_zone_shmmni;
385 rctl_hndl_t rc_zone_semmni;
386 rctl_hndl_t rc_zone_msgmni;
387
388 const char * const zone_default_initname = "/sbin/init";
389 static char * const zone_prefix = "/zone/";
390 static int zone_shutdown(zoneid_t zoneid);
391 static int zone_add_datalink(zoneid_t, datalink_id_t);
392 static int zone_remove_datalink(zoneid_t, datalink_id_t);
393 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
394 static int zone_set_network(zoneid_t, zone_net_data_t *);
395 static int zone_get_network(zoneid_t, zone_net_data_t *);
396
397 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
398
399 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
406 kmutex_t *);
407 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
408 kmutex_t *);
409
410 /*
411 * Bump this number when you alter the zone syscall interfaces; this is
412 * because we need to have support for previous API versions in libc
413 * to support patching; libc calls into the kernel to determine this number.
414 *
415 * Version 1 of the API is the version originally shipped with Solaris 10.
416 * Version 2 alters the zone_create system call in order to support more
417 * arguments by moving the args into a structure; and to do better
418 * error reporting when zone_create() fails.
419 * Version 3 alters the zone_create system call in order to support the
420 * import of ZFS datasets to zones.
421 * Version 4 alters the zone_create system call in order to support
422 * Trusted Extensions.
423 * Version 5 alters the zone_boot system call, and converts its old
424 * bootargs parameter to be set by the zone_setattr API instead.
425 * Version 6 adds the flag argument to zone_create.
426 * Version 7 adds the requested zoneid to zone_create.
427 */
428 static const int ZONE_SYSCALL_API_VERSION = 7;
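
/*
 * Illustrative only (editorial addition, not part of the original source):
 * a userland consumer could verify compatibility against this value through
 * the ZONE_VERSION subcommand of the zone syscall, e.g.
 *
 *	int ver;
 *
 *	if (syscall(SYS_zone, ZONE_VERSION, &ver) == 0 && ver >= 7)
 *		use_v7_interfaces();	(placeholder for version-7-only code)
 *
 * This sketch assumes the raw SYS_zone trap; real consumers go through
 * libc's private wrappers rather than issuing the trap directly.
 */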
429
430 /*
431 * Certain filesystems (such as NFS and autofs) need to know which zone
432 * the mount is being placed in. Because of this, we need to be able to
433 * ensure that a zone isn't in the process of being created/destroyed such
434 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
435 * it gets added to the list of mounted zones, it ends up on the wrong zone's
436 * mount list. Since a zone can't reside on an NFS file system, we don't
437 * have to worry about the zonepath itself.
438 *
439 * The following functions: block_mounts()/resume_mounts() and
440 * mount_in_progress()/mount_completed() are used by zones and the VFS
441 * layer (respectively) to synchronize zone state transitions and new
442 * mounts within a zone. This synchronization is on a per-zone basis, so
443 * activity for one zone will not interfere with activity for another zone.
444 *
445 * The semantics are like a reader-reader lock such that there may
446 * either be multiple mounts (or zone state transitions, if that weren't
447 * serialized by zonehash_lock) in progress at the same time, but not
448 * both.
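 *
 * Illustrative usage sketch (editorial addition; signatures and return
 * conventions are assumed from the description above, not normative):
 *
 *	VFS side, bracketing the work that adds a mount to a zone's list:
 *
 *		mount_in_progress(zone);
 *		... perform the mount, add it to the zone's vfs list ...
 *		mount_completed(zone);
 *
 *	zones side, bracketing a zone state transition:
 *
 *		if (block_mounts(zone) == 0)
 *			return (EINTR);		(interrupted while waiting)
 *		... perform the state transition ...
 *		resume_mounts(zone);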
1365
1366 ASSERT(MUTEX_HELD(&p->p_lock));
1367 ASSERT(e->rcep_t == RCENTITY_ZONE);
1368
1369 if (zone == NULL)
1370 return (0);
1371
1372 /*
1373 * set cap to the new value.
1374 */
1375 return (cpucaps_zone_set(zone, nv));
1376 }
1377
1378 static rctl_ops_t zone_cpu_cap_ops = {
1379 rcop_no_action,
1380 zone_cpu_cap_get,
1381 zone_cpu_cap_set,
1382 rcop_no_test
1383 };
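
/*
 * Editorial note: the initializers above fill the rctl_ops_t slots in
 * order: action, usage (get), set and test.  rcop_no_action and rcop_no_test
 * are the stock no-op callbacks used when a resource control does not need
 * that hook; this table is later handed to rctl_register() in zone_init()
 * below to wire "zone.cpu-cap" to these callbacks.
 */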
1384
1385 /*ARGSUSED*/
1386 static rctl_qty_t
1387 zone_cpu_base_get(rctl_t *rctl, struct proc *p)
1388 {
1389 ASSERT(MUTEX_HELD(&p->p_lock));
1390 return (cpucaps_zone_get_base(p->p_zone));
1391 }
1392
1393 /*
1394 * The zone cpu base is used to set the baseline CPU for the zone
1395 * so we can track when the zone is bursting.
1396 */
1397 /*ARGSUSED*/
1398 static int
1399 zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1400 rctl_qty_t nv)
1401 {
1402 zone_t *zone = e->rcep_p.zone;
1403
1404 ASSERT(MUTEX_HELD(&p->p_lock));
1405 ASSERT(e->rcep_t == RCENTITY_ZONE);
1406
1407 if (zone == NULL)
1408 return (0);
1409
1410 return (cpucaps_zone_set_base(zone, nv));
1411 }
1412
1413 static rctl_ops_t zone_cpu_base_ops = {
1414 rcop_no_action,
1415 zone_cpu_base_get,
1416 zone_cpu_base_set,
1417 rcop_no_test
1418 };
1419
1420 /*ARGSUSED*/
1421 static rctl_qty_t
1422 zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p)
1423 {
1424 ASSERT(MUTEX_HELD(&p->p_lock));
1425 return (cpucaps_zone_get_burst_time(p->p_zone));
1426 }
1427
1428 /*
1429 * The zone cpu burst time is used to set the amount of time the zone's
1430 * CPU usage can stay above its baseline (i.e., how long it can be bursting).
1431 */
1432 /*ARGSUSED*/
1433 static int
1434 zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1435 rctl_qty_t nv)
1436 {
1437 zone_t *zone = e->rcep_p.zone;
1438
1439 ASSERT(MUTEX_HELD(&p->p_lock));
1440 ASSERT(e->rcep_t == RCENTITY_ZONE);
1441
1442 if (zone == NULL)
1443 return (0);
1444
1445 return (cpucaps_zone_set_burst_time(zone, nv));
1446 }
1447
1448 static rctl_ops_t zone_cpu_burst_time_ops = {
1449 rcop_no_action,
1450 zone_cpu_burst_time_get,
1451 zone_cpu_burst_time_set,
1452 rcop_no_test
1453 };
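
/*
 * Illustrative example (editorial addition; enforcement itself lives in the
 * cpucaps subsystem): with zone.cpu-cap=400, zone.cpu-baseline=200 and
 * zone.cpu-burst-time=60, a zone whose normal ceiling is two CPUs worth of
 * time (cap values are in units of 1% of a single CPU) may burst up to four
 * CPUs, and once it has stayed above its 200 baseline for roughly 60
 * seconds of burst time it can be clamped back toward the baseline.  The
 * numbers here are examples only.
 */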
1454
1455 /*
1456 * zone.zfs-io-pri resource control support (IO priority).
1457 */
1458 /*ARGSUSED*/
1459 static rctl_qty_t
1460 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1461 {
1462 ASSERT(MUTEX_HELD(&p->p_lock));
1463 return (p->p_zone->zone_zfs_io_pri);
1464 }
1465
1466 /*ARGSUSED*/
1467 static int
1468 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1469 rctl_qty_t nv)
1470 {
1471 zone_t *zone = e->rcep_p.zone;
1472
1473 ASSERT(MUTEX_HELD(&p->p_lock));
1474 ASSERT(e->rcep_t == RCENTITY_ZONE);
1475
1969 zk->zk_usage.value.ui64 = zone->zone_nprocs;
1970 zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1971 return (0);
1972 }
1973
1974 static int
1975 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1976 {
1977 zone_t *zone = ksp->ks_private;
1978 zone_kstat_t *zk = ksp->ks_data;
1979
1980 if (rw == KSTAT_WRITE)
1981 return (EACCES);
1982
1983 zk->zk_usage.value.ui64 = zone->zone_max_swap;
1984 zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1985 return (0);
1986 }
1987
1988 static kstat_t *
1989 zone_rctl_kstat_create_common(zone_t *zone, char *name,
1990 int (*updatefunc) (kstat_t *, int))
1991 {
1992 kstat_t *ksp;
1993 zone_kstat_t *zk;
1994
1995 ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1996 sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1997 KSTAT_FLAG_VIRTUAL);
1998
1999 if (ksp == NULL)
2000 return (NULL);
2001
2002 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
2003 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2004 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
2005 kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
2006 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
2007 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
2008 ksp->ks_update = updatefunc;
2009 ksp->ks_private = zone;
2198 kstat_t *ksp;
2199 zone_mcap_kstat_t *zmp;
2200
2201 if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2202 zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2203 sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2204 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2205 return (NULL);
2206
2207 if (zone->zone_id != GLOBAL_ZONEID)
2208 kstat_zone_add(ksp, GLOBAL_ZONEID);
2209
2210 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2211 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2212 ksp->ks_lock = &zone->zone_mcap_lock;
2213 zone->zone_mcap_stats = zmp;
2214
2215 /* The kstat "name" field is not large enough for a full zonename */
2216 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2217 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2218 kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2219 kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2220 kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2221 kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2222 kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2223 kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2224 kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2225 kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2226 kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2227 kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2228 kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2229 KSTAT_DATA_UINT64);
2230 kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2231 KSTAT_DATA_UINT64);
2232 kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2233 KSTAT_DATA_UINT64);
2234
2235 ksp->ks_update = zone_mcap_kstat_update;
2236 ksp->ks_private = zone;
2237
2251
2252 tmp = zone->zone_utime;
2253 scalehrtime(&tmp);
2254 zmp->zm_utime.value.ui64 = tmp;
2255 tmp = zone->zone_stime;
2256 scalehrtime(&tmp);
2257 zmp->zm_stime.value.ui64 = tmp;
2258 tmp = zone->zone_wtime;
2259 scalehrtime(&tmp);
2260 zmp->zm_wtime.value.ui64 = tmp;
2261
2262 zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2263 zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2264 zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2265
2266 zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2267 zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2268 zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2269 zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2270
2271 zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim;
2272
2273 zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2274
2275 zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2276 zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2277
2278 return (0);
2279 }
2280
2281 static kstat_t *
2282 zone_misc_kstat_create(zone_t *zone)
2283 {
2284 kstat_t *ksp;
2285 zone_misc_kstat_t *zmp;
2286
2287 if ((ksp = kstat_create_zone("zones", zone->zone_id,
2288 zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2289 sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2290 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2291 return (NULL);
2292
2296 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2297 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2298 ksp->ks_lock = &zone->zone_misc_lock;
2299 zone->zone_misc_stats = zmp;
2300
2301 /* The kstat "name" field is not large enough for a full zonename */
2302 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2303 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2304 kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2305 kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2306 kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2307 kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2308 kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2309 kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2310 KSTAT_DATA_UINT32);
2311 kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2312 kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2313 KSTAT_DATA_UINT32);
2314 kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2315 kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2316 kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim",
2317 KSTAT_DATA_UINT32);
2318 kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2319 KSTAT_DATA_UINT32);
2320 kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2321 kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2322
2323 ksp->ks_update = zone_misc_kstat_update;
2324 ksp->ks_private = zone;
2325
2326 kstat_install(ksp);
2327 return (ksp);
2328 }
2329
2330 static void
2331 zone_kstat_create(zone_t *zone)
2332 {
2333 zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone,
2334 "lockedmem", zone_lockedmem_kstat_update);
2335 zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone,
2336 "swapresv", zone_swapresv_kstat_update);
2337 zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone,
2338 "physicalmem", zone_physmem_kstat_update);
2339 zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone,
2340 "nprocs", zone_nprocs_kstat_update);
2341
2342 if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2343 zone->zone_vfs_stats = kmem_zalloc(
2344 sizeof (zone_vfs_kstat_t), KM_SLEEP);
2345 }
2346
2347 if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) {
2348 zone->zone_zfs_stats = kmem_zalloc(
2349 sizeof (zone_zfs_kstat_t), KM_SLEEP);
2350 }
2351
2352 if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2353 zone->zone_mcap_stats = kmem_zalloc(
2354 sizeof (zone_mcap_kstat_t), KM_SLEEP);
2355 }
2356
2357 if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2358 zone->zone_misc_stats = kmem_zalloc(
2359 sizeof (zone_misc_kstat_t), KM_SLEEP);
2360 }
2361 }
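
/*
 * Illustrative consumer sketch (editorial addition, userland): the
 * "memory_cap" kstat created above can be read with libkstat, e.g.
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "memory_cap", zoneid, zonename);
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "rss");
 *		... kn->value.ui64 is the zone's current RSS ...
 *	}
 *	(void) kstat_close(kc);
 *
 * (zoneid/zonename stand in for the target zone's id and name; from the
 * command line, "kstat -m memory_cap" shows the same counters.)
 */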
2362
2363 static void
2364 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2365 {
2366 void *data;
2367
2368 if (*pkstat != NULL) {
2369 data = (*pkstat)->ks_data;
2370 kstat_delete(*pkstat);
2371 kmem_free(data, datasz);
2372 *pkstat = NULL;
2373 }
2374 }
2375
2376 static void
2377 zone_kstat_delete(zone_t *zone)
2378 {
2379 zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2380 sizeof (zone_kstat_t));
2381 zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2382 sizeof (zone_kstat_t));
2383 zone_kstat_delete_common(&zone->zone_physmem_kstat,
2384 sizeof (zone_kstat_t));
2385 zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2386 sizeof (zone_kstat_t));
2387
2388 zone_kstat_delete_common(&zone->zone_vfs_ksp,
2389 sizeof (zone_vfs_kstat_t));
2390 zone_kstat_delete_common(&zone->zone_zfs_ksp,
2391 sizeof (zone_zfs_kstat_t));
2392 zone_kstat_delete_common(&zone->zone_mcap_ksp,
2393 sizeof (zone_mcap_kstat_t));
2394 zone_kstat_delete_common(&zone->zone_misc_ksp,
2395 sizeof (zone_misc_kstat_t));
2396 }
2397
2398 /*
2399 * Called very early on in boot to initialize the ZSD list so that
2400 * zone_key_create() can be called before zone_init(). It also initializes
2401 * portions of zone0 which may be used before zone_init() is called. The
2402 * variable "global_zone" will be set when zone0 is fully initialized by
2403 * zone_init().
2404 */
2405 void
2406 zone_zsd_init(void)
2407 {
2408 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2409 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2410 list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2411 offsetof(struct zsd_entry, zsd_linkage));
2412 list_create(&zone_active, sizeof (zone_t),
2413 offsetof(zone_t, zone_linkage));
2414 list_create(&zone_deathrow, sizeof (zone_t),
2415 offsetof(zone_t, zone_linkage));
2437 zone0.zone_name = GLOBAL_ZONENAME;
2438 zone0.zone_nodename = utsname.nodename;
2439 zone0.zone_domain = srpc_domain;
2440 zone0.zone_hostid = HW_INVALID_HOSTID;
2441 zone0.zone_fs_allowed = NULL;
2442 zone0.zone_ref = 1;
2443 zone0.zone_id = GLOBAL_ZONEID;
2444 zone0.zone_status = ZONE_IS_RUNNING;
2445 zone0.zone_rootpath = "/";
2446 zone0.zone_rootpathlen = 2;
2447 zone0.zone_psetid = ZONE_PS_INVAL;
2448 zone0.zone_ncpus = 0;
2449 zone0.zone_ncpus_online = 0;
2450 zone0.zone_proc_initpid = 1;
2451 zone0.zone_initname = initname;
2452 zone0.zone_lockedmem_kstat = NULL;
2453 zone0.zone_swapresv_kstat = NULL;
2454 zone0.zone_physmem_kstat = NULL;
2455 zone0.zone_nprocs_kstat = NULL;
2456 zone0.zone_zfs_io_pri = 1;
2457 zone0.zone_stime = 0;
2458 zone0.zone_utime = 0;
2459 zone0.zone_wtime = 0;
2460
2461 list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2462 offsetof(zone_ref_t, zref_linkage));
2463 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2464 offsetof(struct zsd_entry, zsd_linkage));
2465 list_insert_head(&zone_active, &zone0);
2466
2467 /*
2468 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2469 * to anything meaningful. It is assigned to be 'rootdir' in
2470 * vfs_mountroot().
2471 */
2472 zone0.zone_rootvp = NULL;
2473 zone0.zone_vfslist = NULL;
2474 zone0.zone_bootargs = initargs;
2475 zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2476 /*
2547 /*
2548 * Create ID space for zone IDs. ID 0 is reserved for the
2549 * global zone.
2550 */
2551 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2552
2553 /*
2554 * Initialize generic zone resource controls, if any.
2555 */
2556 rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2557 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2558 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2559 FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2560
2561 rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2562 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2563 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
2564 RCTL_GLOBAL_INFINITE,
2565 MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2566
2567 rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline",
2568 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2569 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2570 MAXCAP, MAXCAP, &zone_cpu_base_ops);
2571
2572 rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time",
2573 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2574 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2575 INT_MAX, INT_MAX, &zone_cpu_burst_time_ops);
2576
2577 rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2578 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2579 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2580 16384, 16384, &zone_zfs_io_pri_ops);
2581
2582 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2583 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2584 INT_MAX, INT_MAX, &zone_lwps_ops);
2585
2586 rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2587 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2588 INT_MAX, INT_MAX, &zone_procs_ops);
2589
2590 /*
2591 * System V IPC resource controls
2592 */
2593 rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2594 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2595 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2596
2603 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2604
2605 rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2606 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2607 RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2608
2609 /*
2610 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
2611 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2612 */
2613 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2614 bzero(dval, sizeof (rctl_val_t));
2615 dval->rcv_value = 1;
2616 dval->rcv_privilege = RCPRIV_PRIVILEGED;
2617 dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2618 dval->rcv_action_recip_pid = -1;
2619
2620 rde = rctl_dict_lookup("zone.cpu-shares");
2621 (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2622
2623 /*
2624 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
2625 * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
2626 */
2627 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2628 bzero(dval, sizeof (rctl_val_t));
2629 dval->rcv_value = 1;
2630 dval->rcv_privilege = RCPRIV_PRIVILEGED;
2631 dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2632 dval->rcv_action_recip_pid = -1;
2633
2634 rde = rctl_dict_lookup("zone.zfs-io-priority");
2635 (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2636
2637 rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2638 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2639 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2640 &zone_locked_mem_ops);
2641
2642 rc_zone_max_swap = rctl_register("zone.max-swap",
2643 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2644 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2645 &zone_max_swap_ops);
2646
2647 rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2648 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2649 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2650 &zone_phys_mem_ops);
2651
2652 rc_zone_max_lofi = rctl_register("zone.max-lofi",
2653 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2654 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2655 &zone_max_lofi_ops);
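
/*
 * Editorial note: the registrations above all follow the same pattern, so a
 * hypothetical additional zone rctl (illustrative only; "zone.max-foo" and
 * zone_max_foo_ops do not exist) would look like:
 *
 *	rc_zone_max_foo = rctl_register("zone.max-foo",
 *	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
 *	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
 *	    &zone_max_foo_ops);
 *
 * where zone_max_foo_ops supplies the usage/set/test callbacks, just as
 * zone_cpu_cap_ops and friends do above.
 */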
2656
2821 if (zone->zone_pfexecd != NULL)
2822 klpd_freelist(&zone->zone_pfexecd);
2823 id_free(zoneid_space, zone->zone_id);
2824 mutex_destroy(&zone->zone_lock);
2825 cv_destroy(&zone->zone_cv);
2826 rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2827 rw_destroy(&zone->zone_mntfs_db_lock);
2828 kmem_free(zone, sizeof (zone_t));
2829 }
2830
2831 /*
2832 * See block comment at the top of this file for information about zone
2833 * status values.
2834 */
2835 /*
2836 * Convenience function for setting zone status.
2837 */
2838 static void
2839 zone_status_set(zone_t *zone, zone_status_t status)
2840 {
2841 timestruc_t now;
2842 uint64_t t;
2843
2844 nvlist_t *nvl = NULL;
2845 ASSERT(MUTEX_HELD(&zone_status_lock));
2846 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2847 status >= zone_status_get(zone));
2848
2849 /* Current time since Jan 1 1970; consumers expect this in nanoseconds */
2850 gethrestime(&now);
2851 t = (now.tv_sec * NANOSEC) + now.tv_nsec;
2852
2853 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2854 nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2855 nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2856 zone_status_table[status]) ||
2857 nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2858 zone_status_table[zone->zone_status]) ||
2859 nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2860 nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) ||
2861 sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2862 ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2863 #ifdef DEBUG
2864 (void) printf(
2865 "Failed to allocate and send zone state change event.\n");
2866 #endif
2867 }
2868 nvlist_free(nvl);
2869
2870 zone->zone_status = status;
2871
2872 cv_broadcast(&zone->zone_cv);
2873 }
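
/*
 * Illustrative consumer sketch (editorial addition, userland): the state
 * change events published above can be received over the same channel with
 * the (private) libsysevent channel interfaces, roughly:
 *
 *	evchan_t *ch;
 *
 *	if (sysevent_evc_bind(ZONE_EVENT_CHANNEL, &ch, 0) == 0)
 *		(void) sysevent_evc_subscribe(ch, "my-subscriber",
 *		    ZONE_EVENT_STATUS_CLASS, my_handler, NULL, 0);
 *
 * where my_handler() unpacks ZONE_CB_NAME, ZONE_CB_NEWSTATE, ZONE_CB_ZONEID
 * and friends from the event's nvlist.  This is a sketch only, not the
 * exact code used by existing consumers.
 */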
2874
2875 /*
2876 * Public function to retrieve the zone status. The zone status may
2877 * change after it is retrieved.
2878 */
2879 zone_status_t
2880 zone_status_get(zone_t *zone)
3441 }
3442
3443 ASSERT(refcnt == 0);
3444 /*
3445 * zsched has exited; the zone is dead.
3446 */
3447 zone->zone_zsched = NULL; /* paranoia */
3448 mutex_enter(&zone_status_lock);
3449 zone_status_set(zone, ZONE_IS_DEAD);
3450 out:
3451 mutex_exit(&zone_status_lock);
3452 zone_rele(zone);
3453 }
3454
3455 zoneid_t
3456 getzoneid(void)
3457 {
3458 return (curproc->p_zone->zone_id);
3459 }
3460
3461 zoneid_t
3462 getzonedid(void)
3463 {
3464 return (curproc->p_zone->zone_did);
3465 }
3466
3467 /*
3468 * Internal versions of zone_find_by_*(). These don't zone_hold() or
3469 * check the validity of a zone's state.
3470 */
3471 static zone_t *
3472 zone_find_all_by_id(zoneid_t zoneid)
3473 {
3474 mod_hash_val_t hv;
3475 zone_t *zone = NULL;
3476
3477 ASSERT(MUTEX_HELD(&zonehash_lock));
3478
3479 if (mod_hash_find(zonehashbyid,
3480 (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3481 zone = (zone_t *)hv;
3482 return (zone);
3483 }
3484
3485 static zone_t *
3486 zone_find_all_by_label(const ts_label_t *label)
4791 ASSERT(error == 0);
4792 for (i = 0; i < nelem; i++) {
4793 if (error = nvlist2rctlval(nvlarray[i], &rv))
4794 goto out;
4795 }
4796 if (rctl_invalid_value(rde, &rv)) {
4797 error = EINVAL;
4798 goto out;
4799 }
4800 }
4801 error = 0;
4802 *nvlp = nvl;
4803 out:
4804 kmem_free(kbuf, buflen);
4805 if (error && nvl != NULL)
4806 nvlist_free(nvl);
4807 return (error);
4808 }
4809
4810 int
4811 zone_create_error(int er_error, int er_ext, int *er_out)
4812 {
4813 if (er_out != NULL) {
4814 if (copyout(&er_ext, er_out, sizeof (int))) {
4815 return (set_errno(EFAULT));
4816 }
4817 }
4818 return (set_errno(er_error));
4819 }
4820
4821 static int
4822 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4823 {
4824 ts_label_t *tsl;
4825 bslabel_t blab;
4826
4827 /* Get label from user */
4828 if (copyin(lab, &blab, sizeof (blab)) != 0)
4829 return (EFAULT);
4830 tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4831 if (tsl == NULL)
4832 return (ENOMEM);
4882
4883 kmem_free(kbuf, buflen);
4884 return (0);
4885 }
4886
4887 /*
4888 * System call to create/initialize a new zone named 'zone_name', rooted
4889 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4890 * and initialized with the zone-wide rctls described in 'rctlbuf', and
4891 * with labeling set by 'match', 'doi', and 'label'.
4892 *
4893 * If extended error is non-null, we may use it to return more detailed
4894 * error information.
4895 */
4896 static zoneid_t
4897 zone_create(const char *zone_name, const char *zone_root,
4898 const priv_set_t *zone_privs, size_t zone_privssz,
4899 caddr_t rctlbuf, size_t rctlbufsz,
4900 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4901 int match, uint32_t doi, const bslabel_t *label,
4902 int flags, zoneid_t zone_did)
4903 {
4904 struct zsched_arg zarg;
4905 nvlist_t *rctls = NULL;
4906 proc_t *pp = curproc;
4907 zone_t *zone, *ztmp;
4908 zoneid_t zoneid, start = GLOBAL_ZONEID;
4909 int error;
4910 int error2 = 0;
4911 char *str;
4912 cred_t *zkcr;
4913 boolean_t insert_label_hash;
4914
4915 if (secpolicy_zone_config(CRED()) != 0)
4916 return (set_errno(EPERM));
4917
4918 /* can't boot zone from within chroot environment */
4919 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4920 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4921 extended_error));
4922
4923 /*
4924 * As the first step of zone creation, we want to allocate a zoneid.
4925 * This allocation is complicated by the fact that netstacks use the
4926 * zoneid to determine their stackid, but netstacks themselves are
4927 * freed asynchronously with respect to zone destruction. This means
4928 * that a netstack reference leak (or in principle, an extraordinarily
4929 * long netstack reference hold) could result in a zoneid being
4930 * allocated that in fact corresponds to a stackid from an active
4931 * (referenced) netstack -- unleashing all sorts of havoc when that
4932 * netstack is actually (re)used. (In the abstract, we might wish a
4933 * zoneid to not be deallocated until its last referencing netstack
4934 * has been released, but netstacks lack a backpointer into their
4935 * referencing zone -- and changing them to have such a pointer would
4936 * be substantial, to put it euphemistically.) To avoid this, we
4937 * detect this condition on allocation: if we have allocated a zoneid
4938 * that corresponds to a netstack that's still in use, we warn about
4939 * it (as it is much more likely to be a reference leak than an actual
4940 * netstack reference), free it, and allocate another. That these
4941 * identifiers are allocated out of an ID space assures that we won't
4942 * see the identifier we just allocated.
4943 */
4944 for (;;) {
4945 zoneid = id_alloc(zoneid_space);
4946
4947 if (!netstack_inuse_by_stackid(zoneid_to_netstackid(zoneid)))
4948 break;
4949
4950 id_free(zoneid_space, zoneid);
4951
4952 if (start == GLOBAL_ZONEID) {
4953 start = zoneid;
4954 } else if (zoneid == start) {
4955 /*
4956 * We have managed to iterate over the entire available
4957 * zoneid space -- there are no identifiers available,
4958 * presumably due to some number of leaked netstack
4959 * references. While it's in principle possible for us
4960 * to continue to try, it seems wiser to give up at
4961 * this point to warn and fail explicitly with a
4962 * distinctive error.
4963 */
4964 cmn_err(CE_WARN, "zone_create() failed: all available "
4965 "zone IDs have netstacks still in use");
4966 return (set_errno(ENFILE));
4967 }
4968
4969 cmn_err(CE_WARN, "unable to reuse zone ID %d; "
4970 "netstack still in use", zoneid);
4971 }
4972
4973 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4974 zone->zone_id = zoneid;
4975 zone->zone_did = zone_did;
4976 zone->zone_status = ZONE_IS_UNINITIALIZED;
4977 zone->zone_pool = pool_default;
4978 zone->zone_pool_mod = gethrtime();
4979 zone->zone_psetid = ZONE_PS_INVAL;
4980 zone->zone_ncpus = 0;
4981 zone->zone_ncpus_online = 0;
4982 zone->zone_restart_init = B_TRUE;
4983 zone->zone_reboot_on_init_exit = B_FALSE;
4984 zone->zone_init_status = -1;
4985 zone->zone_brand = &native_brand;
4986 zone->zone_initname = NULL;
4987 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4988 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4989 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4990 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4991 list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4992 offsetof(zone_ref_t, zref_linkage));
4993 list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4994 offsetof(struct zsd_entry, zsd_linkage));
4995 list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
5328 return (set_errno(EINTR));
5329 }
5330
5331 /*
5332 * Boot (starting init) might have failed, in which case the zone
5333 * will go to the SHUTTING_DOWN state; an appropriate errno will
5334 * be placed in zone->zone_boot_err, and so we return that.
5335 */
5336 err = zone->zone_boot_err;
5337 zone_rele(zone);
5338 return (err ? set_errno(err) : 0);
5339 }
5340
5341 /*
5342 * Kills all user processes in the zone, waiting for them all to exit
5343 * before returning.
5344 */
5345 static int
5346 zone_empty(zone_t *zone)
5347 {
5348 int cnt = 0;
5349 int waitstatus;
5350
5351 /*
5352 * We need to drop zonehash_lock before killing all
5353 * processes, otherwise we'll deadlock with zone_find_*
5354 * which can be called from the exit path.
5355 */
5356 ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
5357 while ((waitstatus = zone_status_timedwait_sig(zone,
5358 ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5359 boolean_t force = B_FALSE;
5360
5361 /* Every 30 seconds, try harder */
5362 if (cnt++ >= 30) {
5363 cmn_err(CE_WARN, "attempt to force kill zone %d\n",
5364 zone->zone_id);
5365 force = B_TRUE;
5366 cnt = 0;
5367 }
5368 killall(zone->zone_id, force);
5369 }
5370 /*
5371 * return EINTR if we were signaled
5372 */
5373 if (waitstatus == 0)
5374 return (EINTR);
5375 return (0);
5376 }
5377
5378 /*
5379 * This function implements the policy for zone visibility.
5380 *
5381 * In standard Solaris, a non-global zone can only see itself.
5382 *
5383 * In Trusted Extensions, a labeled zone can lookup any zone whose label
5384 * it dominates. For this test, the label of the global zone is treated as
5385 * admin_high so it is special-cased instead of being checked for dominance.
5386 *
5387 * Returns true if zone attributes are viewable, false otherwise.
5388 */
5389 static boolean_t
6153 size = strlen(outstr) + 1;
6154 if (bufsize > size)
6155 bufsize = size;
6156 if (buf != NULL) {
6157 err = copyoutstr(outstr, buf, bufsize, NULL);
6158 if (err != 0 && err != ENAMETOOLONG)
6159 error = EFAULT;
6160 }
6161 break;
6162 case ZONE_ATTR_NETWORK:
6163 zbuf = kmem_alloc(bufsize, KM_SLEEP);
6164 if (copyin(buf, zbuf, bufsize) != 0) {
6165 error = EFAULT;
6166 } else {
6167 error = zone_get_network(zoneid, zbuf);
6168 if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
6169 error = EFAULT;
6170 }
6171 kmem_free(zbuf, bufsize);
6172 break;
6173 case ZONE_ATTR_DID:
6174 size = sizeof (zoneid_t);
6175 if (bufsize > size)
6176 bufsize = size;
6177
6178 if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0)
6179 error = EFAULT;
6180 break;
6181 case ZONE_ATTR_SCHED_FIXEDHI:
6182 size = sizeof (boolean_t);
6183 if (bufsize > size)
6184 bufsize = size;
6185
6186 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6187 bufsize) != 0)
6188 error = EFAULT;
6189 break;
6190 default:
6191 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6192 size = bufsize;
6193 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6194 } else {
6195 error = EINVAL;
6196 }
6197 }
6198 zone_rele(zone);
6199
6200 if (error)
6993 return (set_errno(EFAULT));
6994 }
6995 zs.zone_name =
6996 (const char *)(unsigned long)zs32.zone_name;
6997 zs.zone_root =
6998 (const char *)(unsigned long)zs32.zone_root;
6999 zs.zone_privs =
7000 (const struct priv_set *)
7001 (unsigned long)zs32.zone_privs;
7002 zs.zone_privssz = zs32.zone_privssz;
7003 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
7004 zs.rctlbufsz = zs32.rctlbufsz;
7005 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
7006 zs.zfsbufsz = zs32.zfsbufsz;
7007 zs.extended_error =
7008 (int *)(unsigned long)zs32.extended_error;
7009 zs.match = zs32.match;
7010 zs.doi = zs32.doi;
7011 zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
7012 zs.flags = zs32.flags;
7013 zs.zoneid = zs32.zoneid;
7014 #else
7015 panic("get_udatamodel() returned bogus result\n");
7016 #endif
7017 }
7018
7019 return (zone_create(zs.zone_name, zs.zone_root,
7020 zs.zone_privs, zs.zone_privssz,
7021 (caddr_t)zs.rctlbuf, zs.rctlbufsz,
7022 (caddr_t)zs.zfsbuf, zs.zfsbufsz,
7023 zs.extended_error, zs.match, zs.doi,
7024 zs.label, zs.flags, zs.zoneid));
7025 case ZONE_BOOT:
7026 return (zone_boot((zoneid_t)(uintptr_t)arg1));
7027 case ZONE_DESTROY:
7028 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
7029 case ZONE_GETATTR:
7030 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
7031 (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7032 case ZONE_SETATTR:
7033 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
7034 (int)(uintptr_t)arg2, arg3, (size_t)arg4));
7035 case ZONE_ENTER:
7036 return (zone_enter((zoneid_t)(uintptr_t)arg1));
7037 case ZONE_LIST:
7038 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
7039 case ZONE_SHUTDOWN:
7040 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
7041 case ZONE_LOOKUP:
7042 return (zone_lookup((const char *)arg1));
7043 case ZONE_VERSION:
7044 return (zone_version((int *)arg1));
7283 */
7284 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7285 if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7286 /*
7287 * This zone is already on its way down.
7288 */
7289 mutex_exit(&zone_status_lock);
7290 return (0);
7291 }
7292 /*
7293 * Prevent future zone_enter()s
7294 */
7295 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7296 mutex_exit(&zone_status_lock);
7297
7298 /*
7299 * Kill everyone now and call zoneadmd later.
7300 * zone_ki_call_zoneadmd() will do a more thorough job of this
7301 * later.
7302 */
7303 killall(zone->zone_id, B_FALSE);
7304 /*
7305 * Now, create the thread to contact zoneadmd and do the rest of the
7306 * work. This thread can't be created in our zone; otherwise,
7307 * zone_destroy() would deadlock.
7308 */
7309 zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7310 zargp->arg.cmd = zcmd;
7311 zargp->arg.uniqid = zone->zone_uniqid;
7312 zargp->zone = zone;
7313 (void) strcpy(zargp->arg.locale, "C");
7314 /* mdep was already copied in for us by uadmin */
7315 if (mdep != NULL)
7316 (void) strlcpy(zargp->arg.bootbuf, mdep,
7317 sizeof (zargp->arg.bootbuf));
7318 zone_hold(zone);
7319
7320 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7321 TS_RUN, minclsyspri);
7322 exit(CLD_EXITED, 0);
7323
|
358 */
359 static char *zone_ref_subsys_names[] = {
360 "NFS", /* ZONE_REF_NFS */
361 "NFSv4", /* ZONE_REF_NFSV4 */
362 "SMBFS", /* ZONE_REF_SMBFS */
363 "MNTFS", /* ZONE_REF_MNTFS */
364 "LOFI", /* ZONE_REF_LOFI */
365 "VFS", /* ZONE_REF_VFS */
366 "IPC" /* ZONE_REF_IPC */
367 };
368
369 /*
370 * This isn't static so lint doesn't complain.
371 */
372 rctl_hndl_t rc_zone_cpu_shares;
373 rctl_hndl_t rc_zone_locked_mem;
374 rctl_hndl_t rc_zone_max_swap;
375 rctl_hndl_t rc_zone_phys_mem;
376 rctl_hndl_t rc_zone_max_lofi;
377 rctl_hndl_t rc_zone_cpu_cap;
378 rctl_hndl_t rc_zone_zfs_io_pri;
379 rctl_hndl_t rc_zone_nlwps;
380 rctl_hndl_t rc_zone_nprocs;
381 rctl_hndl_t rc_zone_shmmax;
382 rctl_hndl_t rc_zone_shmmni;
383 rctl_hndl_t rc_zone_semmni;
384 rctl_hndl_t rc_zone_msgmni;
385
386 const char * const zone_default_initname = "/sbin/init";
387 static char * const zone_prefix = "/zone/";
388 static int zone_shutdown(zoneid_t zoneid);
389 static int zone_add_datalink(zoneid_t, datalink_id_t);
390 static int zone_remove_datalink(zoneid_t, datalink_id_t);
391 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
392 static int zone_set_network(zoneid_t, zone_net_data_t *);
393 static int zone_get_network(zoneid_t, zone_net_data_t *);
394
395 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
396
397 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
404 kmutex_t *);
405 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
406 kmutex_t *);
407
408 /*
409 * Bump this number when you alter the zone syscall interfaces; this is
410 * because we need to have support for previous API versions in libc
411 * to support patching; libc calls into the kernel to determine this number.
412 *
413 * Version 1 of the API is the version originally shipped with Solaris 10
414 * Version 2 alters the zone_create system call in order to support more
415 * arguments by moving the args into a structure; and to do better
416 * error reporting when zone_create() fails.
417 * Version 3 alters the zone_create system call in order to support the
418 * import of ZFS datasets to zones.
419 * Version 4 alters the zone_create system call in order to support
420 * Trusted Extensions.
421 * Version 5 alters the zone_boot system call, and converts its old
422 * bootargs parameter to be set by the zone_setattr API instead.
423 * Version 6 adds the flag argument to zone_create.
424 */
425 static const int ZONE_SYSCALL_API_VERSION = 6;
426
427 /*
428 * Certain filesystems (such as NFS and autofs) need to know which zone
429 * the mount is being placed in. Because of this, we need to be able to
430 * ensure that a zone isn't in the process of being created/destroyed such
431 * that nfs_mount() thinks it is in the global/NGZ zone, while by the time
432 * it gets added the list of mounted zones, it ends up on the wrong zone's
433 * mount list. Since a zone can't reside on an NFS file system, we don't
434 * have to worry about the zonepath itself.
435 *
436 * The following functions: block_mounts()/resume_mounts() and
437 * mount_in_progress()/mount_completed() are used by zones and the VFS
438 * layer (respectively) to synchronize zone state transitions and new
439 * mounts within a zone. This syncronization is on a per-zone basis, so
440 * activity for one zone will not interfere with activity for another zone.
441 *
442 * The semantics are like a reader-reader lock such that there may
443 * either be multiple mounts (or zone state transitions, if that weren't
444 * serialized by zonehash_lock) in progress at the same time, but not
445 * both.
1362
1363 ASSERT(MUTEX_HELD(&p->p_lock));
1364 ASSERT(e->rcep_t == RCENTITY_ZONE);
1365
1366 if (zone == NULL)
1367 return (0);
1368
1369 /*
1370 * set cap to the new value.
1371 */
1372 return (cpucaps_zone_set(zone, nv));
1373 }
1374
1375 static rctl_ops_t zone_cpu_cap_ops = {
1376 rcop_no_action,
1377 zone_cpu_cap_get,
1378 zone_cpu_cap_set,
1379 rcop_no_test
1380 };
1381
1382 /*
1383 * zone.zfs-io-pri resource control support (IO priority).
1384 */
1385 /*ARGSUSED*/
1386 static rctl_qty_t
1387 zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p)
1388 {
1389 ASSERT(MUTEX_HELD(&p->p_lock));
1390 return (p->p_zone->zone_zfs_io_pri);
1391 }
1392
1393 /*ARGSUSED*/
1394 static int
1395 zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1396 rctl_qty_t nv)
1397 {
1398 zone_t *zone = e->rcep_p.zone;
1399
1400 ASSERT(MUTEX_HELD(&p->p_lock));
1401 ASSERT(e->rcep_t == RCENTITY_ZONE);
1402
1896 zk->zk_usage.value.ui64 = zone->zone_nprocs;
1897 zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1898 return (0);
1899 }
1900
1901 static int
1902 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1903 {
1904 zone_t *zone = ksp->ks_private;
1905 zone_kstat_t *zk = ksp->ks_data;
1906
1907 if (rw == KSTAT_WRITE)
1908 return (EACCES);
1909
1910 zk->zk_usage.value.ui64 = zone->zone_max_swap;
1911 zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1912 return (0);
1913 }
1914
1915 static kstat_t *
1916 zone_kstat_create_common(zone_t *zone, char *name,
1917 int (*updatefunc) (kstat_t *, int))
1918 {
1919 kstat_t *ksp;
1920 zone_kstat_t *zk;
1921
1922 ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1923 sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1924 KSTAT_FLAG_VIRTUAL);
1925
1926 if (ksp == NULL)
1927 return (NULL);
1928
1929 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1930 ksp->ks_data_size += strlen(zone->zone_name) + 1;
1931 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1932 kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1933 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1934 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1935 ksp->ks_update = updatefunc;
1936 ksp->ks_private = zone;
2125 kstat_t *ksp;
2126 zone_mcap_kstat_t *zmp;
2127
2128 if ((ksp = kstat_create_zone("memory_cap", zone->zone_id,
2129 zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED,
2130 sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t),
2131 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2132 return (NULL);
2133
2134 if (zone->zone_id != GLOBAL_ZONEID)
2135 kstat_zone_add(ksp, GLOBAL_ZONEID);
2136
2137 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
2138 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2139 ksp->ks_lock = &zone->zone_mcap_lock;
2140 zone->zone_mcap_stats = zmp;
2141
2142 /* The kstat "name" field is not large enough for a full zonename */
2143 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2144 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2145 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2146 kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
2147 kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
2148 kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
2149 kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
2150 kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
2151 kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
2152 kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
2153 kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
2154 kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
2155 kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
2156 kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
2157 KSTAT_DATA_UINT64);
2158 kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
2159 KSTAT_DATA_UINT64);
2160 kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
2161 KSTAT_DATA_UINT64);
2162
2163 ksp->ks_update = zone_mcap_kstat_update;
2164 ksp->ks_private = zone;
2165
2179
2180 tmp = zone->zone_utime;
2181 scalehrtime(&tmp);
2182 zmp->zm_utime.value.ui64 = tmp;
2183 tmp = zone->zone_stime;
2184 scalehrtime(&tmp);
2185 zmp->zm_stime.value.ui64 = tmp;
2186 tmp = zone->zone_wtime;
2187 scalehrtime(&tmp);
2188 zmp->zm_wtime.value.ui64 = tmp;
2189
2190 zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
2191 zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
2192 zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2];
2193
2194 zmp->zm_ffcap.value.ui32 = zone->zone_ffcap;
2195 zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc;
2196 zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem;
2197 zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc;
2198
2199 zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp;
2200
2201 zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid;
2202 zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time;
2203
2204 return (0);
2205 }
2206
2207 static kstat_t *
2208 zone_misc_kstat_create(zone_t *zone)
2209 {
2210 kstat_t *ksp;
2211 zone_misc_kstat_t *zmp;
2212
2213 if ((ksp = kstat_create_zone("zones", zone->zone_id,
2214 zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED,
2215 sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t),
2216 KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
2217 return (NULL);
2218
2222 zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP);
2223 ksp->ks_data_size += strlen(zone->zone_name) + 1;
2224 ksp->ks_lock = &zone->zone_misc_lock;
2225 zone->zone_misc_stats = zmp;
2226
2227 /* The kstat "name" field is not large enough for a full zonename */
2228 kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
2229 kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
2230 kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64);
2231 kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64);
2232 kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64);
2233 kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32);
2234 kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32);
2235 kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min",
2236 KSTAT_DATA_UINT32);
2237 kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32);
2238 kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc",
2239 KSTAT_DATA_UINT32);
2240 kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32);
2241 kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32);
2242 kstat_named_init(&zmp->zm_nested_intp, "nested_interp",
2243 KSTAT_DATA_UINT32);
2244 kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32);
2245 kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64);
2246
2247 ksp->ks_update = zone_misc_kstat_update;
2248 ksp->ks_private = zone;
2249
2250 kstat_install(ksp);
2251 return (ksp);
2252 }
2253
2254 static void
2255 zone_kstat_create(zone_t *zone)
2256 {
2257 zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2258 "lockedmem", zone_lockedmem_kstat_update);
2259 zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2260 "swapresv", zone_swapresv_kstat_update);
2261 zone->zone_physmem_kstat = zone_kstat_create_common(zone,
2262 "physicalmem", zone_physmem_kstat_update);
2263 zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2264 "nprocs", zone_nprocs_kstat_update);
2265
2266 if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
2267 zone->zone_vfs_stats = kmem_zalloc(
2268 sizeof (zone_vfs_kstat_t), KM_SLEEP);
2269 }
2270
2271 if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2272 zone->zone_mcap_stats = kmem_zalloc(
2273 sizeof (zone_mcap_kstat_t), KM_SLEEP);
2274 }
2275
2276 if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2277 zone->zone_misc_stats = kmem_zalloc(
2278 sizeof (zone_misc_kstat_t), KM_SLEEP);
2279 }
2280
2281 }
2282
2283 static void
2284 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2285 {
2286 void *data;
2287
2288 if (*pkstat != NULL) {
2289 data = (*pkstat)->ks_data;
2290 kstat_delete(*pkstat);
2291 kmem_free(data, datasz);
2292 *pkstat = NULL;
2293 }
2294 }
2295
2296 static void
2297 zone_kstat_delete(zone_t *zone)
2298 {
2299 zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2300 sizeof (zone_kstat_t));
2301 zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2302 sizeof (zone_kstat_t));
2303 zone_kstat_delete_common(&zone->zone_physmem_kstat,
2304 sizeof (zone_kstat_t));
2305 zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2306 sizeof (zone_kstat_t));
2307
2308 zone_kstat_delete_common(&zone->zone_vfs_ksp,
2309 sizeof (zone_vfs_kstat_t));
2310 zone_kstat_delete_common(&zone->zone_mcap_ksp,
2311 sizeof (zone_mcap_kstat_t));
2312 zone_kstat_delete_common(&zone->zone_misc_ksp,
2313 sizeof (zone_misc_kstat_t));
2314
2315 }
2316
2317 /*
2318 * Called very early on in boot to initialize the ZSD list so that
2319 * zone_key_create() can be called before zone_init(). It also initializes
2320 * portions of zone0 which may be used before zone_init() is called. The
2321 * variable "global_zone" will be set when zone0 is fully initialized by
2322 * zone_init().
2323 */
2324 void
2325 zone_zsd_init(void)
2326 {
2327 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
2328 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
2329 list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
2330 offsetof(struct zsd_entry, zsd_linkage));
2331 list_create(&zone_active, sizeof (zone_t),
2332 offsetof(zone_t, zone_linkage));
2333 list_create(&zone_deathrow, sizeof (zone_t),
2334 offsetof(zone_t, zone_linkage));
2356 zone0.zone_name = GLOBAL_ZONENAME;
2357 zone0.zone_nodename = utsname.nodename;
2358 zone0.zone_domain = srpc_domain;
2359 zone0.zone_hostid = HW_INVALID_HOSTID;
2360 zone0.zone_fs_allowed = NULL;
2361 zone0.zone_ref = 1;
2362 zone0.zone_id = GLOBAL_ZONEID;
2363 zone0.zone_status = ZONE_IS_RUNNING;
2364 zone0.zone_rootpath = "/";
2365 zone0.zone_rootpathlen = 2;
2366 zone0.zone_psetid = ZONE_PS_INVAL;
2367 zone0.zone_ncpus = 0;
2368 zone0.zone_ncpus_online = 0;
2369 zone0.zone_proc_initpid = 1;
2370 zone0.zone_initname = initname;
2371 zone0.zone_lockedmem_kstat = NULL;
2372 zone0.zone_swapresv_kstat = NULL;
2373 zone0.zone_physmem_kstat = NULL;
2374 zone0.zone_nprocs_kstat = NULL;
2375 zone0.zone_zfs_io_pri = 1;
2376
2377 zone0.zone_stime = 0;
2378 zone0.zone_utime = 0;
2379 zone0.zone_wtime = 0;
2380
2381 list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2382 offsetof(zone_ref_t, zref_linkage));
2383 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
2384 offsetof(struct zsd_entry, zsd_linkage));
2385 list_insert_head(&zone_active, &zone0);
2386
2387 /*
2388 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
2389 * to anything meaningful. It is assigned to be 'rootdir' in
2390 * vfs_mountroot().
2391 */
2392 zone0.zone_rootvp = NULL;
2393 zone0.zone_vfslist = NULL;
2394 zone0.zone_bootargs = initargs;
2395 zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2396 /*
2467 /*
2468 * Create ID space for zone IDs. ID 0 is reserved for the
2469 * global zone.
2470 */
2471 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
2472
2473 /*
2474 * Initialize generic zone resource controls, if any.
2475 */
2476 rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
2477 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2478 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2479 FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
2480
2481 rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2482 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2483 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2484 RCTL_GLOBAL_INFINITE,
2485 MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2486
2487 rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2488 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2489 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2490 16384, 16384, &zone_zfs_io_pri_ops);
2491
2492 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2493 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2494 INT_MAX, INT_MAX, &zone_lwps_ops);
2495
2496 rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2497 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2498 INT_MAX, INT_MAX, &zone_procs_ops);
2499
2500 /*
2501 * System V IPC resource controls
2502 */
2503 rc_zone_msgmni = rctl_register("zone.max-msg-ids",
2504 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2505 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
2506
2513 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
2514
2515 rc_zone_shmmax = rctl_register("zone.max-shm-memory",
2516 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
2517 RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
2518
2519 /*
2520 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
2521 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
2522 */
2523 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2524 bzero(dval, sizeof (rctl_val_t));
2525 dval->rcv_value = 1;
2526 dval->rcv_privilege = RCPRIV_PRIVILEGED;
2527 dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
2528 dval->rcv_action_recip_pid = -1;
2529
2530 rde = rctl_dict_lookup("zone.cpu-shares");
2531 (void) rctl_val_list_insert(&rde->rcd_default_value, dval);
2532
2533 rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2534 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2535 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2536 &zone_locked_mem_ops);
2537
2538 rc_zone_max_swap = rctl_register("zone.max-swap",
2539 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2540 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2541 &zone_max_swap_ops);
2542
2543 rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
2544 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2545 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2546 &zone_phys_mem_ops);
2547
2548 rc_zone_max_lofi = rctl_register("zone.max-lofi",
2549 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2550 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2551 &zone_max_lofi_ops);
2552
2717 if (zone->zone_pfexecd != NULL)
2718 klpd_freelist(&zone->zone_pfexecd);
2719 id_free(zoneid_space, zone->zone_id);
2720 mutex_destroy(&zone->zone_lock);
2721 cv_destroy(&zone->zone_cv);
2722 rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2723 rw_destroy(&zone->zone_mntfs_db_lock);
2724 kmem_free(zone, sizeof (zone_t));
2725 }
2726
2727 /*
2728 * See block comment at the top of this file for information about zone
2729 * status values.
2730 */
2731 /*
2732 * Convenience function for setting zone status.
2733 */
2734 static void
2735 zone_status_set(zone_t *zone, zone_status_t status)
2736 {
2737
2738 nvlist_t *nvl = NULL;
2739 ASSERT(MUTEX_HELD(&zone_status_lock));
2740 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2741 status >= zone_status_get(zone));
2742
2743 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2744 nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2745 nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2746 zone_status_table[status]) ||
2747 nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2748 zone_status_table[zone->zone_status]) ||
2749 nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2750 nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2751 sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2752 ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2753 #ifdef DEBUG
2754 (void) printf(
2755 "Failed to allocate and send zone state change event.\n");
2756 #endif
2757 }
2758 nvlist_free(nvl);
2759
2760 zone->zone_status = status;
2761
2762 cv_broadcast(&zone->zone_cv);
2763 }
2764
2765 /*
2766 * Public function to retrieve the zone status. The zone status may
2767 * change after it is retrieved.
2768 */
2769 zone_status_t
2770 zone_status_get(zone_t *zone)
3331 }
3332
3333 ASSERT(refcnt == 0);
3334 /*
3335 * zsched has exited; the zone is dead.
3336 */
3337 zone->zone_zsched = NULL; /* paranoia */
3338 mutex_enter(&zone_status_lock);
3339 zone_status_set(zone, ZONE_IS_DEAD);
3340 out:
3341 mutex_exit(&zone_status_lock);
3342 zone_rele(zone);
3343 }
3344
3345 zoneid_t
3346 getzoneid(void)
3347 {
3348 return (curproc->p_zone->zone_id);
3349 }
3350
3351 /*
3352 * Internal versions of zone_find_by_*(). These don't zone_hold() or
3353 * check the validity of a zone's state.
3354 */
3355 static zone_t *
3356 zone_find_all_by_id(zoneid_t zoneid)
3357 {
3358 mod_hash_val_t hv;
3359 zone_t *zone = NULL;
3360
3361 ASSERT(MUTEX_HELD(&zonehash_lock));
3362
3363 if (mod_hash_find(zonehashbyid,
3364 (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
3365 zone = (zone_t *)hv;
3366 return (zone);
3367 }
3368
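/*
 * Illustrative sketch (assumed shape, not the original body): per the
 * comment above, a public zone_find_by_id() built on this helper is
 * expected to take zonehash_lock, validate the zone's state, and place
 * a hold before returning, roughly:
 *
 *	mutex_enter(&zonehash_lock);
 *	zone = zone_find_all_by_id(zoneid);
 *	if (zone != NULL) {
 *		... check that zone_status_get(zone) is acceptable ...
 *		zone_hold(zone);
 *	}
 *	mutex_exit(&zonehash_lock);
 */
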
3369 static zone_t *
3370 zone_find_all_by_label(const ts_label_t *label)
4675 ASSERT(error == 0);
4676 for (i = 0; i < nelem; i++) {
4677 if ((error = nvlist2rctlval(nvlarray[i], &rv)) != 0)
4678 goto out;
4679 }
4680 if (rctl_invalid_value(rde, &rv)) {
4681 error = EINVAL;
4682 goto out;
4683 }
4684 }
4685 error = 0;
4686 *nvlp = nvl;
4687 out:
4688 kmem_free(kbuf, buflen);
4689 if (error && nvl != NULL)
4690 nvlist_free(nvl);
4691 return (error);
4692 }
4693
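/*
 * Illustrative note (assumption, not from the original source): the
 * buffer parsed above appears to be a packed nvlist in which each pair
 * is named "zone.<control>" and holds an nvlist array of resource
 * control values; each element is converted via nvlist2rctlval() and
 * validated against the rctl dictionary entry, conceptually:
 *
 *	"zone.cpu-shares" -> [ { value fields understood by
 *	    nvlist2rctlval() }, ... ]
 *
 * The exact field names inside each value nvlist are not shown here and
 * are left unspecified.
 */
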
4694 int
4695 zone_create_error(int er_error, int er_ext, int *er_out) {
4696 if (er_out != NULL) {
4697 if (copyout(&er_ext, er_out, sizeof (int))) {
4698 return (set_errno(EFAULT));
4699 }
4700 }
4701 return (set_errno(er_error));
4702 }
4703
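/*
 * Illustrative usage (mirrors the call made later in zone_create()):
 *
 *	return (zone_create_error(ENOTSUP, ZE_CHROOTED, extended_error));
 *
 * This copies the extended code out to the user's buffer when one was
 * supplied and then returns set_errno() of the primary error.
 */
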
4704 static int
4705 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
4706 {
4707 ts_label_t *tsl;
4708 bslabel_t blab;
4709
4710 /* Get label from user */
4711 if (copyin(lab, &blab, sizeof (blab)) != 0)
4712 return (EFAULT);
4713 tsl = labelalloc(&blab, doi, KM_NOSLEEP);
4714 if (tsl == NULL)
4715 return (ENOMEM);
4765
4766 kmem_free(kbuf, buflen);
4767 return (0);
4768 }
4769
4770 /*
4771  * System call to create/initialize a new zone named 'zone_name', rooted
4772  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
4773  * initialized with the zone-wide rctls described in 'rctlbuf', and
4774  * labeled according to 'match', 'doi', and 'label'.
4775  *
4776  * If 'extended_error' is non-NULL, we may use it to return more detailed
4777  * error information.
4778 */
4779 static zoneid_t
4780 zone_create(const char *zone_name, const char *zone_root,
4781 const priv_set_t *zone_privs, size_t zone_privssz,
4782 caddr_t rctlbuf, size_t rctlbufsz,
4783 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
4784 int match, uint32_t doi, const bslabel_t *label,
4785 int flags)
4786 {
4787 struct zsched_arg zarg;
4788 nvlist_t *rctls = NULL;
4789 proc_t *pp = curproc;
4790 zone_t *zone, *ztmp;
4791 zoneid_t zoneid;
4792 int error;
4793 int error2 = 0;
4794 char *str;
4795 cred_t *zkcr;
4796 boolean_t insert_label_hash;
4797
4798 if (secpolicy_zone_config(CRED()) != 0)
4799 return (set_errno(EPERM));
4800
4801 /* can't boot zone from within chroot environment */
4802 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
4803 return (zone_create_error(ENOTSUP, ZE_CHROOTED,
4804 extended_error));
4805
4806 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4807 zoneid = zone->zone_id = id_alloc(zoneid_space);
4808 zone->zone_status = ZONE_IS_UNINITIALIZED;
4809 zone->zone_pool = pool_default;
4810 zone->zone_pool_mod = gethrtime();
4811 zone->zone_psetid = ZONE_PS_INVAL;
4812 zone->zone_ncpus = 0;
4813 zone->zone_ncpus_online = 0;
4814 zone->zone_restart_init = B_TRUE;
4815 zone->zone_reboot_on_init_exit = B_FALSE;
4816 zone->zone_init_status = -1;
4817 zone->zone_brand = &native_brand;
4818 zone->zone_initname = NULL;
4819 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4820 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4821 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4822 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4823 list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4824 offsetof(zone_ref_t, zref_linkage));
4825 list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4826 offsetof(struct zsd_entry, zsd_linkage));
4827 list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
5160 return (set_errno(EINTR));
5161 }
5162
5163 /*
5164 * Boot (starting init) might have failed, in which case the zone
5165 * will go to the SHUTTING_DOWN state; an appropriate errno will
5166 * be placed in zone->zone_boot_err, and so we return that.
5167 */
5168 err = zone->zone_boot_err;
5169 zone_rele(zone);
5170 return (err ? set_errno(err) : 0);
5171 }
5172
5173 /*
5174 * Kills all user processes in the zone, waiting for them all to exit
5175 * before returning.
5176 */
5177 static int
5178 zone_empty(zone_t *zone)
5179 {
5180 int waitstatus;
5181
5182 /*
5183 * We need to drop zonehash_lock before killing all
5184 * processes, otherwise we'll deadlock with zone_find_*
5185 * which can be called from the exit path.
5186 */
5187 ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
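/*
 * Note (inferred from the loop below, not in the original source):
 * zone_status_timedwait_sig() appears to return -1 when the timeout
 * expires before the zone reaches ZONE_IS_EMPTY (prompting another
 * killall()), and 0 when the wait is interrupted by a signal.
 */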
5188 while ((waitstatus = zone_status_timedwait_sig(zone,
5189 ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
5190 killall(zone->zone_id);
5191 }
5192 /*
5193 * return EINTR if we were signaled
5194 */
5195 if (waitstatus == 0)
5196 return (EINTR);
5197 return (0);
5198 }
5199
5200 /*
5201 * This function implements the policy for zone visibility.
5202 *
5203 * In standard Solaris, a non-global zone can only see itself.
5204 *
5205  * In Trusted Extensions, a labeled zone can look up any zone whose label
5206 * it dominates. For this test, the label of the global zone is treated as
5207 * admin_high so it is special-cased instead of being checked for dominance.
5208 *
5209 * Returns true if zone attributes are viewable, false otherwise.
5210 */
5211 static boolean_t
5975 size = strlen(outstr) + 1;
5976 if (bufsize > size)
5977 bufsize = size;
5978 if (buf != NULL) {
5979 err = copyoutstr(outstr, buf, bufsize, NULL);
5980 if (err != 0 && err != ENAMETOOLONG)
5981 error = EFAULT;
5982 }
5983 break;
5984 case ZONE_ATTR_NETWORK:
5985 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5986 if (copyin(buf, zbuf, bufsize) != 0) {
5987 error = EFAULT;
5988 } else {
5989 error = zone_get_network(zoneid, zbuf);
5990 if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5991 error = EFAULT;
5992 }
5993 kmem_free(zbuf, bufsize);
5994 break;
5995 case ZONE_ATTR_SCHED_FIXEDHI:
5996 size = sizeof (boolean_t);
5997 if (bufsize > size)
5998 bufsize = size;
5999
6000 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
6001 bufsize) != 0)
6002 error = EFAULT;
6003 break;
6004 default:
6005 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
6006 size = bufsize;
6007 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
6008 } else {
6009 error = EINVAL;
6010 }
6011 }
6012 zone_rele(zone);
6013
6014 if (error)
6807 return (set_errno(EFAULT));
6808 }
6809 zs.zone_name =
6810 (const char *)(unsigned long)zs32.zone_name;
6811 zs.zone_root =
6812 (const char *)(unsigned long)zs32.zone_root;
6813 zs.zone_privs =
6814 (const struct priv_set *)
6815 (unsigned long)zs32.zone_privs;
6816 zs.zone_privssz = zs32.zone_privssz;
6817 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
6818 zs.rctlbufsz = zs32.rctlbufsz;
6819 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
6820 zs.zfsbufsz = zs32.zfsbufsz;
6821 zs.extended_error =
6822 (int *)(unsigned long)zs32.extended_error;
6823 zs.match = zs32.match;
6824 zs.doi = zs32.doi;
6825 zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
6826 zs.flags = zs32.flags;
6827 #else
6828 panic("get_udatamodel() returned bogus result\n");
6829 #endif
6830 }
6831
6832 return (zone_create(zs.zone_name, zs.zone_root,
6833 zs.zone_privs, zs.zone_privssz,
6834 (caddr_t)zs.rctlbuf, zs.rctlbufsz,
6835 (caddr_t)zs.zfsbuf, zs.zfsbufsz,
6836 zs.extended_error, zs.match, zs.doi,
6837 zs.label, zs.flags));
6838 case ZONE_BOOT:
6839 return (zone_boot((zoneid_t)(uintptr_t)arg1));
6840 case ZONE_DESTROY:
6841 return (zone_destroy((zoneid_t)(uintptr_t)arg1));
6842 case ZONE_GETATTR:
6843 return (zone_getattr((zoneid_t)(uintptr_t)arg1,
6844 (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6845 case ZONE_SETATTR:
6846 return (zone_setattr((zoneid_t)(uintptr_t)arg1,
6847 (int)(uintptr_t)arg2, arg3, (size_t)arg4));
6848 case ZONE_ENTER:
6849 return (zone_enter((zoneid_t)(uintptr_t)arg1));
6850 case ZONE_LIST:
6851 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
6852 case ZONE_SHUTDOWN:
6853 return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
6854 case ZONE_LOOKUP:
6855 return (zone_lookup((const char *)arg1));
6856 case ZONE_VERSION:
6857 return (zone_version((int *)arg1));
7096 */
7097 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
7098 if (zone_status_get(zone) > ZONE_IS_RUNNING) {
7099 /*
7100 * This zone is already on its way down.
7101 */
7102 mutex_exit(&zone_status_lock);
7103 return (0);
7104 }
7105 /*
7106 * Prevent future zone_enter()s
7107 */
7108 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
7109 mutex_exit(&zone_status_lock);
7110
7111 /*
7112  * Kill everyone now and call zoneadmd later;
7113  * zone_ki_call_zoneadmd() will do a more thorough job of this.
7115 */
7116 killall(zone->zone_id);
7117 /*
7118 * Now, create the thread to contact zoneadmd and do the rest of the
7119  * work. This thread can't be created in our zone; otherwise
7120 * zone_destroy() would deadlock.
7121 */
7122 zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
7123 zargp->arg.cmd = zcmd;
7124 zargp->arg.uniqid = zone->zone_uniqid;
7125 zargp->zone = zone;
7126 (void) strcpy(zargp->arg.locale, "C");
7127 /* mdep was already copied in for us by uadmin */
7128 if (mdep != NULL)
7129 (void) strlcpy(zargp->arg.bootbuf, mdep,
7130 sizeof (zargp->arg.bootbuf));
7131 zone_hold(zone);
7132
7133 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
7134 TS_RUN, minclsyspri);
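/*
 * Note (inferred, not in the original source): the calling process lives
 * inside the zone being brought down, so it exits here instead of
 * returning; the thread created above in p0 carries on and contacts
 * zoneadmd on the zone's behalf.
 */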
7135 exit(CLD_EXITED, 0);
7136