Remove most KEBE comments and accompanying unused code or variables/fields.
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Mismerge in zone.c frees stats that aren't there.
Mismerge in zone.c create zone mcap kstats too many times
OS-338 Kstat counters to show "slow" VFS operations
OS-5189 lx dev enumeration can deadlock with zfs
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5187 improve /proc/diskstat handling
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5179 flatten zvol entries for /dev and /proc/partitions
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Undo merge damage from zone kstats
OS-4915 want FX high priority zone configuration option
OS-4925 ps pri shows misleading value for zone in RT scheduling class
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4017 would like zfs-io-priority values > 1024
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3776 project rctls should be in sync with zone rctls
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
OS-11 rcapd behaves poorly when under extreme load
OS-399 zone phys. mem. cap should be a rctl and have associated kstat

*** 19,29 ****
   * CDDL HEADER END
   */
  
  /*
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2015, Joyent Inc. All rights reserved.
   */
  
  /*
   * Zones
   *
--- 19,29 ----
   * CDDL HEADER END
   */
  
  /*
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2016, Joyent Inc.
   */
  
  /*
   * Zones
   *
*** 248,257 ****
--- 248,259 ----
  #include <sys/zone.h>
  #include <net/if.h>
  #include <sys/cpucaps.h>
  #include <vm/seg.h>
  #include <sys/mac.h>
+ #include <sys/rt.h>
+ #include <sys/fx.h>
  
  /*
   * This constant specifies the number of seconds that threads waiting for
   * subsystems to release a zone's general-purpose references will wait before
   * they log the zone's reference counts. The constant's value shouldn't
*** 368,377 ****
--- 370,380 ----
   * This isn't static so lint doesn't complain.
   */
  rctl_hndl_t rc_zone_cpu_shares;
  rctl_hndl_t rc_zone_locked_mem;
  rctl_hndl_t rc_zone_max_swap;
+ rctl_hndl_t rc_zone_phys_mem;
  rctl_hndl_t rc_zone_max_lofi;
  rctl_hndl_t rc_zone_cpu_cap;
  rctl_hndl_t rc_zone_zfs_io_pri;
  rctl_hndl_t rc_zone_nlwps;
  rctl_hndl_t rc_zone_nprocs;
*** 1740,1749 ****
--- 1743,1785 ----
      zone_max_swap_test
  };
  
  /*ARGSUSED*/
  static rctl_qty_t
+ zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+ {
+     rctl_qty_t q;
+     zone_t *z = p->p_zone;
+ 
+     ASSERT(MUTEX_HELD(&p->p_lock));
+     /* No additional lock because not enforced in the kernel */
+     q = z->zone_phys_mem;
+     return (q);
+ }
+ 
+ /*ARGSUSED*/
+ static int
+ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+     rctl_qty_t nv)
+ {
+     ASSERT(MUTEX_HELD(&p->p_lock));
+     ASSERT(e->rcep_t == RCENTITY_ZONE);
+     if (e->rcep_p.zone == NULL)
+         return (0);
+     e->rcep_p.zone->zone_phys_mem_ctl = nv;
+     return (0);
+ }
+ 
+ static rctl_ops_t zone_phys_mem_ops = {
+     rcop_no_action,
+     zone_phys_mem_usage,
+     zone_phys_mem_set,
+     rcop_no_test
+ };
+ 
+ /*ARGSUSED*/
+ static rctl_qty_t
  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
  {
      rctl_qty_t q;
      zone_t *z = p->p_zone;
*** 1833,1842 ****
--- 1869,1892 ----
      zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
      return (0);
  }
  
  static int
+ zone_physmem_kstat_update(kstat_t *ksp, int rw)
+ {
+     zone_t *zone = ksp->ks_private;
+     zone_kstat_t *zk = ksp->ks_data;
+ 
+     if (rw == KSTAT_WRITE)
+         return (EACCES);
+ 
+     zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+     zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+     return (0);
+ }
+ 
+ static int
  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
  {
      zone_t *zone = ksp->ks_private;
      zone_kstat_t *zk = ksp->ks_data;
*** 1886,1910 ****
--- 1936,2122 ----
      ksp->ks_private = zone;
      kstat_install(ksp);
      return (ksp);
  }
  
+ static int
+ zone_vfs_kstat_update(kstat_t *ksp, int rw)
+ {
+     zone_t *zone = ksp->ks_private;
+     zone_vfs_kstat_t *zvp = ksp->ks_data;
+     kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+ 
+     if (rw == KSTAT_WRITE)
+         return (EACCES);
+ 
+     /*
+      * Extract the VFS statistics from the kstat_io_t structure used by
+      * kstat_runq_enter() and related functions. Since the slow ops
+      * counters are updated directly by the VFS layer, there's no need to
+      * copy those statistics here.
+      *
+      * Note that kstat_runq_enter() and the related functions use
+      * gethrtime_unscaled(), so scale the time here.
+      */
+     zvp->zv_nread.value.ui64 = kiop->nread;
+     zvp->zv_reads.value.ui64 = kiop->reads;
+     zvp->zv_rtime.value.ui64 = kiop->rtime;
+     zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+     zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+     zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+     zvp->zv_writes.value.ui64 = kiop->writes;
+     zvp->zv_wtime.value.ui64 = kiop->wtime;
+     zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+     zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+ 
+     scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+     scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+     scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+     scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+ 
+     return (0);
+ }
+ 
+ static kstat_t *
+ zone_vfs_kstat_create(zone_t *zone)
+ {
+     kstat_t *ksp;
+     zone_vfs_kstat_t *zvp;
+ 
+     if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+         zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+         sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+         KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+         return (NULL);
+ 
+     if (zone->zone_id != GLOBAL_ZONEID)
+         kstat_zone_add(ksp, GLOBAL_ZONEID);
+ 
+     zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+     ksp->ks_data_size += strlen(zone->zone_name) + 1;
+     ksp->ks_lock = &zone->zone_vfs_lock;
+     zone->zone_vfs_stats = zvp;
+ 
+     /* The kstat "name" field is not large enough for a full zonename */
+     kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+     kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+     kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+     kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+ 
+     ksp->ks_update = zone_vfs_kstat_update;
+     ksp->ks_private = zone;
+ 
+     kstat_install(ksp);
+     return (ksp);
+ }
+ 
+ static int
+ zone_zfs_kstat_update(kstat_t *ksp, int rw)
+ {
+     zone_t *zone = ksp->ks_private;
+     zone_zfs_kstat_t *zzp = ksp->ks_data;
+     kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+ 
+     if (rw == KSTAT_WRITE)
+         return (EACCES);
+ 
+     /*
+      * Extract the ZFS statistics from the kstat_io_t structure used by
+      * kstat_runq_enter() and related functions. Since the I/O throttle
+      * counters are updated directly by the ZFS layer, there's no need to
+      * copy those statistics here.
+      *
+      * Note that kstat_runq_enter() and the related functions use
+      * gethrtime_unscaled(), so scale the time here.
+      */
+     zzp->zz_nread.value.ui64 = kiop->nread;
+     zzp->zz_reads.value.ui64 = kiop->reads;
+     zzp->zz_rtime.value.ui64 = kiop->rtime;
+     zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+     zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+     zzp->zz_writes.value.ui64 = kiop->writes;
+ 
+     scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+     scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+ 
+     return (0);
+ }
+ 
+ static kstat_t *
+ zone_zfs_kstat_create(zone_t *zone)
+ {
+     kstat_t *ksp;
+     zone_zfs_kstat_t *zzp;
+ 
+     if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+         zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+         sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+         KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+         return (NULL);
+ 
+     if (zone->zone_id != GLOBAL_ZONEID)
+         kstat_zone_add(ksp, GLOBAL_ZONEID);
+ 
+     zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+     ksp->ks_data_size += strlen(zone->zone_name) + 1;
+     ksp->ks_lock = &zone->zone_zfs_lock;
+     zone->zone_zfs_stats = zzp;
+ 
+     /* The kstat "name" field is not large enough for a full zonename */
+     kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+     kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+     kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+     kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+     kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+     kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+     kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+     kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+     kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+ 
+     ksp->ks_update = zone_zfs_kstat_update;
+     ksp->ks_private = zone;
+ 
+     kstat_install(ksp);
+     return (ksp);
+ }
+ 
  static int
  zone_mcap_kstat_update(kstat_t *ksp, int rw)
  {
      zone_t *zone = ksp->ks_private;
      zone_mcap_kstat_t *zmp = ksp->ks_data;
  
      if (rw == KSTAT_WRITE)
          return (EACCES);
  
+     zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+     zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+     zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+     zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+     zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+     zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
      zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
      zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
      zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
      zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
      zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+     zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+     zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
  
      return (0);
  }
  
  static kstat_t *
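The zone_vfs and zone_zfs kstats above are ordinary named kstats, so any libkstat consumer can read them. A minimal userland sketch (illustration only, not part of this change; error handling abbreviated) that prints one of the OS-338 slow-operation counters for a zone named on the command line:

    /* Compile with -lkstat. */
    #include <stdio.h>
    #include <kstat.h>

    int
    main(int argc, char *argv[])
    {
            kstat_ctl_t *kc;
            kstat_t *ksp;
            kstat_named_t *kn;

            if (argc != 2 || (kc = kstat_open()) == NULL)
                    return (1);

            /* Instance -1 matches any; the kstat is named after the zone. */
            if ((ksp = kstat_lookup(kc, "zone_vfs", -1, argv[1])) == NULL ||
                kstat_read(kc, ksp, NULL) == -1) {
                    (void) kstat_close(kc);
                    return (1);
            }
            if ((kn = kstat_data_lookup(ksp, "1s_ops")) != NULL)
                    (void) printf("VFS ops slower than 1s: %llu\n",
                        (unsigned long long)kn->value.ui64);
            (void) kstat_close(kc);
            return (0);
    }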
*** 1928,1943 ****
--- 2140,2166 ----
      zone->zone_mcap_stats = zmp;
  
      /* The kstat "name" field is not large enough for a full zonename */
      kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
      kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+     kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+     kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
      kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
      kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
      kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
      kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
      kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
          KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+         KSTAT_DATA_UINT64);
+     kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+         KSTAT_DATA_UINT64);
  
      ksp->ks_update = zone_mcap_kstat_update;
      ksp->ks_private = zone;
  
      kstat_install(ksp);
*** 2033,2054 ****
--- 2256,2285 ----
  {
      zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
          "lockedmem", zone_lockedmem_kstat_update);
      zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
          "swapresv", zone_swapresv_kstat_update);
+     zone->zone_physmem_kstat = zone_kstat_create_common(zone,
+         "physicalmem", zone_physmem_kstat_update);
      zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
          "nprocs", zone_nprocs_kstat_update);
  
+     if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+         zone->zone_vfs_stats = kmem_zalloc(
+             sizeof (zone_vfs_kstat_t), KM_SLEEP);
+     }
+ 
      if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
          zone->zone_mcap_stats = kmem_zalloc(
              sizeof (zone_mcap_kstat_t), KM_SLEEP);
      }
  
      if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
          zone->zone_misc_stats = kmem_zalloc(
              sizeof (zone_misc_kstat_t), KM_SLEEP);
      }
+ }
  
  static void
  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
  {
*** 2067,2082 ****
--- 2298,2319 ----
  {
      zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
          sizeof (zone_kstat_t));
      zone_kstat_delete_common(&zone->zone_swapresv_kstat,
          sizeof (zone_kstat_t));
+     zone_kstat_delete_common(&zone->zone_physmem_kstat,
+         sizeof (zone_kstat_t));
      zone_kstat_delete_common(&zone->zone_nprocs_kstat,
          sizeof (zone_kstat_t));
+ 
+     zone_kstat_delete_common(&zone->zone_vfs_ksp,
+         sizeof (zone_vfs_kstat_t));
      zone_kstat_delete_common(&zone->zone_mcap_ksp,
          sizeof (zone_mcap_kstat_t));
      zone_kstat_delete_common(&zone->zone_misc_ksp,
          sizeof (zone_misc_kstat_t));
+ }
  
  /*
   * Called very early on in boot to initialize the ZSD list so that
   * zone_key_create() can be called before zone_init(). It also initializes
*** 2106,2115 ****
--- 2343,2354 ----
      zone0.zone_nprocs_ctl = INT_MAX;
      zone0.zone_locked_mem = 0;
      zone0.zone_locked_mem_ctl = UINT64_MAX;
      ASSERT(zone0.zone_max_swap == 0);
      zone0.zone_max_swap_ctl = UINT64_MAX;
+     zone0.zone_phys_mem = 0;
+     zone0.zone_phys_mem_ctl = UINT64_MAX;
      zone0.zone_max_lofi = 0;
      zone0.zone_max_lofi_ctl = UINT64_MAX;
      zone0.zone_shmmax = 0;
      zone0.zone_ipc.ipcq_shmmni = 0;
      zone0.zone_ipc.ipcq_semmni = 0;
*** 2129,2138 ****
--- 2368,2378 ----
      zone0.zone_ncpus_online = 0;
      zone0.zone_proc_initpid = 1;
      zone0.zone_initname = initname;
      zone0.zone_lockedmem_kstat = NULL;
      zone0.zone_swapresv_kstat = NULL;
+     zone0.zone_physmem_kstat = NULL;
      zone0.zone_nprocs_kstat = NULL;
      zone0.zone_zfs_io_pri = 1;
      zone0.zone_stime = 0;
      zone0.zone_utime = 0;
*** 2245,2255 ****
          MAXCAP, MAXCAP, &zone_cpu_cap_ops);
  
      rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
          RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
          RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
!         1024, 1024, &zone_zfs_io_pri_ops);
  
      rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
          RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
          INT_MAX, INT_MAX, &zone_lwps_ops);
  
--- 2485,2495 ----
          MAXCAP, MAXCAP, &zone_cpu_cap_ops);
  
      rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
          RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
          RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
!         16384, 16384, &zone_zfs_io_pri_ops);
  
      rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
          RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
          INT_MAX, INT_MAX, &zone_lwps_ops);
  
*** 2298,2307 ****
--- 2538,2552 ----
      rc_zone_max_swap = rctl_register("zone.max-swap",
          RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
          RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
          &zone_max_swap_ops);
  
+     rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+         RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+         RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+         &zone_phys_mem_ops);
+ 
      rc_zone_max_lofi = rctl_register("zone.max-lofi",
          RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
          RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
          &zone_max_lofi_ops);
  
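Once registered, the new rctl behaves like any other zone rctl. A hedged sketch of reading the zone.max-physical-memory limit from inside a zone with getrctl(2) and the rctlblk(3C) accessors (illustration only, not part of this change; a global-zone tool would normally consult the mcap kstats above instead):

    #include <stdio.h>
    #include <stdlib.h>
    #include <rctl.h>

    int
    main(void)
    {
            /* rctlblk_t is opaque; allocate it at its advertised size. */
            rctlblk_t *rblk = malloc(rctlblk_size());

            if (rblk == NULL)
                    return (1);
            if (getrctl("zone.max-physical-memory", NULL, rblk,
                RCTL_FIRST) == -1) {
                    perror("getrctl");
                    return (1);
            }
            (void) printf("limit: %llu bytes\n",
                (unsigned long long)rctlblk_get_value(rblk));
            return (0);
    }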
*** 2319,2328 ****
--- 2564,2575 ----
      zone0.zone_nlwps = p0.p_lwpcnt;
      zone0.zone_nprocs = 1;
      zone0.zone_ntasks = 1;
      mutex_exit(&p0.p_lock);
      zone0.zone_restart_init = B_TRUE;
+     zone0.zone_reboot_on_init_exit = B_FALSE;
+     zone0.zone_init_status = -1;
      zone0.zone_brand = &native_brand;
      rctl_prealloc_destroy(gp);
      /*
       * pool_default hasn't been initialized yet, so we let pool_init()
       * take care of making sure the global zone is in the default pool.
*** 2398,2407 ****
--- 2645,2656 ----
  static void
  zone_free(zone_t *zone)
  {
+     zone_dl_t *zdl;
+ 
      ASSERT(zone != global_zone);
      ASSERT(zone->zone_ntasks == 0);
      ASSERT(zone->zone_nlwps == 0);
      ASSERT(zone->zone_nprocs == 0);
      ASSERT(zone->zone_cred_ref == 0);
*** 2426,2435 ****
--- 2675,2697 ----
      }
      list_destroy(&zone->zone_ref_list);
      zone_free_zsd(zone);
      zone_free_datasets(zone);
+ 
+     /*
+      * While dlmgmtd should have removed all of these, it could have left
+      * something behind or crashed. In which case it's not safe for us to
+      * assume that the list is empty which list_destroy() will ASSERT. We
+      * clean up for our userland comrades which may have crashed, or worse,
+      * been disabled by SMF.
+      */
+     while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+         if (zdl->zdl_net != NULL)
+             nvlist_free(zdl->zdl_net);
+         kmem_free(zdl, sizeof (zone_dl_t));
+     }
      list_destroy(&zone->zone_dl_list);
  
      if (zone->zone_rootvp != NULL)
          VN_RELE(zone->zone_rootvp);
      if (zone->zone_rootpath)
*** 2561,2573 ****
          mutex_exit(&zone_status_lock);
          brand_unregister_zone(bp);
          return (EINVAL);
      }
  
!     /* set up the brand specific data */
      zone->zone_brand = bp;
!     ZBROP(zone)->b_init_brand_data(zone);
  
      mutex_exit(&zone_status_lock);
      return (0);
  }
  
--- 2823,2840 ----
          mutex_exit(&zone_status_lock);
          brand_unregister_zone(bp);
          return (EINVAL);
      }
  
!     /*
!      * Set up the brand specific data.
!      * Note that it's possible that the hook has to drop the
!      * zone_status_lock and reaquire it before returning so we can't
!      * assume the lock has been held the entire time.
!      */
      zone->zone_brand = bp;
!     ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
  
      mutex_exit(&zone_status_lock);
      return (0);
  }
  
*** 2609,2631 ****
      zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
      (void) strcpy(zone->zone_initname, initname);
      return (0);
  }
  
  static int
! zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
  {
!     uint64_t mcap;
!     int err = 0;
  
!     if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
!         zone->zone_phys_mcap = mcap;
  
      return (err);
  }
  
  static int
  zone_set_sched_class(zone_t *zone, const char *new_class)
  {
      char sched_class[PC_CLNMSZ];
      id_t classid;
      int err;
--- 2876,2949 ----
      zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
      (void) strcpy(zone->zone_initname, initname);
      return (0);
  }
  
+ /*
+  * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+  * to provide the physical memory capping kstats. Since physical memory
+  * capping is currently implemented in userland, that code uses the setattr
+  * entry point to increment the kstats. We always simply increment nover
+  * every time that setattr is called and we always add in the input value
+  * to zone_mcap_pagedout every time that is called.
+  */
+ /*ARGSUSED*/
  static int
! zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
  {
!     zone->zone_mcap_nover++;
! 
!     return (0);
! }
  
+ static int
+ zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+ {
+     uint64_t pageout;
+     int err;
+ 
+     if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+         zone->zone_mcap_pagedout += pageout;
+ 
      return (err);
  }
  
+ /*
+  * The zone_set_page_fault_delay function is used to set the number of usecs
+  * to throttle page faults. This is normally 0 but can be set to a non-0 value
+  * by the user-land memory capping code when the zone is over its physcial
+  * memory cap.
+  */
  static int
+ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+ {
+     uint32_t dusec;
+     int err;
+ 
+     if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+         zone->zone_pg_flt_delay = dusec;
+ 
+     return (err);
+ }
+ 
+ /*
+  * The zone_set_rss function is used to set the zone's RSS when we do the
+  * fast, approximate calculation in user-land.
+  */
+ static int
+ zone_set_rss(zone_t *zone, const uint64_t *prss)
+ {
+     uint64_t rss;
+     int err;
+ 
+     if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+         zone->zone_phys_mem = rss;
+ 
+     return (err);
+ }
+ 
+ static int
  zone_set_sched_class(zone_t *zone, const char *new_class)
  {
      char sched_class[PC_CLNMSZ];
      id_t classid;
      int err;
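Since enforcement stays in userland, the kernel side above is deliberately simple: it stores whatever the capper reports. A hypothetical sketch of the userland half (the helper name push_mcap_state is invented; the ZONE_ATTR_* constants are the ones this patch series adds to sys/zone.h):

    #include <zone.h>
    #include <sys/zone.h>

    /*
     * Push a freshly computed RSS estimate and page-fault throttle into
     * the kernel, the way a userland memory capper might after each scan.
     */
    static int
    push_mcap_state(zoneid_t zid, uint64_t rss, uint32_t delay_usec)
    {
            if (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) == -1)
                    return (-1);
            return (zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &delay_usec,
                sizeof (delay_usec)));
    }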
*** 3771,3781 ****
--- 4089,4110 ----
       * For all purposes (ZONE_ATTR_INITPID and restart_init),
       * storing just the pid of init is sufficient.
       */
      z->zone_proc_initpid = p->p_pid;
  
+     if (z->zone_setup_app_contract == B_TRUE) {
+         /*
+          * Normally a process cannot modify its own contract, but we're
+          * just starting the zone's init process and its contract is
+          * always initialized from the sys_process_tmpl template, so
+          * this is the simplest way to setup init's contract to kill
+          * the process if any other process in the contract exits.
+          */
+         p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+     }
+ 
      /*
       * We maintain zone_boot_err so that we can return the cause of the
       * failure back to the caller of the zone_boot syscall.
       */
      p->p_zone->zone_boot_err = start_init_common();
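For processes other than a zone's init, a similar effect is available from userland through libcontract(3LIB) templates. A sketch, illustrative rather than drop-in, since contract(5) constrains which events may be made fatal:

    #include <fcntl.h>
    #include <unistd.h>
    #include <libcontract.h>
    #include <sys/contract/process.h>

    /*
     * Activate a process-contract template in which any member's exit is
     * fatal; a subsequent fork(2) creates the child in the new contract.
     */
    static int
    activate_fatal_exit_template(void)
    {
            int tfd = open("/system/contract/process/template", O_RDWR);

            if (tfd == -1)
                    return (-1);
            if (ct_pr_tmpl_set_fatal(tfd, CT_PR_EV_EXIT) != 0 ||
                ct_tmpl_activate(tfd) != 0) {
                    (void) close(tfd);
                    return (-1);
            }
            return (tfd);   /* caller forks, then ct_tmpl_clear()s */
    }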
*** 3799,3811 ****
--- 4128,4185 ----
              mutex_enter(&p->p_lock);
              ASSERT(p->p_flag & SEXITLWPS);
              lwp_exit();
          }
      } else {
+         id_t cid = curthread->t_cid;
+ 
          if (zone_status_get(z) == ZONE_IS_BOOTING)
              zone_status_set(z, ZONE_IS_RUNNING);
          mutex_exit(&zone_status_lock);
+ 
+         mutex_enter(&class_lock);
+         ASSERT(cid < loaded_classes);
+         if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+             z->zone_fixed_hipri) {
+             /*
+              * If the zone is using FX then by default all
+              * processes start at the lowest priority and stay
+              * there. We provide a mechanism for the zone to
+              * indicate that it should run at "high priority". In
+              * this case we setup init to run at the highest FX
+              * priority (which is one level higher than the
+              * non-fixed scheduling classes can use).
+              */
+             pcparms_t pcparms;
+ 
+             pcparms.pc_cid = cid;
+             ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+             ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+                 FXMAXUPRI;
+             ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+                 FX_DOUPRILIM | FX_DOUPRI;
+ 
+             mutex_enter(&pidlock);
+             mutex_enter(&curproc->p_lock);
+ 
+             (void) parmsset(&pcparms, curthread);
+ 
+             mutex_exit(&curproc->p_lock);
+             mutex_exit(&pidlock);
+         } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+             /*
+              * zsched always starts the init lwp at priority
+              * minclsyspri - 1. This priority gets set in t_pri and
+              * is invalid for RT, but RT never uses t_pri. However
+              * t_pri is used by procfs, so we always see processes
+              * within an RT zone with an invalid priority value.
+              * We fix that up now.
+              */
+             curthread->t_pri = RTGPPRIO0;
+         }
+         mutex_exit(&class_lock);
+ 
          /* cause the process to return to userland. */
          lwp_rtt();
      }
  }
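The effect on init's priority is observable from userland. A hedged sketch using priocntl(2) to report the scheduling class a zone's init ended up in (the helper name is invented for illustration):

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/procset.h>
    #include <sys/priocntl.h>

    static void
    show_sched_class(pid_t pid)
    {
            pcparms_t pcparms;
            pcinfo_t pcinfo;

            /* PC_CLNULL asks for the process's current class. */
            pcparms.pc_cid = PC_CLNULL;
            if (priocntl(P_PID, pid, PC_GETPARMS, (caddr_t)&pcparms) == -1) {
                    perror("PC_GETPARMS");
                    return;
            }
            pcinfo.pc_cid = pcparms.pc_cid;
            if (priocntl(P_PID, pid, PC_GETCLINFO, (caddr_t)&pcinfo) != -1)
                    (void) printf("class: %s\n", pcinfo.pc_clname);
    }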
*** 3843,3852 ****
--- 4217,4227 ----
      bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
      bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
      PTOU(pp)->u_argc = 0;
      PTOU(pp)->u_argv = NULL;
      PTOU(pp)->u_envp = NULL;
+     PTOU(pp)->u_commpagep = NULL;
      closeall(P_FINFO(pp));
  
      /*
       * We are this zone's "zsched" process. As the zone isn't generally
       * visible yet we don't need to grab any locks before initializing its
*** 4285,4296 ****
          uint_t i, nelem;
          char *name;
  
          error = EINVAL;
          name = nvpair_name(nvp);
!         if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
!             != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
              goto out;
          }
          if ((hndl = rctl_hndl_lookup(name)) == -1) {
              goto out;
          }
--- 4660,4672 ----
          uint_t i, nelem;
          char *name;
  
          error = EINVAL;
          name = nvpair_name(nvp);
!         if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
!             strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
!             nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
              goto out;
          }
          if ((hndl = rctl_hndl_lookup(name)) == -1) {
              goto out;
          }
*** 4434,4443 ****
--- 4810,4821 ----
      zone->zone_pool_mod = gethrtime();
      zone->zone_psetid = ZONE_PS_INVAL;
      zone->zone_ncpus = 0;
      zone->zone_ncpus_online = 0;
      zone->zone_restart_init = B_TRUE;
+     zone->zone_reboot_on_init_exit = B_FALSE;
+     zone->zone_init_status = -1;
      zone->zone_brand = &native_brand;
      zone->zone_initname = NULL;
      mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
      mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
      mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
*** 4495,4508 ****
--- 4873,4889 ----
      zone->zone_nprocs_ctl = INT_MAX;
      zone->zone_locked_mem = 0;
      zone->zone_locked_mem_ctl = UINT64_MAX;
      zone->zone_max_swap = 0;
      zone->zone_max_swap_ctl = UINT64_MAX;
+     zone->zone_phys_mem = 0;
+     zone->zone_phys_mem_ctl = UINT64_MAX;
      zone->zone_max_lofi = 0;
      zone->zone_max_lofi_ctl = UINT64_MAX;
      zone->zone_lockedmem_kstat = NULL;
      zone->zone_swapresv_kstat = NULL;
+     zone->zone_physmem_kstat = NULL;
      zone->zone_zfs_io_pri = 1;
  
      /*
       * Zsched initializes the rctls.
       */
*** 4654,4665 ****
      zarg.zone = zone;
      zarg.nvlist = rctls;
      /*
       * The process, task, and project rctls are probably wrong;
       * we need an interface to get the default values of all rctls,
!      * and initialize zsched appropriately. I'm not sure that that
!      * makes much of a difference, though.
       */
      error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
      if (error != 0) {
          /*
           * We need to undo all globally visible state.
--- 5035,5046 ----
      zarg.zone = zone;
      zarg.nvlist = rctls;
      /*
       * The process, task, and project rctls are probably wrong;
       * we need an interface to get the default values of all rctls,
!      * and initialize zsched appropriately. However, we allow zoneadmd
!      * to pass down both zone and project rctls for the zone's init.
       */
      error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
      if (error != 0) {
          /*
           * We need to undo all globally visible state.
*** 5555,5572 ****
              err = copyoutstr(outstr, buf, bufsize, NULL);
              if (err != 0 && err != ENAMETOOLONG)
                  error = EFAULT;
          }
          break;
-     case ZONE_ATTR_PHYS_MCAP:
-         size = sizeof (zone->zone_phys_mcap);
-         if (bufsize > size)
-             bufsize = size;
-         if (buf != NULL &&
-             copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
-             error = EFAULT;
-         break;
      case ZONE_ATTR_SCHED_CLASS:
          mutex_enter(&class_lock);
  
          if (zone->zone_defaultcid >= loaded_classes)
              outstr = "";
--- 5936,5945 ----
*** 5617,5626 ****
--- 5990,6008 ----
              if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
                  error = EFAULT;
          }
          kmem_free(zbuf, bufsize);
          break;
+     case ZONE_ATTR_SCHED_FIXEDHI:
+         size = sizeof (boolean_t);
+         if (bufsize > size)
+             bufsize = size;
+ 
+         if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+             bufsize) != 0)
+             error = EFAULT;
+         break;
      default:
          if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
              size = bufsize;
              error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
          } else {
*** 5648,5661 ****
      if (secpolicy_zone_config(CRED()) != 0)
          return (set_errno(EPERM));
  
      /*
!      * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
!      * global zone.
       */
!     if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
          return (set_errno(EINVAL));
      }
  
      mutex_enter(&zonehash_lock);
      if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
--- 6030,6044 ----
      if (secpolicy_zone_config(CRED()) != 0)
          return (set_errno(EPERM));
  
      /*
!      * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
!      * attributes can be set on the global zone.
       */
!     if (zoneid == GLOBAL_ZONEID &&
!         attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
          return (set_errno(EINVAL));
      }
  
      mutex_enter(&zonehash_lock);
      if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
*** 5668,5678 ****
      /*
       * At present most attributes can only be set on non-running,
       * non-global zones.
       */
      zone_status = zone_status_get(zone);
!     if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
          err = EINVAL;
          goto done;
      }
  
      switch (attr) {
--- 6051,6063 ----
      /*
       * At present most attributes can only be set on non-running,
       * non-global zones.
       */
      zone_status = zone_status_get(zone);
!     if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
!         attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
!         zone_status > ZONE_IS_READY) {
          err = EINVAL;
          goto done;
      }
  
      switch (attr) {
*** 5690,5702 ****
          err = zone_set_brand(zone, (const char *)buf);
          break;
      case ZONE_ATTR_FS_ALLOWED:
          err = zone_set_fs_allowed(zone, (const char *)buf);
          break;
!     case ZONE_ATTR_PHYS_MCAP:
!         err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
          break;
      case ZONE_ATTR_SCHED_CLASS:
          err = zone_set_sched_class(zone, (const char *)buf);
          break;
      case ZONE_ATTR_HOSTID:
          if (bufsize == sizeof (zone->zone_hostid)) {
--- 6075,6096 ----
          err = zone_set_brand(zone, (const char *)buf);
          break;
      case ZONE_ATTR_FS_ALLOWED:
          err = zone_set_fs_allowed(zone, (const char *)buf);
          break;
!     case ZONE_ATTR_PMCAP_NOVER:
!         err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
          break;
+     case ZONE_ATTR_PMCAP_PAGEOUT:
+         err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+         break;
+     case ZONE_ATTR_PG_FLT_DELAY:
+         err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+         break;
+     case ZONE_ATTR_RSS:
+         err = zone_set_rss(zone, (const uint64_t *)buf);
+         break;
      case ZONE_ATTR_SCHED_CLASS:
          err = zone_set_sched_class(zone, (const char *)buf);
          break;
      case ZONE_ATTR_HOSTID:
          if (bufsize == sizeof (zone->zone_hostid)) {
*** 5720,5729 ****
--- 6114,6139 ----
              break;
          }
          err = zone_set_network(zoneid, zbuf);
          kmem_free(zbuf, bufsize);
          break;
+     case ZONE_ATTR_APP_SVC_CT:
+         if (bufsize != sizeof (boolean_t)) {
+             err = EINVAL;
+         } else {
+             zone->zone_setup_app_contract = (boolean_t)buf;
+             err = 0;
+         }
+         break;
+     case ZONE_ATTR_SCHED_FIXEDHI:
+         if (bufsize != sizeof (boolean_t)) {
+             err = EINVAL;
+         } else {
+             zone->zone_fixed_hipri = (boolean_t)buf;
+             err = 0;
+         }
+         break;
      default:
          if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
              err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
          else
              err = EINVAL;
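Note the (boolean_t)buf casts above: for these two attributes the value travels in the buffer pointer itself rather than through copyin(), so the bufsize check is the only validation. A hypothetical caller (the helper name is invented) would look like:

    #include <sys/types.h>
    #include <zone.h>
    #include <sys/zone.h>

    /* Ask for the OS-4915 high-priority FX behavior on a halted zone. */
    static int
    zone_enable_fx_hipri(zoneid_t zid)
    {
            return (zone_setattr(zid, ZONE_ATTR_SCHED_FIXEDHI,
                (void *)(uintptr_t)B_TRUE, sizeof (boolean_t)));
    }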
*** 6523,6532 ****
--- 6933,6943 ----
      zone_namelen = strlen(zone->zone_name) + 1;
      zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
      bcopy(zone->zone_name, zone_name, zone_namelen);
      zoneid = zone->zone_id;
      uniqid = zone->zone_uniqid;
+     arg.status = zone->zone_init_status;
      /*
       * zoneadmd may be down, but at least we can empty out the zone.
       * We can ignore the return value of zone_empty() since we're called
       * from a kernel thread and know we won't be delivered any signals.
       */
*** 6763,6782 ****
      mutex_exit(&zone_status_lock);
      mutex_exit(&zonehash_lock);
  }
  
  /*
!  * Returns true if the named dataset is visible in the current zone.
   * The 'write' parameter is set to 1 if the dataset is also writable.
   */
  int
! zone_dataset_visible(const char *dataset, int *write)
  {
      static int zfstype = -1;
      zone_dataset_t *zd;
      size_t len;
-     zone_t *zone = curproc->p_zone;
      const char *name = NULL;
      vfs_t *vfsp = NULL;
  
      if (dataset[0] == '\0')
          return (0);
--- 7174,7192 ----
      mutex_exit(&zone_status_lock);
      mutex_exit(&zonehash_lock);
  }
  
  /*
!  * Returns true if the named dataset is visible in the specified zone.
   * The 'write' parameter is set to 1 if the dataset is also writable.
   */
  int
! zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
  {
      static int zfstype = -1;
      zone_dataset_t *zd;
      size_t len;
      const char *name = NULL;
      vfs_t *vfsp = NULL;
  
      if (dataset[0] == '\0')
          return (0);
*** 6840,6850 ****
      }
  
      vfs_list_read_lock();
      vfsp = zone->zone_vfslist;
      do {
!         ASSERT(vfsp);
          if (vfsp->vfs_fstype == zfstype) {
              name = refstr_value(vfsp->vfs_resource);
  
              /*
               * Check if we have an exact match.
--- 7250,7261 ----
      }
  
      vfs_list_read_lock();
      vfsp = zone->zone_vfslist;
      do {
!         if (vfsp == NULL)
!             break;
          if (vfsp->vfs_fstype == zfstype) {
              name = refstr_value(vfsp->vfs_resource);
  
              /*
               * Check if we have an exact match.
*** 6876,6885 ****
--- 7287,7308 ----
      vfs_list_unlock();
      return (0);
  }
  
+ /*
+  * Returns true if the named dataset is visible in the current zone.
+  * The 'write' parameter is set to 1 if the dataset is also writable.
+  */
+ int
+ zone_dataset_visible(const char *dataset, int *write)
+ {
+     zone_t *zone = curproc->p_zone;
+ 
+     return (zone_dataset_visible_inzone(zone, dataset, write));
+ }
+ 
  /*
   * zone_find_by_any_path() -
   *
   * kernel-private routine similar to zone_find_by_path(), but which
   * effectively compares against zone paths rather than zonerootpath
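With the split above, in-kernel consumers that already hold a zone reference (for example the lx /dev enumeration work in OS-5189/OS-5179) no longer need to run in the target zone's context. A sketch of such a caller (the wrapper function is invented for illustration):

    /*
     * Return non-zero if the dataset is both visible and writable in the
     * given zone, taking a proper hold on the zone_t.
     */
    static int
    dataset_writable_for(zoneid_t zid, const char *dsname)
    {
            zone_t *zone;
            int writable = 0;
            int vis;

            if ((zone = zone_find_by_id(zid)) == NULL)
                    return (0);
            vis = zone_dataset_visible_inzone(zone, dsname, &writable);
            zone_rele(zone);
            return (vis && writable);
    }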