Remove most KEBE comments and accompanying unused code or variables/fields.
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Mismerge in zone.c frees stats that aren't there.
Mismerge in zone.c creates zone mcap kstats too many times
OS-338 Kstat counters to show "slow" VFS operations
OS-5189 lx dev enumeration can deadlock with zfs
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5187 improve /proc/diskstat handling
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5179 flatten zvol entries for /dev and /proc/partitions
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Undo merge damage from zone kstats
OS-4915 want FX high priority zone configuration option
OS-4925 ps pri shows misleading value for zone in RT scheduling class
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4017 would like zfs-io-priority values > 1024
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3776 project rctls should be in sync with zone rctls
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
OS-11 rcapd behaves poorly when under extreme load
OS-399 zone phys. mem. cap should be a rctl and have associated kstat
*** 19,29 ****
* CDDL HEADER END
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
! * Copyright 2015, Joyent Inc. All rights reserved.
*/
/*
* Zones
*
--- 19,29 ----
* CDDL HEADER END
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
! * Copyright 2016, Joyent Inc.
*/
/*
* Zones
*
*** 248,257 ****
--- 248,259 ----
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>
+ #include <sys/rt.h>
+ #include <sys/fx.h>
/*
* This constant specifies the number of seconds that threads waiting for
* subsystems to release a zone's general-purpose references will wait before
* they log the zone's reference counts. The constant's value shouldn't
*** 368,377 ****
--- 370,380 ----
* This isn't static so lint doesn't complain.
*/
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
+ rctl_hndl_t rc_zone_phys_mem;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_zfs_io_pri;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
*** 1740,1749 ****
--- 1743,1785 ----
zone_max_swap_test
};
/*ARGSUSED*/
static rctl_qty_t
+ zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+ {
+ rctl_qty_t q;
+ zone_t *z = p->p_zone;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ /* No additional locking needed; the cap is not enforced in the kernel */
+ q = z->zone_phys_mem;
+ return (q);
+ }
+
+ /*ARGSUSED*/
+ static int
+ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+ rctl_qty_t nv)
+ {
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(e->rcep_t == RCENTITY_ZONE);
+ if (e->rcep_p.zone == NULL)
+ return (0);
+ e->rcep_p.zone->zone_phys_mem_ctl = nv;
+ return (0);
+ }
+
+ static rctl_ops_t zone_phys_mem_ops = {
+ rcop_no_action,
+ zone_phys_mem_usage,
+ zone_phys_mem_set,
+ rcop_no_test
+ };
+
+ /*ARGSUSED*/
+ static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
rctl_qty_t q;
zone_t *z = p->p_zone;
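
The new zone.max-physical-memory rctl registered by these ops is observable
through the standard rctl system-call interface. A minimal userland sketch,
assuming only the public getrctl(2) API and the rctl name registered later
in this diff (error handling trimmed):

    #include <rctl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
            rctlblk_t *rblk;

            if ((rblk = malloc(rctlblk_size())) == NULL)
                    return (1);

            /* Fetch the first value on the zone.max-physical-memory sequence. */
            if (getrctl("zone.max-physical-memory", NULL, rblk,
                RCTL_FIRST) != 0) {
                    perror("getrctl");
                    return (1);
            }

            (void) printf("cap: %llu bytes\n",
                (unsigned long long)rctlblk_get_value(rblk));
            return (0);
    }
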
*** 1833,1842 ****
--- 1869,1892 ----
zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
return (0);
}
static int
+ zone_physmem_kstat_update(kstat_t *ksp, int rw)
+ {
+ zone_t *zone = ksp->ks_private;
+ zone_kstat_t *zk = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+ zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+ return (0);
+ }
+
+ static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_kstat_t *zk = ksp->ks_data;
*** 1886,1910 ****
--- 1936,2122 ----
ksp->ks_private = zone;
kstat_install(ksp);
return (ksp);
}
+ static int
+ zone_vfs_kstat_update(kstat_t *ksp, int rw)
+ {
+ zone_t *zone = ksp->ks_private;
+ zone_vfs_kstat_t *zvp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_vfs_rwstats;
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the VFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the slow ops
+ * counters are updated directly by the VFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zvp->zv_nread.value.ui64 = kiop->nread;
+ zvp->zv_reads.value.ui64 = kiop->reads;
+ zvp->zv_rtime.value.ui64 = kiop->rtime;
+ zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+ zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+ zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+ zvp->zv_writes.value.ui64 = kiop->writes;
+ zvp->zv_wtime.value.ui64 = kiop->wtime;
+ zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+ zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+ scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+ scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+ return (0);
+ }
+
+ static kstat_t *
+ zone_vfs_kstat_create(zone_t *zone)
+ {
+ kstat_t *ksp;
+ zone_vfs_kstat_t *zvp;
+
+ if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+ zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_vfs_lock;
+ zone->zone_vfs_stats = zvp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+ kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+ kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_vfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+ }
+
static int
+ zone_zfs_kstat_update(kstat_t *ksp, int rw)
+ {
+ zone_t *zone = ksp->ks_private;
+ zone_zfs_kstat_t *zzp = ksp->ks_data;
+ kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ /*
+ * Extract the ZFS statistics from the kstat_io_t structure used by
+ * kstat_runq_enter() and related functions. Since the I/O throttle
+ * counters are updated directly by the ZFS layer, there's no need to
+ * copy those statistics here.
+ *
+ * Note that kstat_runq_enter() and the related functions use
+ * gethrtime_unscaled(), so scale the time here.
+ */
+ zzp->zz_nread.value.ui64 = kiop->nread;
+ zzp->zz_reads.value.ui64 = kiop->reads;
+ zzp->zz_rtime.value.ui64 = kiop->rtime;
+ zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+ zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+ zzp->zz_writes.value.ui64 = kiop->writes;
+
+ scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+ scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+ return (0);
+ }
+
+ static kstat_t *
+ zone_zfs_kstat_create(zone_t *zone)
+ {
+ kstat_t *ksp;
+ zone_zfs_kstat_t *zzp;
+
+ if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+ zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+ sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+ return (NULL);
+
+ if (zone->zone_id != GLOBAL_ZONEID)
+ kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+ zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+ ksp->ks_data_size += strlen(zone->zone_name) + 1;
+ ksp->ks_lock = &zone->zone_zfs_lock;
+ zone->zone_zfs_stats = zzp;
+
+ /* The kstat "name" field is not large enough for a full zonename */
+ kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+ kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+ kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+ kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+ ksp->ks_update = zone_zfs_kstat_update;
+ ksp->ks_private = zone;
+
+ kstat_install(ksp);
+ return (ksp);
+ }
+
+ static int
zone_mcap_kstat_update(kstat_t *ksp, int rw)
{
zone_t *zone = ksp->ks_private;
zone_mcap_kstat_t *zmp = ksp->ks_data;
if (rw == KSTAT_WRITE)
return (EACCES);
+ zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+ zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+ zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+ zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+ zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+ zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+ zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+ zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
return (0);
}
static kstat_t *
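
Since the slow-op counters ("10ms_ops" through "10s_ops") exist for
observability, a consumer reads them with the usual libkstat(3KSTAT)
pattern. A minimal sketch, assuming the zone ID is known and using only
the module and statistic names initialized above:

    #include <kstat.h>
    #include <stdio.h>

    static void
    print_slow_ops(kstat_ctl_t *kc, int zid)
    {
            kstat_t *ksp;
            kstat_named_t *kn;

            /* Module and instance match the kstat_create_zone() call above. */
            if ((ksp = kstat_lookup(kc, "zone_vfs", zid, NULL)) == NULL)
                    return;
            if (kstat_read(kc, ksp, NULL) == -1)
                    return;

            /* "1s_ops" counts VFS operations that took longer than a second. */
            if ((kn = kstat_data_lookup(ksp, "1s_ops")) != NULL)
                    (void) printf("1s_ops: %llu\n",
                        (unsigned long long)kn->value.ui64);
    }
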
*** 1928,1943 ****
--- 2140,2166 ----
zone->zone_mcap_stats = zmp;
/* The kstat "name" field is not large enough for a full zonename */
kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+ kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+ KSTAT_DATA_UINT64);
ksp->ks_update = zone_mcap_kstat_update;
ksp->ks_private = zone;
kstat_install(ksp);
*** 2033,2054 ****
--- 2256,2285 ----
{
zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
"lockedmem", zone_lockedmem_kstat_update);
zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
"swapresv", zone_swapresv_kstat_update);
+ zone->zone_physmem_kstat = zone_kstat_create_common(zone,
+ "physicalmem", zone_physmem_kstat_update);
zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
"nprocs", zone_nprocs_kstat_update);
+ if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+ zone->zone_vfs_stats = kmem_zalloc(
+ sizeof (zone_vfs_kstat_t), KM_SLEEP);
+ }
+
if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
zone->zone_mcap_stats = kmem_zalloc(
sizeof (zone_mcap_kstat_t), KM_SLEEP);
}
if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
zone->zone_misc_stats = kmem_zalloc(
sizeof (zone_misc_kstat_t), KM_SLEEP);
}
}
static void
zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
{
*** 2067,2082 ****
--- 2298,2319 ----
{
zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_swapresv_kstat,
sizeof (zone_kstat_t));
+ zone_kstat_delete_common(&zone->zone_physmem_kstat,
+ sizeof (zone_kstat_t));
zone_kstat_delete_common(&zone->zone_nprocs_kstat,
sizeof (zone_kstat_t));
+
+ zone_kstat_delete_common(&zone->zone_vfs_ksp,
+ sizeof (zone_vfs_kstat_t));
zone_kstat_delete_common(&zone->zone_mcap_ksp,
sizeof (zone_mcap_kstat_t));
zone_kstat_delete_common(&zone->zone_misc_ksp,
sizeof (zone_misc_kstat_t));
}
/*
* Called very early on in boot to initialize the ZSD list so that
* zone_key_create() can be called before zone_init(). It also initializes
*** 2106,2115 ****
--- 2343,2354 ----
zone0.zone_nprocs_ctl = INT_MAX;
zone0.zone_locked_mem = 0;
zone0.zone_locked_mem_ctl = UINT64_MAX;
ASSERT(zone0.zone_max_swap == 0);
zone0.zone_max_swap_ctl = UINT64_MAX;
+ zone0.zone_phys_mem = 0;
+ zone0.zone_phys_mem_ctl = UINT64_MAX;
zone0.zone_max_lofi = 0;
zone0.zone_max_lofi_ctl = UINT64_MAX;
zone0.zone_shmmax = 0;
zone0.zone_ipc.ipcq_shmmni = 0;
zone0.zone_ipc.ipcq_semmni = 0;
*** 2129,2138 ****
--- 2368,2378 ----
zone0.zone_ncpus_online = 0;
zone0.zone_proc_initpid = 1;
zone0.zone_initname = initname;
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone0.zone_physmem_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
zone0.zone_zfs_io_pri = 1;
zone0.zone_stime = 0;
zone0.zone_utime = 0;
*** 2245,2255 ****
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
! 1024, 1024, &zone_zfs_io_pri_ops);
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
--- 2485,2495 ----
MAXCAP, MAXCAP, &zone_cpu_cap_ops);
rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
! 16384, 16384, &zone_zfs_io_pri_ops);
rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
INT_MAX, INT_MAX, &zone_lwps_ops);
*** 2298,2307 ****
--- 2538,2552 ----
rc_zone_max_swap = rctl_register("zone.max-swap",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_swap_ops);
+ rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+ RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+ RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+ &zone_phys_mem_ops);
+
rc_zone_max_lofi = rctl_register("zone.max-lofi",
RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
&zone_max_lofi_ops);
*** 2319,2328 ****
--- 2564,2575 ----
zone0.zone_nlwps = p0.p_lwpcnt;
zone0.zone_nprocs = 1;
zone0.zone_ntasks = 1;
mutex_exit(&p0.p_lock);
zone0.zone_restart_init = B_TRUE;
+ zone0.zone_reboot_on_init_exit = B_FALSE;
+ zone0.zone_init_status = -1;
zone0.zone_brand = &native_brand;
rctl_prealloc_destroy(gp);
/*
* pool_default hasn't been initialized yet, so we let pool_init()
* take care of making sure the global zone is in the default pool.
*** 2398,2407 ****
--- 2645,2656 ----
}
static void
zone_free(zone_t *zone)
{
+ zone_dl_t *zdl;
+
ASSERT(zone != global_zone);
ASSERT(zone->zone_ntasks == 0);
ASSERT(zone->zone_nlwps == 0);
ASSERT(zone->zone_nprocs == 0);
ASSERT(zone->zone_cred_ref == 0);
*** 2426,2435 ****
--- 2675,2697 ----
}
list_destroy(&zone->zone_ref_list);
zone_free_zsd(zone);
zone_free_datasets(zone);
+
+ /*
+ * While dlmgmtd should have removed all of these, it could have left
+ * something behind or crashed, in which case it's not safe for us to
+ * assume the list is empty (list_destroy() will ASSERT if it isn't).
+ * We clean up on behalf of our userland comrade, which may have
+ * crashed or, worse, been disabled by SMF.
+ */
+ while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+ if (zdl->zdl_net != NULL)
+ nvlist_free(zdl->zdl_net);
+ kmem_free(zdl, sizeof (zone_dl_t));
+ }
list_destroy(&zone->zone_dl_list);
if (zone->zone_rootvp != NULL)
VN_RELE(zone->zone_rootvp);
if (zone->zone_rootpath)
*** 2561,2573 ****
mutex_exit(&zone_status_lock);
brand_unregister_zone(bp);
return (EINVAL);
}
! /* set up the brand specific data */
zone->zone_brand = bp;
! ZBROP(zone)->b_init_brand_data(zone);
mutex_exit(&zone_status_lock);
return (0);
}
--- 2823,2840 ----
mutex_exit(&zone_status_lock);
brand_unregister_zone(bp);
return (EINVAL);
}
! /*
! * Set up the brand specific data.
! * Note that it's possible that the hook has to drop the
! * zone_status_lock and reacquire it before returning, so we can't
! * assume the lock has been held the entire time.
! */
zone->zone_brand = bp;
! ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
mutex_exit(&zone_status_lock);
return (0);
}
*** 2609,2631 ****
zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
(void) strcpy(zone->zone_initname, initname);
return (0);
}
static int
! zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
{
! uint64_t mcap;
! int err = 0;
! if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
! zone->zone_phys_mcap = mcap;
return (err);
}
static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
char sched_class[PC_CLNMSZ];
id_t classid;
int err;
--- 2876,2949 ----
zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
(void) strcpy(zone->zone_initname, initname);
return (0);
}
+ /*
+ * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+ * to provide the physical memory capping kstats. Since physical memory
+ * capping is currently implemented in userland, that code uses the setattr
+ * entry point to update the kstats: each zone_set_mcap_nover call
+ * simply increments the nover counter, and each zone_set_mcap_pageout
+ * call adds the input value to zone_mcap_pagedout.
+ */
+ /*ARGSUSED*/
static int
! zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
{
! zone->zone_mcap_nover++;
! return (0);
! }
+ static int
+ zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+ {
+ uint64_t pageout;
+ int err;
+
+ if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+ zone->zone_mcap_pagedout += pageout;
+
return (err);
}
+ /*
+ * The zone_set_page_fault_delay function is used to set the number of usecs
+ * by which to throttle page faults. This is normally 0, but the user-land
+ * memory capping code can set it to a non-zero value while the zone is over
+ * its physical memory cap.
+ */
static int
+ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+ {
+ uint32_t dusec;
+ int err;
+
+ if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+ zone->zone_pg_flt_delay = dusec;
+
+ return (err);
+ }
+
+ /*
+ * The zone_set_rss function is used to set the zone's RSS when we do the
+ * fast, approximate calculation in user-land.
+ */
+ static int
+ zone_set_rss(zone_t *zone, const uint64_t *prss)
+ {
+ uint64_t rss;
+ int err;
+
+ if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+ zone->zone_phys_mem = rss;
+
+ return (err);
+ }
+
+ static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
char sched_class[PC_CLNMSZ];
id_t classid;
int err;
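
Taken together, these setattr handlers define the kernel half of the
userland capping protocol: the capping daemon periodically computes an
approximate RSS, reports its pageout work, and throttles page faults while
the zone is over cap. A sketch of the corresponding userland calls, using
the existing zone_setattr(2) private interface and the attribute constants
introduced in this patch:

    #include <sys/types.h>
    #include <zone.h>

    static int
    report_cap_pass(zoneid_t zid, uint64_t rss, uint64_t pagedout,
        uint32_t dusec)
    {
            /* The kernel copies in the fast, approximate RSS value. */
            if (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) != 0)
                    return (-1);

            /* Adds the page count to zone_mcap_pagedout. */
            if (zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
                sizeof (pagedout)) != 0)
                    return (-1);

            /* The nover handler ignores its buffer; each call increments. */
            if (zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, NULL, 0) != 0)
                    return (-1);

            /* Non-zero only while the zone remains over its cap. */
            return (zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &dusec,
                sizeof (dusec)));
    }
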
*** 3771,3781 ****
--- 4089,4110 ----
* For all purposes (ZONE_ATTR_INITPID and restart_init),
* storing just the pid of init is sufficient.
*/
z->zone_proc_initpid = p->p_pid;
+ if (z->zone_setup_app_contract == B_TRUE) {
/*
+ * Normally a process cannot modify its own contract, but we're
+ * just starting the zone's init process and its contract is
+ * always initialized from the sys_process_tmpl template, so
+ * this is the simplest way to set up init's contract to kill
+ * the process if any other process in the contract exits.
+ */
+ p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+ }
+
+ /*
* We maintain zone_boot_err so that we can return the cause of the
* failure back to the caller of the zone_boot syscall.
*/
p->p_zone->zone_boot_err = start_init_common();
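
The in-kernel shortcut above exists because a process cannot normally
modify its own contract, and init's contract has already been created from
sys_process_tmpl by the time we get here. For any other process the
conventional route is a libcontract(3LIB) template, which only affects
contracts created after activation; a minimal sketch:

    #include <fcntl.h>
    #include <unistd.h>
    #include <libcontract.h>
    #include <sys/contract/process.h>

    static int
    fatal_exit_template(void)
    {
            int fd;

            if ((fd = open("/system/contract/process/template",
                O_RDWR)) == -1)
                    return (-1);

            /* Make any member's exit fatal to the rest of the contract. */
            if (ct_pr_tmpl_set_fatal(fd, CT_PR_EV_EXIT) != 0 ||
                ct_tmpl_activate(fd) != 0) {
                    (void) close(fd);
                    return (-1);
            }
            return (fd);
    }
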
*** 3799,3811 ****
--- 4128,4185 ----
mutex_enter(&p->p_lock);
ASSERT(p->p_flag & SEXITLWPS);
lwp_exit();
}
} else {
+ id_t cid = curthread->t_cid;
+
if (zone_status_get(z) == ZONE_IS_BOOTING)
zone_status_set(z, ZONE_IS_RUNNING);
mutex_exit(&zone_status_lock);
+
+ mutex_enter(&class_lock);
+ ASSERT(cid < loaded_classes);
+ if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+ z->zone_fixed_hipri) {
+ /*
+ * If the zone is using FX then by default all
+ * processes start at the lowest priority and stay
+ * there. We provide a mechanism for the zone to
+ * indicate that it should run at "high priority". In
+ * this case we setup init to run at the highest FX
+ * priority (which is one level higher than the
+ * non-fixed scheduling classes can use).
+ */
+ pcparms_t pcparms;
+
+ pcparms.pc_cid = cid;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+ FXMAXUPRI;
+ ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+ FX_DOUPRILIM | FX_DOUPRI;
+
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ (void) parmsset(&pcparms, curthread);
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+ /*
+ * zsched always starts the init lwp at priority
+ * minclsyspri - 1. This priority gets set in t_pri and
+ * is invalid for RT, but RT never uses t_pri. However,
+ * t_pri is used by procfs, so we always see processes
+ * within an RT zone with an invalid priority value.
+ * We fix that up now.
+ */
+ curthread->t_pri = RTGPPRIO0;
+ }
+ mutex_exit(&class_lock);
+
/* cause the process to return to userland. */
lwp_rtt();
}
}
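
The parmsset() call above is the kernel-side equivalent of pinning an LWP
at the top FX user priority. For comparison, a userland sketch using the
public priocntl(2) interface; the PC_GETCID lookup stands in for the
kernel's direct use of FXMAXUPRI:

    #include <sys/types.h>
    #include <sys/procset.h>
    #include <sys/priocntl.h>
    #include <sys/fxpriocntl.h>
    #include <string.h>

    static int
    fx_pin_high(void)
    {
            pcinfo_t pci;
            pcparms_t pcp;
            fxparms_t *fxp = (fxparms_t *)pcp.pc_clparms;

            /* Look up the FX class ID and its maximum user priority. */
            (void) strcpy(pci.pc_clname, "FX");
            if (priocntl(0, 0, PC_GETCID, (caddr_t)&pci) == -1)
                    return (-1);

            pcp.pc_cid = pci.pc_cid;
            fxp->fx_upri = ((fxinfo_t *)pci.pc_clinfo)->fx_maxupri;
            fxp->fx_uprilim = fxp->fx_upri;
            fxp->fx_tqsecs = 0;
            fxp->fx_tqnsecs = FX_TQDEF;	/* keep the default quantum */

            return ((int)priocntl(P_LWPID, P_MYID, PC_SETPARMS,
                (caddr_t)&pcp));
    }
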
*** 3843,3852 ****
--- 4217,4227 ----
bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
PTOU(pp)->u_argc = 0;
PTOU(pp)->u_argv = NULL;
PTOU(pp)->u_envp = NULL;
+ PTOU(pp)->u_commpagep = NULL;
closeall(P_FINFO(pp));
/*
* We are this zone's "zsched" process. As the zone isn't generally
* visible yet we don't need to grab any locks before initializing its
*** 4285,4296 ****
uint_t i, nelem;
char *name;
error = EINVAL;
name = nvpair_name(nvp);
! if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
! != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
goto out;
}
if ((hndl = rctl_hndl_lookup(name)) == -1) {
goto out;
}
--- 4660,4672 ----
uint_t i, nelem;
char *name;
error = EINVAL;
name = nvpair_name(nvp);
! if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
! strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
! nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
goto out;
}
if ((hndl = rctl_hndl_lookup(name)) == -1) {
goto out;
}
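
With this change, zoneadmd can hand zone_create() "project." rctls
alongside the usual "zone." ones. A sketch of building one such entry with
libnvpair; the inner field names ("privilege", "limit", "action") follow
the encoding already used for the zone rctls and are assumptions here:

    #include <libnvpair.h>
    #include <sys/rctl.h>

    static nvlist_t *
    build_project_rctl(uint64_t maxlwps)
    {
            nvlist_t *nvl = NULL, *val = NULL;

            if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
                nvlist_alloc(&val, NV_UNIQUE_NAME, 0) != 0)
                    goto fail;

            /* One rctl value: a privileged deny limit. */
            if (nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED) != 0 ||
                nvlist_add_uint64(val, "limit", maxlwps) != 0 ||
                nvlist_add_uint64(val, "action", RCTL_LOCAL_DENY) != 0 ||
                nvlist_add_nvlist_array(nvl, "project.max-lwps",
                &val, 1) != 0)
                    goto fail;

            nvlist_free(val);
            return (nvl);
    fail:
            nvlist_free(val);
            nvlist_free(nvl);
            return (NULL);
    }
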
*** 4434,4443 ****
--- 4810,4821 ----
zone->zone_pool_mod = gethrtime();
zone->zone_psetid = ZONE_PS_INVAL;
zone->zone_ncpus = 0;
zone->zone_ncpus_online = 0;
zone->zone_restart_init = B_TRUE;
+ zone->zone_reboot_on_init_exit = B_FALSE;
+ zone->zone_init_status = -1;
zone->zone_brand = &native_brand;
zone->zone_initname = NULL;
mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
*** 4495,4508 ****
--- 4873,4889 ----
zone->zone_nprocs_ctl = INT_MAX;
zone->zone_locked_mem = 0;
zone->zone_locked_mem_ctl = UINT64_MAX;
zone->zone_max_swap = 0;
zone->zone_max_swap_ctl = UINT64_MAX;
+ zone->zone_phys_mem = 0;
+ zone->zone_phys_mem_ctl = UINT64_MAX;
zone->zone_max_lofi = 0;
zone->zone_max_lofi_ctl = UINT64_MAX;
zone->zone_lockedmem_kstat = NULL;
zone->zone_swapresv_kstat = NULL;
+ zone->zone_physmem_kstat = NULL;
zone->zone_zfs_io_pri = 1;
/*
* Zsched initializes the rctls.
*/
*** 4654,4665 ****
zarg.zone = zone;
zarg.nvlist = rctls;
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
! * and initialize zsched appropriately. I'm not sure that that
! * makes much of a difference, though.
*/
error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
if (error != 0) {
/*
* We need to undo all globally visible state.
--- 5035,5046 ----
zarg.zone = zone;
zarg.nvlist = rctls;
/*
* The process, task, and project rctls are probably wrong;
* we need an interface to get the default values of all rctls,
! * and initialize zsched appropriately. However, we allow zoneadmd
! * to pass down both zone and project rctls for the zone's init.
*/
error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
if (error != 0) {
/*
* We need to undo all globally visible state.
*** 5555,5572 ****
err = copyoutstr(outstr, buf, bufsize, NULL);
if (err != 0 && err != ENAMETOOLONG)
error = EFAULT;
}
break;
- case ZONE_ATTR_PHYS_MCAP:
- size = sizeof (zone->zone_phys_mcap);
- if (bufsize > size)
- bufsize = size;
- if (buf != NULL &&
- copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
- error = EFAULT;
- break;
case ZONE_ATTR_SCHED_CLASS:
mutex_enter(&class_lock);
if (zone->zone_defaultcid >= loaded_classes)
outstr = "";
--- 5936,5945 ----
*** 5617,5626 ****
--- 5990,6008 ----
if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
error = EFAULT;
}
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ size = sizeof (boolean_t);
+ if (bufsize > size)
+ bufsize = size;
+
+ if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+ bufsize) != 0)
+ error = EFAULT;
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
size = bufsize;
error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
} else {
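
The matching reader for the new attribute is a plain zone_getattr(2) call;
a minimal sketch:

    #include <sys/types.h>
    #include <zone.h>

    static boolean_t
    zone_is_fixed_hi(zoneid_t zid)
    {
            boolean_t hi = B_FALSE;

            /* Leaves B_FALSE if the attribute is absent or unreadable. */
            (void) zone_getattr(zid, ZONE_ATTR_SCHED_FIXEDHI, &hi,
                sizeof (hi));
            return (hi);
    }
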
*** 5648,5661 ****
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
/*
! * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
! * global zone.
*/
! if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
return (set_errno(EINVAL));
}
mutex_enter(&zonehash_lock);
if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
--- 6030,6044 ----
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
/*
! * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
! * attributes can be set on the global zone.
*/
! if (zoneid == GLOBAL_ZONEID &&
! attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
return (set_errno(EINVAL));
}
mutex_enter(&zonehash_lock);
if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
*** 5668,5678 ****
/*
* At present most attributes can only be set on non-running,
* non-global zones.
*/
zone_status = zone_status_get(zone);
! if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
switch (attr) {
--- 6051,6063 ----
/*
* At present most attributes can only be set on non-running,
* non-global zones.
*/
zone_status = zone_status_get(zone);
! if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
! attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
! zone_status > ZONE_IS_READY) {
err = EINVAL;
goto done;
}
switch (attr) {
*** 5690,5702 ****
err = zone_set_brand(zone, (const char *)buf);
break;
case ZONE_ATTR_FS_ALLOWED:
err = zone_set_fs_allowed(zone, (const char *)buf);
break;
! case ZONE_ATTR_PHYS_MCAP:
! err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
break;
case ZONE_ATTR_HOSTID:
if (bufsize == sizeof (zone->zone_hostid)) {
--- 6075,6096 ----
err = zone_set_brand(zone, (const char *)buf);
break;
case ZONE_ATTR_FS_ALLOWED:
err = zone_set_fs_allowed(zone, (const char *)buf);
break;
! case ZONE_ATTR_PMCAP_NOVER:
! err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
break;
+ case ZONE_ATTR_PMCAP_PAGEOUT:
+ err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+ break;
+ case ZONE_ATTR_PG_FLT_DELAY:
+ err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+ break;
+ case ZONE_ATTR_RSS:
+ err = zone_set_rss(zone, (const uint64_t *)buf);
+ break;
case ZONE_ATTR_SCHED_CLASS:
err = zone_set_sched_class(zone, (const char *)buf);
break;
case ZONE_ATTR_HOSTID:
if (bufsize == sizeof (zone->zone_hostid)) {
*** 5720,5729 ****
--- 6114,6139 ----
break;
}
err = zone_set_network(zoneid, zbuf);
kmem_free(zbuf, bufsize);
break;
+ case ZONE_ATTR_APP_SVC_CT:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_setup_app_contract = (boolean_t)buf;
+ err = 0;
+ }
+ break;
+ case ZONE_ATTR_SCHED_FIXEDHI:
+ if (bufsize != sizeof (boolean_t)) {
+ err = EINVAL;
+ } else {
+ zone->zone_fixed_hipri = (boolean_t)buf;
+ err = 0;
+ }
+ break;
default:
if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
else
err = EINVAL;
*** 6523,6532 ****
--- 6933,6943 ----
zone_namelen = strlen(zone->zone_name) + 1;
zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
bcopy(zone->zone_name, zone_name, zone_namelen);
zoneid = zone->zone_id;
uniqid = zone->zone_uniqid;
+ arg.status = zone->zone_init_status;
/*
* zoneadmd may be down, but at least we can empty out the zone.
* We can ignore the return value of zone_empty() since we're called
* from a kernel thread and know we won't be delivered any signals.
*/
*** 6763,6782 ****
mutex_exit(&zone_status_lock);
mutex_exit(&zonehash_lock);
}
/*
! * Returns true if the named dataset is visible in the current zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
! zone_dataset_visible(const char *dataset, int *write)
{
static int zfstype = -1;
zone_dataset_t *zd;
size_t len;
- zone_t *zone = curproc->p_zone;
const char *name = NULL;
vfs_t *vfsp = NULL;
if (dataset[0] == '\0')
return (0);
--- 7174,7192 ----
mutex_exit(&zone_status_lock);
mutex_exit(&zonehash_lock);
}
/*
! * Returns true if the named dataset is visible in the specified zone.
* The 'write' parameter is set to 1 if the dataset is also writable.
*/
int
! zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
{
static int zfstype = -1;
zone_dataset_t *zd;
size_t len;
const char *name = NULL;
vfs_t *vfsp = NULL;
if (dataset[0] == '\0')
return (0);
*** 6840,6850 ****
}
vfs_list_read_lock();
vfsp = zone->zone_vfslist;
do {
! ASSERT(vfsp);
if (vfsp->vfs_fstype == zfstype) {
name = refstr_value(vfsp->vfs_resource);
/*
* Check if we have an exact match.
--- 7250,7261 ----
}
vfs_list_read_lock();
vfsp = zone->zone_vfslist;
do {
! if (vfsp == NULL)
! break;
if (vfsp->vfs_fstype == zfstype) {
name = refstr_value(vfsp->vfs_resource);
/*
* Check if we have an exact match.
*** 6876,6885 ****
--- 7287,7308 ----
vfs_list_unlock();
return (0);
}
+ /*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+ int
+ zone_dataset_visible(const char *dataset, int *write)
+ {
+ zone_t *zone = curproc->p_zone;
+
+ return (zone_dataset_visible_inzone(zone, dataset, write));
+ }
+
/*
* zone_find_by_any_path() -
*
* kernel-private routine similar to zone_find_by_path(), but which
* effectively compares against zone paths rather than zonerootpath