Remove most KEBE comments and accompanying unused code or variables/fields.
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Mismerge in zone.c frees stats that aren't there.
Mismerge in zone.c creates zone mcap kstats too many times
OS-338 Kstat counters to show "slow" VFS operations
OS-5189 lx dev enumeration can deadlock with zfs
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5187 improve /proc/diskstat handling
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5179 flatten zvol entries for /dev and /proc/partitions
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Undo merge damage from zone kstats
OS-4915 want FX high priority zone configuration option
OS-4925 ps pri shows misleading value for zone in RT scheduling class
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4017 would like zfs-io-priority values > 1024
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3776 project rctls should be in sync with zone rctls
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
OS-11 rcapd behaves poorly when under extreme load
OS-399 zone phys. mem. cap should be a rctl and have associated kstat

@@ -19,11 +19,11 @@
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent Inc. All rights reserved.
+ * Copyright 2016, Joyent Inc.
  */
 
 /*
  * Zones
  *

@@ -248,10 +248,12 @@
 #include <sys/zone.h>
 #include <net/if.h>
 #include <sys/cpucaps.h>
 #include <vm/seg.h>
 #include <sys/mac.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
 
 /*
  * This constant specifies the number of seconds that threads waiting for
  * subsystems to release a zone's general-purpose references will wait before
  * they log the zone's reference counts.  The constant's value shouldn't

@@ -368,10 +370,11 @@
  * This isn't static so lint doesn't complain.
  */
 rctl_hndl_t rc_zone_cpu_shares;
 rctl_hndl_t rc_zone_locked_mem;
 rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_phys_mem;
 rctl_hndl_t rc_zone_max_lofi;
 rctl_hndl_t rc_zone_cpu_cap;
 rctl_hndl_t rc_zone_zfs_io_pri;
 rctl_hndl_t rc_zone_nlwps;
 rctl_hndl_t rc_zone_nprocs;

@@ -1740,10 +1743,43 @@
         zone_max_swap_test
 };
 
 /*ARGSUSED*/
 static rctl_qty_t
+zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+{
+        rctl_qty_t q;
+        zone_t *z = p->p_zone;
+
+        ASSERT(MUTEX_HELD(&p->p_lock));
+        /* No additional lock because not enforced in the kernel */
+        q = z->zone_phys_mem;
+        return (q);
+}
+
+/*ARGSUSED*/
+static int
+zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+        ASSERT(MUTEX_HELD(&p->p_lock));
+        ASSERT(e->rcep_t == RCENTITY_ZONE);
+        if (e->rcep_p.zone == NULL)
+                return (0);
+        e->rcep_p.zone->zone_phys_mem_ctl = nv;
+        return (0);
+}
+
+static rctl_ops_t zone_phys_mem_ops = {
+        rcop_no_action,
+        zone_phys_mem_usage,
+        zone_phys_mem_set,
+        rcop_no_test
+};
+
+/*ARGSUSED*/
+static rctl_qty_t
 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
 {
         rctl_qty_t q;
         zone_t *z = p->p_zone;
 

@@ -1833,10 +1869,24 @@
         zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
         return (0);
 }
 
 static int
+zone_physmem_kstat_update(kstat_t *ksp, int rw)
+{
+        zone_t *zone = ksp->ks_private;
+        zone_kstat_t *zk = ksp->ks_data;
+
+        if (rw == KSTAT_WRITE)
+                return (EACCES);
+
+        zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+        zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+        return (0);
+}
+
+static int
 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
 {
         zone_t *zone = ksp->ks_private;
         zone_kstat_t *zk = ksp->ks_data;
 

@@ -1886,25 +1936,187 @@
         ksp->ks_private = zone;
         kstat_install(ksp);
         return (ksp);
 }
 
+static int
+zone_vfs_kstat_update(kstat_t *ksp, int rw)
+{
+        zone_t *zone = ksp->ks_private;
+        zone_vfs_kstat_t *zvp = ksp->ks_data;
+        kstat_io_t *kiop = &zone->zone_vfs_rwstats;
 
+        if (rw == KSTAT_WRITE)
+                return (EACCES);
+
+        /*
+         * Extract the VFS statistics from the kstat_io_t structure used by
+         * kstat_runq_enter() and related functions.  Since the slow ops
+         * counters are updated directly by the VFS layer, there's no need to
+         * copy those statistics here.
+         *
+         * Note that kstat_runq_enter() and the related functions use
+         * gethrtime_unscaled(), so scale the time here.
+         */
+        zvp->zv_nread.value.ui64 = kiop->nread;
+        zvp->zv_reads.value.ui64 = kiop->reads;
+        zvp->zv_rtime.value.ui64 = kiop->rtime;
+        zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+        zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+        zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+        zvp->zv_writes.value.ui64 = kiop->writes;
+        zvp->zv_wtime.value.ui64 = kiop->wtime;
+        zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+        zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+
+        scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+        scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+        scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+        scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+
+        return (0);
+}
+
+static kstat_t *
+zone_vfs_kstat_create(zone_t *zone)
+{
+        kstat_t *ksp;
+        zone_vfs_kstat_t *zvp;
+
+        if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+            zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+            sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+            KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+                return (NULL);
+
+        if (zone->zone_id != GLOBAL_ZONEID)
+                kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+        zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+        ksp->ks_data_size += strlen(zone->zone_name) + 1;
+        ksp->ks_lock = &zone->zone_vfs_lock;
+        zone->zone_vfs_stats = zvp;
+
+        /* The kstat "name" field is not large enough for a full zonename */
+        kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+        kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+        kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+        kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+
+        ksp->ks_update = zone_vfs_kstat_update;
+        ksp->ks_private = zone;
+
+        kstat_install(ksp);
+        return (ksp);
+}
+
 static int
+zone_zfs_kstat_update(kstat_t *ksp, int rw)
+{
+        zone_t *zone = ksp->ks_private;
+        zone_zfs_kstat_t *zzp = ksp->ks_data;
+        kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+
+        if (rw == KSTAT_WRITE)
+                return (EACCES);
+
+        /*
+         * Extract the ZFS statistics from the kstat_io_t structure used by
+         * kstat_runq_enter() and related functions.  Since the I/O throttle
+         * counters are updated directly by the ZFS layer, there's no need to
+         * copy those statistics here.
+         *
+         * Note that kstat_runq_enter() and the related functions use
+         * gethrtime_unscaled(), so scale the time here.
+         */
+        zzp->zz_nread.value.ui64 = kiop->nread;
+        zzp->zz_reads.value.ui64 = kiop->reads;
+        zzp->zz_rtime.value.ui64 = kiop->rtime;
+        zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+        zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+        zzp->zz_writes.value.ui64 = kiop->writes;
+
+        scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+        scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+
+        return (0);
+}
+
+static kstat_t *
+zone_zfs_kstat_create(zone_t *zone)
+{
+        kstat_t *ksp;
+        zone_zfs_kstat_t *zzp;
+
+        if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+            zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+            sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+            KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+                return (NULL);
+
+        if (zone->zone_id != GLOBAL_ZONEID)
+                kstat_zone_add(ksp, GLOBAL_ZONEID);
+
+        zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+        ksp->ks_data_size += strlen(zone->zone_name) + 1;
+        ksp->ks_lock = &zone->zone_zfs_lock;
+        zone->zone_zfs_stats = zzp;
+
+        /* The kstat "name" field is not large enough for a full zonename */
+        kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+        kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+        kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+        kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+        kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+        kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+        kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+        kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+        kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+
+        ksp->ks_update = zone_zfs_kstat_update;
+        ksp->ks_private = zone;
+
+        kstat_install(ksp);
+        return (ksp);
+}
+
+static int
 zone_mcap_kstat_update(kstat_t *ksp, int rw)
 {
         zone_t *zone = ksp->ks_private;
         zone_mcap_kstat_t *zmp = ksp->ks_data;
 
         if (rw == KSTAT_WRITE)
                 return (EACCES);
 
+        zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+        zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+        zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+        zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+        zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+        zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
         zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
         zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
         zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
         zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
         zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+        zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+        zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
 
         return (0);
 }
 
 static kstat_t *

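For reference, the counters published by zone_vfs_kstat_create() and zone_zfs_kstat_create() are ordinary named kstats, so they are observable from userland with kstat(1M) (e.g. "kstat -m zone_vfs") or programmatically via libkstat(3LIB). The following is a minimal sketch, not part of the change itself; the zone name "myzone" is illustrative and error handling is abbreviated:

        #include <kstat.h>
        #include <stdio.h>

        /*
         * Read the "1s_ops" slow-operation counter that zone_vfs_kstat_create()
         * publishes; the module is "zone_vfs" and the kstat name is the zone name.
         */
        int
        main(void)
        {
                kstat_ctl_t *kc;
                kstat_t *ksp;
                kstat_named_t *kn;

                if ((kc = kstat_open()) == NULL)
                        return (1);
                /* Instance -1 matches any; the zone name selects the kstat. */
                if ((ksp = kstat_lookup(kc, "zone_vfs", -1, "myzone")) == NULL ||
                    kstat_read(kc, ksp, NULL) == -1)
                        return (1);
                if ((kn = kstat_data_lookup(ksp, "1s_ops")) != NULL)
                        (void) printf("1s_ops: %llu\n",
                            (unsigned long long)kn->value.ui64);
                (void) kstat_close(kc);
                return (0);
        }
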
@@ -1928,16 +2140,26 @@
         zone->zone_mcap_stats = zmp;
 
         /* The kstat "name" field is not large enough for a full zonename */
         kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
         kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+        kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
         kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
         kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
         kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
         kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
         kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
             KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+            KSTAT_DATA_UINT64);
+        kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+            KSTAT_DATA_UINT64);
 
         ksp->ks_update = zone_mcap_kstat_update;
         ksp->ks_private = zone;
 
         kstat_install(ksp);

@@ -2033,22 +2256,29 @@
 {
         zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
             "lockedmem", zone_lockedmem_kstat_update);
         zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
             "swapresv", zone_swapresv_kstat_update);
+        zone->zone_physmem_kstat = zone_kstat_create_common(zone,
+            "physicalmem", zone_physmem_kstat_update);
         zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
             "nprocs", zone_nprocs_kstat_update);
 
+        if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+                zone->zone_vfs_stats = kmem_zalloc(
+                    sizeof (zone_vfs_kstat_t), KM_SLEEP);
+        }
+
         if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
                 zone->zone_mcap_stats = kmem_zalloc(
                     sizeof (zone_mcap_kstat_t), KM_SLEEP);
         }
 
         if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
                 zone->zone_misc_stats = kmem_zalloc(
                     sizeof (zone_misc_kstat_t), KM_SLEEP);
         }
 }
 
 static void
 zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
 {

@@ -2067,16 +2298,21 @@
 {
         zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
             sizeof (zone_kstat_t));
         zone_kstat_delete_common(&zone->zone_swapresv_kstat,
             sizeof (zone_kstat_t));
+        zone_kstat_delete_common(&zone->zone_physmem_kstat,
+            sizeof (zone_kstat_t));
         zone_kstat_delete_common(&zone->zone_nprocs_kstat,
             sizeof (zone_kstat_t));
+
+        zone_kstat_delete_common(&zone->zone_vfs_ksp,
+            sizeof (zone_vfs_kstat_t));
         zone_kstat_delete_common(&zone->zone_mcap_ksp,
             sizeof (zone_mcap_kstat_t));
         zone_kstat_delete_common(&zone->zone_misc_ksp,
             sizeof (zone_misc_kstat_t));
 }
 
 /*
  * Called very early on in boot to initialize the ZSD list so that
  * zone_key_create() can be called before zone_init().  It also initializes

@@ -2106,10 +2343,12 @@
         zone0.zone_nprocs_ctl = INT_MAX;
         zone0.zone_locked_mem = 0;
         zone0.zone_locked_mem_ctl = UINT64_MAX;
         ASSERT(zone0.zone_max_swap == 0);
         zone0.zone_max_swap_ctl = UINT64_MAX;
+        zone0.zone_phys_mem = 0;
+        zone0.zone_phys_mem_ctl = UINT64_MAX;
         zone0.zone_max_lofi = 0;
         zone0.zone_max_lofi_ctl = UINT64_MAX;
         zone0.zone_shmmax = 0;
         zone0.zone_ipc.ipcq_shmmni = 0;
         zone0.zone_ipc.ipcq_semmni = 0;

@@ -2129,10 +2368,11 @@
         zone0.zone_ncpus_online = 0;
         zone0.zone_proc_initpid = 1;
         zone0.zone_initname = initname;
         zone0.zone_lockedmem_kstat = NULL;
         zone0.zone_swapresv_kstat = NULL;
+        zone0.zone_physmem_kstat = NULL;
         zone0.zone_nprocs_kstat = NULL;
         zone0.zone_zfs_io_pri = 1;
 
         zone0.zone_stime = 0;
         zone0.zone_utime = 0;

@@ -2245,11 +2485,11 @@
             MAXCAP, MAXCAP, &zone_cpu_cap_ops);
 
         rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
             RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
             RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
-            1024, 1024, &zone_zfs_io_pri_ops);
+            16384, 16384, &zone_zfs_io_pri_ops);
 
         rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
             RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
             INT_MAX, INT_MAX, &zone_lwps_ops);
 

@@ -2298,10 +2538,15 @@
         rc_zone_max_swap = rctl_register("zone.max-swap",
             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
             &zone_max_swap_ops);
 
+        rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+            &zone_phys_mem_ops);
+
         rc_zone_max_lofi = rctl_register("zone.max-lofi",
             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
             &zone_max_lofi_ops);
 

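Since zone.max-physical-memory is registered like any other zone rctl, the limit is visible through the standard tooling, e.g. "prctl -n zone.max-physical-memory -i zone <zone>". A minimal sketch of reading the limit from inside a zone with getrctl(2) follows; it is illustrative only, with error handling abbreviated:

        #include <rctl.h>
        #include <stdio.h>
        #include <stdlib.h>

        /*
         * Print the first (lowest) zone.max-physical-memory limit value
         * for the caller's zone.
         */
        int
        main(void)
        {
                rctlblk_t *rblk = malloc(rctlblk_size());

                if (rblk == NULL)
                        return (1);
                if (getrctl("zone.max-physical-memory", NULL, rblk,
                    RCTL_FIRST) == -1) {
                        perror("getrctl");
                        return (1);
                }
                (void) printf("limit: %llu bytes\n",
                    (unsigned long long)rctlblk_get_value(rblk));
                return (0);
        }

Note that, as the comment in zone_phys_mem_usage() indicates, this cap is not enforced in the kernel; the rctl exists so that the configured limit and current usage are observable through the usual interfaces while enforcement remains in userland.
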
@@ -2319,10 +2564,12 @@
         zone0.zone_nlwps = p0.p_lwpcnt;
         zone0.zone_nprocs = 1;
         zone0.zone_ntasks = 1;
         mutex_exit(&p0.p_lock);
         zone0.zone_restart_init = B_TRUE;
+        zone0.zone_reboot_on_init_exit = B_FALSE;
+        zone0.zone_init_status = -1;
         zone0.zone_brand = &native_brand;
         rctl_prealloc_destroy(gp);
         /*
          * pool_default hasn't been initialized yet, so we let pool_init()
          * take care of making sure the global zone is in the default pool.

@@ -2398,10 +2645,12 @@
 }
 
 static void
 zone_free(zone_t *zone)
 {
+        zone_dl_t *zdl;
+
         ASSERT(zone != global_zone);
         ASSERT(zone->zone_ntasks == 0);
         ASSERT(zone->zone_nlwps == 0);
         ASSERT(zone->zone_nprocs == 0);
         ASSERT(zone->zone_cred_ref == 0);

@@ -2426,10 +2675,23 @@
         }
 
         list_destroy(&zone->zone_ref_list);
         zone_free_zsd(zone);
         zone_free_datasets(zone);
+
+        /*
+         * While dlmgmtd should have removed all of these, it could have left
+         * something behind or crashed, in which case it's not safe for us to
+         * assume that the list is empty, which list_destroy() will ASSERT. We
+         * clean up for our userland comrades which may have crashed, or worse,
+         * been disabled by SMF.
+         */
+        while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+                if (zdl->zdl_net != NULL)
+                        nvlist_free(zdl->zdl_net);
+                kmem_free(zdl, sizeof (zone_dl_t));
+        }
         list_destroy(&zone->zone_dl_list);
 
         if (zone->zone_rootvp != NULL)
                 VN_RELE(zone->zone_rootvp);
         if (zone->zone_rootpath)

@@ -2561,13 +2823,18 @@
                 mutex_exit(&zone_status_lock);
                 brand_unregister_zone(bp);
                 return (EINVAL);
         }
 
-        /* set up the brand specific data */
+        /*
+         * Set up the brand specific data.
+         * Note that it's possible that the hook has to drop the
+         * zone_status_lock and reacquire it before returning so we can't
+         * assume the lock has been held the entire time.
+         */
         zone->zone_brand = bp;
-        ZBROP(zone)->b_init_brand_data(zone);
+        ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
 
         mutex_exit(&zone_status_lock);
         return (0);
 }
 

@@ -2609,23 +2876,74 @@
         zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
         (void) strcpy(zone->zone_initname, initname);
         return (0);
 }
 
+/*
+ * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+ * to provide the physical memory capping kstats.  Since physical memory
+ * capping is currently implemented in userland, that code uses the setattr
+ * entry point to increment the kstats.  We simply increment zone_mcap_nover
+ * every time zone_set_mcap_nover is called, and we add the input value to
+ * zone_mcap_pagedout every time zone_set_mcap_pageout is called.
+ */
+/*ARGSUSED*/
 static int
-zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
+zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
 {
-        uint64_t mcap;
-        int err = 0;
+        zone->zone_mcap_nover++;
 
-        if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
-                zone->zone_phys_mcap = mcap;
+        return (0);
+}
 
+static int
+zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+{
+        uint64_t pageout;
+        int err;
+
+        if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+                zone->zone_mcap_pagedout += pageout;
+
         return (err);
 }
 
+/*
+ * The zone_set_page_fault_delay function is used to set the number of usecs
+ * to throttle page faults.  This is normally 0 but can be set to a nonzero
+ * value by the user-land memory capping code when the zone is over its
+ * physical memory cap.
+ * memory cap.
+ */
 static int
+zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+{
+        uint32_t dusec;
+        int err;
+
+        if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+                zone->zone_pg_flt_delay = dusec;
+
+        return (err);
+}
+
+/*
+ * The zone_set_rss function is used to set the zone's RSS when we do the
+ * fast, approximate calculation in user-land.
+ */
+static int
+zone_set_rss(zone_t *zone, const uint64_t *prss)
+{
+        uint64_t rss;
+        int err;
+
+        if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+                zone->zone_phys_mem = rss;
+
+        return (err);
+}
+
+static int
 zone_set_sched_class(zone_t *zone, const char *new_class)
 {
         char sched_class[PC_CLNMSZ];
         id_t classid;
         int err;

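These setattr entry points are driven by the userland memory capper rather than by anything in the kernel. The following hypothetical fragment shows how such a capper might use them; ZONE_ATTR_RSS and ZONE_ATTR_PG_FLT_DELAY are the attribute constants handled in zone_setattr() below (their definitions live in sys/zone.h, outside this file), update_zone_mcap() is an invented name, and the 1000-usec delay is an arbitrary example rather than a tuned value:

        #include <zone.h>
        #include <sys/types.h>

        /*
         * Push a freshly computed RSS into the kernel and throttle page
         * faults while the zone remains over its physical memory cap.
         */
        static int
        update_zone_mcap(zoneid_t zid, uint64_t rss, uint64_t cap)
        {
                uint32_t delay_usec = (rss > cap) ? 1000 : 0;

                if (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) != 0)
                        return (-1);
                return (zone_setattr(zid, ZONE_ATTR_PG_FLT_DELAY, &delay_usec,
                    sizeof (delay_usec)));
        }
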
@@ -3771,11 +4089,22 @@
          * For all purposes (ZONE_ATTR_INITPID and restart_init),
          * storing just the pid of init is sufficient.
          */
         z->zone_proc_initpid = p->p_pid;
 
+        if (z->zone_setup_app_contract == B_TRUE) {
+                /*
+                 * Normally a process cannot modify its own contract, but we're
+                 * just starting the zone's init process and its contract is
+                 * always initialized from the sys_process_tmpl template, so
+                 * this is the simplest way to set up init's contract to kill
+                 * the process if any other process in the contract exits.
+                 */
+                p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+        }
+
         /*
          * We maintain zone_boot_err so that we can return the cause of the
          * failure back to the caller of the zone_boot syscall.
          */
         p->p_zone->zone_boot_err = start_init_common();
 

@@ -3799,13 +4128,58 @@
                         mutex_enter(&p->p_lock);
                         ASSERT(p->p_flag & SEXITLWPS);
                         lwp_exit();
                 }
         } else {
+                id_t cid = curthread->t_cid;
+
                 if (zone_status_get(z) == ZONE_IS_BOOTING)
                         zone_status_set(z, ZONE_IS_RUNNING);
                 mutex_exit(&zone_status_lock);
+
+                mutex_enter(&class_lock);
+                ASSERT(cid < loaded_classes);
+                if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+                    z->zone_fixed_hipri) {
+                        /*
+                         * If the zone is using FX then by default all
+                         * processes start at the lowest priority and stay
+                         * there. We provide a mechanism for the zone to
+                         * indicate that it should run at "high priority". In
+                         * this case we setup init to run at the highest FX
+                         * priority (which is one level higher than the
+                         * non-fixed scheduling classes can use).
+                         */
+                        pcparms_t pcparms;
+
+                        pcparms.pc_cid = cid;
+                        ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+                        ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+                            FXMAXUPRI;
+                        ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+                            FX_DOUPRILIM | FX_DOUPRI;
+
+                        mutex_enter(&pidlock);
+                        mutex_enter(&curproc->p_lock);
+
+                        (void) parmsset(&pcparms, curthread);
+
+                        mutex_exit(&curproc->p_lock);
+                        mutex_exit(&pidlock);
+                } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+                        /*
+                         * zsched always starts the init lwp at priority
+                         * minclsyspri - 1. This priority gets set in t_pri and
+                         * is invalid for RT, but RT never uses t_pri. However
+                         * t_pri is used by procfs, so we always see processes
+                         * within an RT zone with an invalid priority value.
+                         * We fix that up now.
+                         */
+                        curthread->t_pri = RTGPPRIO0;
+                }
+                mutex_exit(&class_lock);
+
                 /* cause the process to return to userland. */
                 lwp_rtt();
         }
 }
 

@@ -3843,10 +4217,11 @@
         bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
         bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
         PTOU(pp)->u_argc = 0;
         PTOU(pp)->u_argv = NULL;
         PTOU(pp)->u_envp = NULL;
+        PTOU(pp)->u_commpagep = NULL;
         closeall(P_FINFO(pp));
 
         /*
          * We are this zone's "zsched" process.  As the zone isn't generally
          * visible yet we don't need to grab any locks before initializing its

@@ -4285,12 +4660,13 @@
                 uint_t i, nelem;
                 char *name;
 
                 error = EINVAL;
                 name = nvpair_name(nvp);
-                if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
-                    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
+                if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
+                    strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
+                    nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
                         goto out;
                 }
                 if ((hndl = rctl_hndl_lookup(name)) == -1) {
                         goto out;
                 }

@@ -4434,10 +4810,12 @@
         zone->zone_pool_mod = gethrtime();
         zone->zone_psetid = ZONE_PS_INVAL;
         zone->zone_ncpus = 0;
         zone->zone_ncpus_online = 0;
         zone->zone_restart_init = B_TRUE;
+        zone->zone_reboot_on_init_exit = B_FALSE;
+        zone->zone_init_status = -1;
         zone->zone_brand = &native_brand;
         zone->zone_initname = NULL;
         mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);

@@ -4495,14 +4873,17 @@
         zone->zone_nprocs_ctl = INT_MAX;
         zone->zone_locked_mem = 0;
         zone->zone_locked_mem_ctl = UINT64_MAX;
         zone->zone_max_swap = 0;
         zone->zone_max_swap_ctl = UINT64_MAX;
+        zone->zone_phys_mem = 0;
+        zone->zone_phys_mem_ctl = UINT64_MAX;
         zone->zone_max_lofi = 0;
         zone->zone_max_lofi_ctl = UINT64_MAX;
         zone->zone_lockedmem_kstat = NULL;
         zone->zone_swapresv_kstat = NULL;
+        zone->zone_physmem_kstat = NULL;
         zone->zone_zfs_io_pri = 1;
 
         /*
          * Zsched initializes the rctls.
          */

@@ -4654,12 +5035,12 @@
         zarg.zone = zone;
         zarg.nvlist = rctls;
         /*
          * The process, task, and project rctls are probably wrong;
          * we need an interface to get the default values of all rctls,
-         * and initialize zsched appropriately.  I'm not sure that that
-         * makes much of a difference, though.
+         * and initialize zsched appropriately. However, we allow zoneadmd
+         * to pass down both zone and project rctls for the zone's init.
          */
         error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
         if (error != 0) {
                 /*
                  * We need to undo all globally visible state.

@@ -5555,18 +5936,10 @@
                         err = copyoutstr(outstr, buf, bufsize, NULL);
                         if (err != 0 && err != ENAMETOOLONG)
                                 error = EFAULT;
                 }
                 break;
-        case ZONE_ATTR_PHYS_MCAP:
-                size = sizeof (zone->zone_phys_mcap);
-                if (bufsize > size)
-                        bufsize = size;
-                if (buf != NULL &&
-                    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
-                        error = EFAULT;
-                break;
         case ZONE_ATTR_SCHED_CLASS:
                 mutex_enter(&class_lock);
 
                 if (zone->zone_defaultcid >= loaded_classes)
                         outstr = "";

@@ -5617,10 +5990,19 @@
                         if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
                                 error = EFAULT;
                 }
                 kmem_free(zbuf, bufsize);
                 break;
+        case ZONE_ATTR_SCHED_FIXEDHI:
+                size = sizeof (boolean_t);
+                if (bufsize > size)
+                        bufsize = size;
+
+                if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+                    bufsize) != 0)
+                        error = EFAULT;
+                break;
         default:
                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
                         size = bufsize;
                         error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
                 } else {

@@ -5648,14 +6030,15 @@
 
         if (secpolicy_zone_config(CRED()) != 0)
                 return (set_errno(EPERM));
 
         /*
-         * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
-         * global zone.
+         * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
+         * attributes can be set on the global zone.
          */
-        if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
+        if (zoneid == GLOBAL_ZONEID &&
+            attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
                 return (set_errno(EINVAL));
         }
 
         mutex_enter(&zonehash_lock);
         if ((zone = zone_find_all_by_id(zoneid)) == NULL) {

@@ -5668,11 +6051,13 @@
         /*
          * At present most attributes can only be set on non-running,
          * non-global zones.
          */
         zone_status = zone_status_get(zone);
-        if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+        if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
+            attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
+            zone_status > ZONE_IS_READY) {
                 err = EINVAL;
                 goto done;
         }
 
         switch (attr) {

@@ -5690,13 +6075,22 @@
                 err = zone_set_brand(zone, (const char *)buf);
                 break;
         case ZONE_ATTR_FS_ALLOWED:
                 err = zone_set_fs_allowed(zone, (const char *)buf);
                 break;
-        case ZONE_ATTR_PHYS_MCAP:
-                err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
+        case ZONE_ATTR_PMCAP_NOVER:
+                err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
                 break;
+        case ZONE_ATTR_PMCAP_PAGEOUT:
+                err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+                break;
+        case ZONE_ATTR_PG_FLT_DELAY:
+                err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+                break;
+        case ZONE_ATTR_RSS:
+                err = zone_set_rss(zone, (const uint64_t *)buf);
+                break;
         case ZONE_ATTR_SCHED_CLASS:
                 err = zone_set_sched_class(zone, (const char *)buf);
                 break;
         case ZONE_ATTR_HOSTID:
                 if (bufsize == sizeof (zone->zone_hostid)) {

@@ -5720,10 +6114,26 @@
                         break;
                 }
                 err = zone_set_network(zoneid, zbuf);
                 kmem_free(zbuf, bufsize);
                 break;
+        case ZONE_ATTR_APP_SVC_CT:
+                if (bufsize != sizeof (boolean_t)) {
+                        err = EINVAL;
+                } else {
+                        zone->zone_setup_app_contract = (boolean_t)buf;
+                        err = 0;
+                }
+                break;
+        case ZONE_ATTR_SCHED_FIXEDHI:
+                if (bufsize != sizeof (boolean_t)) {
+                        err = EINVAL;
+                } else {
+                        zone->zone_fixed_hipri = (boolean_t)buf;
+                        err = 0;
+                }
+                break;
         default:
                 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
                         err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
                 else
                         err = EINVAL;

@@ -6523,10 +6933,11 @@
         zone_namelen = strlen(zone->zone_name) + 1;
         zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
         bcopy(zone->zone_name, zone_name, zone_namelen);
         zoneid = zone->zone_id;
         uniqid = zone->zone_uniqid;
+        arg.status = zone->zone_init_status;
         /*
          * zoneadmd may be down, but at least we can empty out the zone.
          * We can ignore the return value of zone_empty() since we're called
          * from a kernel thread and know we won't be delivered any signals.
          */

@@ -6763,20 +7174,19 @@
         mutex_exit(&zone_status_lock);
         mutex_exit(&zonehash_lock);
 }
 
 /*
- * Returns true if the named dataset is visible in the current zone.
+ * Returns true if the named dataset is visible in the specified zone.
  * The 'write' parameter is set to 1 if the dataset is also writable.
  */
 int
-zone_dataset_visible(const char *dataset, int *write)
+zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
 {
         static int zfstype = -1;
         zone_dataset_t *zd;
         size_t len;
-        zone_t *zone = curproc->p_zone;
         const char *name = NULL;
         vfs_t *vfsp = NULL;
 
         if (dataset[0] == '\0')
                 return (0);

@@ -6840,11 +7250,12 @@
         }
 
         vfs_list_read_lock();
         vfsp = zone->zone_vfslist;
         do {
-                ASSERT(vfsp);
+                if (vfsp == NULL)
+                        break;
                 if (vfsp->vfs_fstype == zfstype) {
                         name = refstr_value(vfsp->vfs_resource);
 
                         /*
                          * Check if we have an exact match.

@@ -6876,10 +7287,22 @@
 
         vfs_list_unlock();
         return (0);
 }
 
+/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+        zone_t *zone = curproc->p_zone;
+
+        return (zone_dataset_visible_inzone(zone, dataset, write));
+}
+
 /*
  * zone_find_by_any_path() -
  *
  * kernel-private routine similar to zone_find_by_path(), but which
  * effectively compares against zone paths rather than zonerootpath