Remove most KEBE comments and accompanying unused code or variables/fields.
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Mismerge in zone.c frees stats that aren't there.
Mismerge in zone.c creates zone mcap kstats too many times
OS-338 Kstat counters to show "slow" VFS operations
OS-5189 lx dev enumeration can deadlock with zfs
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5187 improve /proc/diskstat handling
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5179 flatten zvol entries for /dev and /proc/partitions
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Undo merge damage from zone kstats
OS-4915 want FX high priority zone configuration option
OS-4925 ps pri shows misleading value for zone in RT scheduling class
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4017 would like zfs-io-priority values > 1024
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3776 project rctls should be in sync with zone rctls
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
OS-11 rcapd behaves poorly when under extreme load
OS-399 zone phys. mem. cap should be a rctl and have associated kstat
        
*** 19,29 ****
   * CDDL HEADER END
   */
  
  /*
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2015, Joyent Inc. All rights reserved.
   */
  
  /*
   * Zones
   *
--- 19,29 ----
   * CDDL HEADER END
   */
  
  /*
   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2016, Joyent Inc.
   */
  
  /*
   * Zones
   *
*** 248,257 ****
--- 248,259 ----
  #include <sys/zone.h>
  #include <net/if.h>
  #include <sys/cpucaps.h>
  #include <vm/seg.h>
  #include <sys/mac.h>
+ #include <sys/rt.h>
+ #include <sys/fx.h>
  
  /*
   * This constant specifies the number of seconds that threads waiting for
   * subsystems to release a zone's general-purpose references will wait before
   * they log the zone's reference counts.  The constant's value shouldn't
*** 368,377 ****
--- 370,380 ----
   * This isn't static so lint doesn't complain.
   */
  rctl_hndl_t rc_zone_cpu_shares;
  rctl_hndl_t rc_zone_locked_mem;
  rctl_hndl_t rc_zone_max_swap;
+ rctl_hndl_t rc_zone_phys_mem;
  rctl_hndl_t rc_zone_max_lofi;
  rctl_hndl_t rc_zone_cpu_cap;
  rctl_hndl_t rc_zone_zfs_io_pri;
  rctl_hndl_t rc_zone_nlwps;
  rctl_hndl_t rc_zone_nprocs;
*** 1740,1749 ****
--- 1743,1785 ----
          zone_max_swap_test
  };
  
  /*ARGSUSED*/
  static rctl_qty_t
+ zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
+ {
+         rctl_qty_t q;
+         zone_t *z = p->p_zone;
+ 
+         ASSERT(MUTEX_HELD(&p->p_lock));
+         /* No additional lock because not enforced in the kernel */
+         q = z->zone_phys_mem;
+         return (q);
+ }
+ 
+ /*ARGSUSED*/
+ static int
+ zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+     rctl_qty_t nv)
+ {
+         ASSERT(MUTEX_HELD(&p->p_lock));
+         ASSERT(e->rcep_t == RCENTITY_ZONE);
+         if (e->rcep_p.zone == NULL)
+                 return (0);
+         e->rcep_p.zone->zone_phys_mem_ctl = nv;
+         return (0);
+ }
+ 
+ static rctl_ops_t zone_phys_mem_ops = {
+         rcop_no_action,
+         zone_phys_mem_usage,
+         zone_phys_mem_set,
+         rcop_no_test
+ };
+ 
+ /*ARGSUSED*/
+ static rctl_qty_t
  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
  {
          rctl_qty_t q;
          zone_t *z = p->p_zone;
  
*** 1833,1842 ****
--- 1869,1892 ----
          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
          return (0);
  }
  
  static int
+ zone_physmem_kstat_update(kstat_t *ksp, int rw)
+ {
+         zone_t *zone = ksp->ks_private;
+         zone_kstat_t *zk = ksp->ks_data;
+ 
+         if (rw == KSTAT_WRITE)
+                 return (EACCES);
+ 
+         zk->zk_usage.value.ui64 = zone->zone_phys_mem;
+         zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
+         return (0);
+ }
+ 
+ static int
  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
  {
          zone_t *zone = ksp->ks_private;
          zone_kstat_t *zk = ksp->ks_data;
  
*** 1886,1910 ****
--- 1936,2122 ----
          ksp->ks_private = zone;
          kstat_install(ksp);
          return (ksp);
  }
  
+ static int
+ zone_vfs_kstat_update(kstat_t *ksp, int rw)
+ {
+         zone_t *zone = ksp->ks_private;
+         zone_vfs_kstat_t *zvp = ksp->ks_data;
+         kstat_io_t *kiop = &zone->zone_vfs_rwstats;
  
+         if (rw == KSTAT_WRITE)
+                 return (EACCES);
+ 
+         /*
+          * Extract the VFS statistics from the kstat_io_t structure used by
+          * kstat_runq_enter() and related functions.  Since the slow ops
+          * counters are updated directly by the VFS layer, there's no need to
+          * copy those statistics here.
+          *
+          * Note that kstat_runq_enter() and the related functions use
+          * gethrtime_unscaled(), so scale the time here.
+          */
+         zvp->zv_nread.value.ui64 = kiop->nread;
+         zvp->zv_reads.value.ui64 = kiop->reads;
+         zvp->zv_rtime.value.ui64 = kiop->rtime;
+         zvp->zv_rcnt.value.ui64 = kiop->rcnt;
+         zvp->zv_rlentime.value.ui64 = kiop->rlentime;
+         zvp->zv_nwritten.value.ui64 = kiop->nwritten;
+         zvp->zv_writes.value.ui64 = kiop->writes;
+         zvp->zv_wtime.value.ui64 = kiop->wtime;
+         zvp->zv_wcnt.value.ui64 = kiop->wcnt;
+         zvp->zv_wlentime.value.ui64 = kiop->wlentime;
+ 
+         scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
+         scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
+         scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
+         scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
+ 
+         return (0);
+ }
+ 
+ static kstat_t *
+ zone_vfs_kstat_create(zone_t *zone)
+ {
+         kstat_t *ksp;
+         zone_vfs_kstat_t *zvp;
+ 
+         if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
+             zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
+             sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
+             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+                 return (NULL);
+ 
+         if (zone->zone_id != GLOBAL_ZONEID)
+                 kstat_zone_add(ksp, GLOBAL_ZONEID);
+ 
+         zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
+         ksp->ks_data_size += strlen(zone->zone_name) + 1;
+         ksp->ks_lock = &zone->zone_vfs_lock;
+         zone->zone_vfs_stats = zvp;
+ 
+         /* The kstat "name" field is not large enough for a full zonename */
+         kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
+         kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
+         kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
+         kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
+ 
+         ksp->ks_update = zone_vfs_kstat_update;
+         ksp->ks_private = zone;
+ 
+         kstat_install(ksp);
+         return (ksp);
+ }
+ 
  static int
+ zone_zfs_kstat_update(kstat_t *ksp, int rw)
+ {
+         zone_t *zone = ksp->ks_private;
+         zone_zfs_kstat_t *zzp = ksp->ks_data;
+         kstat_io_t *kiop = &zone->zone_zfs_rwstats;
+ 
+         if (rw == KSTAT_WRITE)
+                 return (EACCES);
+ 
+         /*
+          * Extract the ZFS statistics from the kstat_io_t structure used by
+          * kstat_runq_enter() and related functions.  Since the I/O throttle
+          * counters are updated directly by the ZFS layer, there's no need to
+          * copy those statistics here.
+          *
+          * Note that kstat_runq_enter() and the related functions use
+          * gethrtime_unscaled(), so scale the time here.
+          */
+         zzp->zz_nread.value.ui64 = kiop->nread;
+         zzp->zz_reads.value.ui64 = kiop->reads;
+         zzp->zz_rtime.value.ui64 = kiop->rtime;
+         zzp->zz_rlentime.value.ui64 = kiop->rlentime;
+         zzp->zz_nwritten.value.ui64 = kiop->nwritten;
+         zzp->zz_writes.value.ui64 = kiop->writes;
+ 
+         scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
+         scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
+ 
+         return (0);
+ }
+ 
+ static kstat_t *
+ zone_zfs_kstat_create(zone_t *zone)
+ {
+         kstat_t *ksp;
+         zone_zfs_kstat_t *zzp;
+ 
+         if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
+             zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
+             sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
+             KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
+                 return (NULL);
+ 
+         if (zone->zone_id != GLOBAL_ZONEID)
+                 kstat_zone_add(ksp, GLOBAL_ZONEID);
+ 
+         zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
+         ksp->ks_data_size += strlen(zone->zone_name) + 1;
+         ksp->ks_lock = &zone->zone_zfs_lock;
+         zone->zone_zfs_stats = zzp;
+ 
+         /* The kstat "name" field is not large enough for a full zonename */
+         kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
+         kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
+         kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
+         kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
+         kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
+         kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
+         kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
+         kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
+         kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
+ 
+         ksp->ks_update = zone_zfs_kstat_update;
+         ksp->ks_private = zone;
+ 
+         kstat_install(ksp);
+         return (ksp);
+ }
+ 
+ static int
  zone_mcap_kstat_update(kstat_t *ksp, int rw)
  {
          zone_t *zone = ksp->ks_private;
          zone_mcap_kstat_t *zmp = ksp->ks_data;
  
          if (rw == KSTAT_WRITE)
                  return (EACCES);
  
+         zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
+         zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
+         zmp->zm_swap.value.ui64 = zone->zone_max_swap;
+         zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
+         zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
+         zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
+         zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
+         zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
  
          return (0);
  }
  
  static kstat_t *
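
For consumers of the new zone_vfs kstats, a minimal libkstat reader might
look like the sketch below (assuming only the stock kstat_open()/
kstat_lookup()/kstat_read() interfaces, linked with -lkstat; the module,
class, and statistic names come from the kstat_named_init() calls above,
and the program itself is hypothetical, not part of this change):

	#include <kstat.h>
	#include <stdio.h>

	int
	main(int argc, char *argv[])
	{
		kstat_ctl_t *kc;
		kstat_t *ksp;
		kstat_named_t *kn;

		if (argc != 2) {
			(void) fprintf(stderr, "usage: %s <zonename>\n",
			    argv[0]);
			return (1);
		}
		if ((kc = kstat_open()) == NULL) {
			perror("kstat_open");
			return (1);
		}
		/* Instance -1 is a wildcard; the kstat is named for the zone. */
		if ((ksp = kstat_lookup(kc, "zone_vfs", -1, argv[1])) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) fprintf(stderr, "no zone_vfs kstat for %s\n",
			    argv[1]);
			return (1);
		}
		if ((kn = kstat_data_lookup(ksp, "1s_ops")) != NULL)
			(void) printf("1s_ops = %llu\n",
			    (u_longlong_t)kn->value.ui64);
		(void) kstat_close(kc);
		return (0);
	}

The same pattern reads the zone_zfs and zone_mcap kstats; only the module
and statistic names differ.  The separate "zonename" named statistic exists
because the kstat name field may truncate long zone names.
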
*** 1928,1943 ****
--- 2140,2166 ----
          zone->zone_mcap_stats = zmp;
  
          /* The kstat "name" field is not large enough for a full zonename */
          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
+         kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
              KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
+             KSTAT_DATA_UINT64);
+         kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
+             KSTAT_DATA_UINT64);
  
          ksp->ks_update = zone_mcap_kstat_update;
          ksp->ks_private = zone;
  
          kstat_install(ksp);
*** 2033,2054 ****
--- 2256,2285 ----
  {
          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
              "lockedmem", zone_lockedmem_kstat_update);
          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
              "swapresv", zone_swapresv_kstat_update);
+         zone->zone_physmem_kstat = zone_kstat_create_common(zone,
+             "physicalmem", zone_physmem_kstat_update);
          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
              "nprocs", zone_nprocs_kstat_update);
  
+         if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
+                 zone->zone_vfs_stats = kmem_zalloc(
+                     sizeof (zone_vfs_kstat_t), KM_SLEEP);
+         }
+ 
          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
                  zone->zone_mcap_stats = kmem_zalloc(
                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
          }
  
          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
                  zone->zone_misc_stats = kmem_zalloc(
                      sizeof (zone_misc_kstat_t), KM_SLEEP);
          }
+ 
  }
  
  static void
  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
  {
*** 2067,2082 ****
--- 2298,2319 ----
  {
          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
              sizeof (zone_kstat_t));
          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
              sizeof (zone_kstat_t));
+         zone_kstat_delete_common(&zone->zone_physmem_kstat,
+             sizeof (zone_kstat_t));
          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
              sizeof (zone_kstat_t));
+ 
+         zone_kstat_delete_common(&zone->zone_vfs_ksp,
+             sizeof (zone_vfs_kstat_t));
          zone_kstat_delete_common(&zone->zone_mcap_ksp,
              sizeof (zone_mcap_kstat_t));
          zone_kstat_delete_common(&zone->zone_misc_ksp,
              sizeof (zone_misc_kstat_t));
+ 
  }
  
  /*
   * Called very early on in boot to initialize the ZSD list so that
   * zone_key_create() can be called before zone_init().  It also initializes
*** 2106,2115 ****
--- 2343,2354 ----
          zone0.zone_nprocs_ctl = INT_MAX;
          zone0.zone_locked_mem = 0;
          zone0.zone_locked_mem_ctl = UINT64_MAX;
          ASSERT(zone0.zone_max_swap == 0);
          zone0.zone_max_swap_ctl = UINT64_MAX;
+         zone0.zone_phys_mem = 0;
+         zone0.zone_phys_mem_ctl = UINT64_MAX;
          zone0.zone_max_lofi = 0;
          zone0.zone_max_lofi_ctl = UINT64_MAX;
          zone0.zone_shmmax = 0;
          zone0.zone_ipc.ipcq_shmmni = 0;
          zone0.zone_ipc.ipcq_semmni = 0;
*** 2129,2138 ****
--- 2368,2378 ----
          zone0.zone_ncpus_online = 0;
          zone0.zone_proc_initpid = 1;
          zone0.zone_initname = initname;
          zone0.zone_lockedmem_kstat = NULL;
          zone0.zone_swapresv_kstat = NULL;
+         zone0.zone_physmem_kstat = NULL;
          zone0.zone_nprocs_kstat = NULL;
          zone0.zone_zfs_io_pri = 1;
  
          zone0.zone_stime = 0;
          zone0.zone_utime = 0;
*** 2245,2255 ****
              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
  
          rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
!             1024, 1024, &zone_zfs_io_pri_ops);
  
          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
              INT_MAX, INT_MAX, &zone_lwps_ops);
  
--- 2485,2495 ----
              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
  
          rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
!             16384, 16384, &zone_zfs_io_pri_ops);
  
          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
              INT_MAX, INT_MAX, &zone_lwps_ops);
  
*** 2298,2307 ****
--- 2538,2552 ----
          rc_zone_max_swap = rctl_register("zone.max-swap",
              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
              &zone_max_swap_ops);
  
+         rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
+             RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
+             RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
+             &zone_phys_mem_ops);
+ 
          rc_zone_max_lofi = rctl_register("zone.max-lofi",
              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
              &zone_max_lofi_ops);
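
Since zone.max-physical-memory is registered with rcop_no_action and
rcop_no_test, the kernel only tracks the value; enforcement remains in
userland.  From inside a zone the cap can be inspected with prctl(1)
(prctl -n zone.max-physical-memory -i zone <zonename>) or via getrctl(2),
roughly as in this hedged sketch (the program is illustrative only):

	#include <sys/types.h>
	#include <rctl.h>
	#include <stdio.h>
	#include <stdlib.h>

	int
	main(void)
	{
		rctlblk_t *rblk;

		/* rctl blocks are opaque; allocate at the advertised size. */
		if ((rblk = malloc(rctlblk_size())) == NULL) {
			perror("malloc");
			return (1);
		}
		if (getrctl("zone.max-physical-memory", NULL, rblk,
		    RCTL_FIRST) == -1) {
			perror("getrctl");
			return (1);
		}
		(void) printf("zone.max-physical-memory = %llu\n",
		    (u_longlong_t)rctlblk_get_value(rblk));
		free(rblk);
		return (0);
	}
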
  
*** 2319,2328 ****
--- 2564,2575 ----
          zone0.zone_nlwps = p0.p_lwpcnt;
          zone0.zone_nprocs = 1;
          zone0.zone_ntasks = 1;
          mutex_exit(&p0.p_lock);
          zone0.zone_restart_init = B_TRUE;
+         zone0.zone_reboot_on_init_exit = B_FALSE;
+         zone0.zone_init_status = -1;
          zone0.zone_brand = &native_brand;
          rctl_prealloc_destroy(gp);
          /*
           * pool_default hasn't been initialized yet, so we let pool_init()
           * take care of making sure the global zone is in the default pool.
*** 2398,2407 ****
--- 2645,2656 ----
  }
  
  static void
  zone_free(zone_t *zone)
  {
+         zone_dl_t *zdl;
+ 
          ASSERT(zone != global_zone);
          ASSERT(zone->zone_ntasks == 0);
          ASSERT(zone->zone_nlwps == 0);
          ASSERT(zone->zone_nprocs == 0);
          ASSERT(zone->zone_cred_ref == 0);
*** 2426,2435 ****
--- 2675,2697 ----
          }
  
          list_destroy(&zone->zone_ref_list);
          zone_free_zsd(zone);
          zone_free_datasets(zone);
+ 
+         /*
+          * While dlmgmtd should have removed all of these, it could have left
+          * something behind or crashed, in which case it's not safe for us to
+          * assume that the list is empty (list_destroy() ASSERTs that it is).
+          * We clean up for our userland comrades, which may have crashed or,
+          * worse, been disabled by SMF.
+          */
+         while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
+                 if (zdl->zdl_net != NULL)
+                         nvlist_free(zdl->zdl_net);
+                 kmem_free(zdl, sizeof (zone_dl_t));
+         }
          list_destroy(&zone->zone_dl_list);
  
          if (zone->zone_rootvp != NULL)
                  VN_RELE(zone->zone_rootvp);
          if (zone->zone_rootpath)
*** 2561,2573 ****
                  mutex_exit(&zone_status_lock);
                  brand_unregister_zone(bp);
                  return (EINVAL);
          }
  
!         /* set up the brand specific data */
          zone->zone_brand = bp;
!         ZBROP(zone)->b_init_brand_data(zone);
  
          mutex_exit(&zone_status_lock);
          return (0);
  }
  
--- 2823,2840 ----
                  mutex_exit(&zone_status_lock);
                  brand_unregister_zone(bp);
                  return (EINVAL);
          }
  
!         /*
!          * Set up the brand specific data.
!          * Note that it's possible that the hook has to drop the
!          * zone_status_lock and reacquire it before returning so we can't
!          * assume the lock has been held the entire time.
!          */
          zone->zone_brand = bp;
!         ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
  
          mutex_exit(&zone_status_lock);
          return (0);
  }
  
*** 2609,2631 ****
          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
          (void) strcpy(zone->zone_initname, initname);
          return (0);
  }
  
  static int
! zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
  {
!         uint64_t mcap;
!         int err = 0;
  
!         if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
!                 zone->zone_phys_mcap = mcap;
  
          return (err);
  }
  
  static int
  zone_set_sched_class(zone_t *zone, const char *new_class)
  {
          char sched_class[PC_CLNMSZ];
          id_t classid;
          int err;
--- 2876,2949 ----
          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
          (void) strcpy(zone->zone_initname, initname);
          return (0);
  }
  
+ /*
+  * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
+  * to provide the physical memory capping kstats.  Since physical memory
+  * capping is currently implemented in userland, that code uses the setattr
+  * entry point to increment the kstats.  We simply increment zone_mcap_nover
+  * every time the nover attribute is set, and we add the input value to
+  * zone_mcap_pagedout every time the pageout attribute is set.
+  */
+ /*ARGSUSED*/
  static int
! zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
  {
!         zone->zone_mcap_nover++;
  
!         return (0);
! }
  
+ static int
+ zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
+ {
+         uint64_t pageout;
+         int err;
+ 
+         if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
+                 zone->zone_mcap_pagedout += pageout;
+ 
          return (err);
  }
  
+ /*
+  * The zone_set_page_fault_delay function is used to set the number of usecs
+  * to throttle page faults.  This is normally 0 but can be set to a non-0 value
+  * by the user-land memory capping code when the zone is over its physical
+  * memory cap.
+  */
  static int
+ zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
+ {
+         uint32_t dusec;
+         int err;
+ 
+         if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
+                 zone->zone_pg_flt_delay = dusec;
+ 
+         return (err);
+ }
+ 
+ /*
+  * The zone_set_rss function is used to set the zone's RSS when we do the
+  * fast, approximate calculation in user-land.
+  */
+ static int
+ zone_set_rss(zone_t *zone, const uint64_t *prss)
+ {
+         uint64_t rss;
+         int err;
+ 
+         if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
+                 zone->zone_phys_mem = rss;
+ 
+         return (err);
+ }
+ 
+ static int
  zone_set_sched_class(zone_t *zone, const char *new_class)
  {
          char sched_class[PC_CLNMSZ];
          id_t classid;
          int err;
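
Taken together, these hooks let the userland capper publish its view of
the zone cheaply: it pushes the computed RSS down with ZONE_ATTR_RSS,
reports pageout progress with ZONE_ATTR_PMCAP_PAGEOUT, and bumps the
over-cap counter with ZONE_ATTR_PMCAP_NOVER (whose buffer the kernel
ignores).  A hedged sketch of the reporting step, assuming the private
zone_setattr(2) wrapper declared in <sys/zone.h> (the helper name and
values are hypothetical):

	#include <sys/types.h>
	#include <sys/zone.h>
	#include <zone.h>

	static int
	report_usage(zoneid_t zid, uint64_t rss, uint64_t pagedout)
	{
		uint64_t dummy = 0;

		/* Publish the approximate RSS computed in userland. */
		if (zone_setattr(zid, ZONE_ATTR_RSS, &rss, sizeof (rss)) == -1)
			return (-1);
		/* Accumulate pages paged out by the capper. */
		if (pagedout > 0 && zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT,
		    &pagedout, sizeof (pagedout)) == -1)
			return (-1);
		/* Record one over-cap event; the buffer itself is unused. */
		return (zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &dummy,
		    sizeof (dummy)));
	}
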
*** 3771,3781 ****
--- 4089,4110 ----
           * For all purposes (ZONE_ATTR_INITPID and restart_init),
           * storing just the pid of init is sufficient.
           */
          z->zone_proc_initpid = p->p_pid;
  
+         if (z->zone_setup_app_contract == B_TRUE) {
                  /*
+                  * Normally a process cannot modify its own contract, but we're
+                  * just starting the zone's init process and its contract is
+                  * always initialized from the sys_process_tmpl template, so
+                  * this is the simplest way to set up init's contract to kill
+                  * the process if any other process in the contract exits.
+                  */
+                 p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
+         }
+ 
+         /*
           * We maintain zone_boot_err so that we can return the cause of the
           * failure back to the caller of the zone_boot syscall.
           */
          p->p_zone->zone_boot_err = start_init_common();
  
*** 3799,3811 ****
--- 4128,4185 ----
                          mutex_enter(&p->p_lock);
                          ASSERT(p->p_flag & SEXITLWPS);
                          lwp_exit();
                  }
          } else {
+                 id_t cid = curthread->t_cid;
+ 
                  if (zone_status_get(z) == ZONE_IS_BOOTING)
                          zone_status_set(z, ZONE_IS_RUNNING);
                  mutex_exit(&zone_status_lock);
+ 
+                 mutex_enter(&class_lock);
+                 ASSERT(cid < loaded_classes);
+                 if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
+                     z->zone_fixed_hipri) {
+                         /*
+                          * If the zone is using FX then by default all
+                          * processes start at the lowest priority and stay
+                          * there. We provide a mechanism for the zone to
+                          * indicate that it should run at "high priority". In
+                          * this case we set up init to run at the highest FX
+                          * priority (which is one level higher than the
+                          * non-fixed scheduling classes can use).
+                          */
+                         pcparms_t pcparms;
+ 
+                         pcparms.pc_cid = cid;
+                         ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
+                         ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
+                             FXMAXUPRI;
+                         ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
+                             FX_DOUPRILIM | FX_DOUPRI;
+ 
+                         mutex_enter(&pidlock);
+                         mutex_enter(&curproc->p_lock);
+ 
+                         (void) parmsset(&pcparms, curthread);
+ 
+                         mutex_exit(&curproc->p_lock);
+                         mutex_exit(&pidlock);
+                 } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
+                         /*
+                          * zsched always starts the init lwp at priority
+                          * minclsyspri - 1. This priority gets set in t_pri and
+                          * is invalid for RT, but RT never uses t_pri. However
+                          * t_pri is used by procfs, so we always see processes
+                          * within an RT zone with an invalid priority value.
+                          * We fix that up now.
+                          */
+                         curthread->t_pri = RTGPPRIO0;
+                 }
+                 mutex_exit(&class_lock);
+ 
                  /* cause the process to return to userland. */
                  lwp_rtt();
          }
  }
  
*** 3843,3852 ****
--- 4217,4227 ----
          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
          PTOU(pp)->u_argc = 0;
          PTOU(pp)->u_argv = NULL;
          PTOU(pp)->u_envp = NULL;
+         PTOU(pp)->u_commpagep = NULL;
          closeall(P_FINFO(pp));
  
          /*
           * We are this zone's "zsched" process.  As the zone isn't generally
           * visible yet we don't need to grab any locks before initializing its
*** 4285,4296 ****
                  uint_t i, nelem;
                  char *name;
  
                  error = EINVAL;
                  name = nvpair_name(nvp);
!                 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
!                     != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
                          goto out;
                  }
                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
                          goto out;
                  }
--- 4660,4672 ----
                  uint_t i, nelem;
                  char *name;
  
                  error = EINVAL;
                  name = nvpair_name(nvp);
!                 if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
!                     strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
!                     nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
                          goto out;
                  }
                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
                          goto out;
                  }
*** 4434,4443 ****
--- 4810,4821 ----
          zone->zone_pool_mod = gethrtime();
          zone->zone_psetid = ZONE_PS_INVAL;
          zone->zone_ncpus = 0;
          zone->zone_ncpus_online = 0;
          zone->zone_restart_init = B_TRUE;
+         zone->zone_reboot_on_init_exit = B_FALSE;
+         zone->zone_init_status = -1;
          zone->zone_brand = &native_brand;
          zone->zone_initname = NULL;
          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
*** 4495,4508 ****
--- 4873,4889 ----
          zone->zone_nprocs_ctl = INT_MAX;
          zone->zone_locked_mem = 0;
          zone->zone_locked_mem_ctl = UINT64_MAX;
          zone->zone_max_swap = 0;
          zone->zone_max_swap_ctl = UINT64_MAX;
+         zone->zone_phys_mem = 0;
+         zone->zone_phys_mem_ctl = UINT64_MAX;
          zone->zone_max_lofi = 0;
          zone->zone_max_lofi_ctl = UINT64_MAX;
          zone->zone_lockedmem_kstat = NULL;
          zone->zone_swapresv_kstat = NULL;
+         zone->zone_physmem_kstat = NULL;
          zone->zone_zfs_io_pri = 1;
  
          /*
           * Zsched initializes the rctls.
           */
*** 4654,4665 ****
          zarg.zone = zone;
          zarg.nvlist = rctls;
          /*
           * The process, task, and project rctls are probably wrong;
           * we need an interface to get the default values of all rctls,
!          * and initialize zsched appropriately.  I'm not sure that that
!          * makes much of a difference, though.
           */
          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
          if (error != 0) {
                  /*
                   * We need to undo all globally visible state.
--- 5035,5046 ----
          zarg.zone = zone;
          zarg.nvlist = rctls;
          /*
           * The process, task, and project rctls are probably wrong;
           * we need an interface to get the default values of all rctls,
!          * and initialize zsched appropriately. However, we allow zoneadmd
!          * to pass down both zone and project rctls for the zone's init.
           */
          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
          if (error != 0) {
                  /*
                   * We need to undo all globally visible state.
*** 5555,5572 ****
                          err = copyoutstr(outstr, buf, bufsize, NULL);
                          if (err != 0 && err != ENAMETOOLONG)
                                  error = EFAULT;
                  }
                  break;
-         case ZONE_ATTR_PHYS_MCAP:
-                 size = sizeof (zone->zone_phys_mcap);
-                 if (bufsize > size)
-                         bufsize = size;
-                 if (buf != NULL &&
-                     copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
-                         error = EFAULT;
-                 break;
          case ZONE_ATTR_SCHED_CLASS:
                  mutex_enter(&class_lock);
  
                  if (zone->zone_defaultcid >= loaded_classes)
                          outstr = "";
--- 5936,5945 ----
*** 5617,5626 ****
--- 5990,6008 ----
                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
                                  error = EFAULT;
                  }
                  kmem_free(zbuf, bufsize);
                  break;
+         case ZONE_ATTR_SCHED_FIXEDHI:
+                 size = sizeof (boolean_t);
+                 if (bufsize > size)
+                         bufsize = size;
+ 
+                 if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
+                     bufsize) != 0)
+                         error = EFAULT;
+                 break;
          default:
                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
                          size = bufsize;
                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
                  } else {
*** 5648,5661 ****
  
          if (secpolicy_zone_config(CRED()) != 0)
                  return (set_errno(EPERM));
  
          /*
!          * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
!          * global zone.
           */
!         if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
                  return (set_errno(EINVAL));
          }
  
          mutex_enter(&zonehash_lock);
          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
--- 6030,6044 ----
  
          if (secpolicy_zone_config(CRED()) != 0)
                  return (set_errno(EPERM));
  
          /*
!          * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
!          * attributes can be set on the global zone.
           */
!         if (zoneid == GLOBAL_ZONEID &&
!             attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
                  return (set_errno(EINVAL));
          }
  
          mutex_enter(&zonehash_lock);
          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
*** 5668,5678 ****
          /*
           * At present most attributes can only be set on non-running,
           * non-global zones.
           */
          zone_status = zone_status_get(zone);
!         if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
                  err = EINVAL;
                  goto done;
          }
  
          switch (attr) {
--- 6051,6063 ----
          /*
           * At present most attributes can only be set on non-running,
           * non-global zones.
           */
          zone_status = zone_status_get(zone);
!         if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
!             attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
!             zone_status > ZONE_IS_READY) {
                  err = EINVAL;
                  goto done;
          }
  
          switch (attr) {
*** 5690,5702 ****
                  err = zone_set_brand(zone, (const char *)buf);
                  break;
          case ZONE_ATTR_FS_ALLOWED:
                  err = zone_set_fs_allowed(zone, (const char *)buf);
                  break;
!         case ZONE_ATTR_PHYS_MCAP:
!                 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
                  break;
          case ZONE_ATTR_SCHED_CLASS:
                  err = zone_set_sched_class(zone, (const char *)buf);
                  break;
          case ZONE_ATTR_HOSTID:
                  if (bufsize == sizeof (zone->zone_hostid)) {
--- 6075,6096 ----
                  err = zone_set_brand(zone, (const char *)buf);
                  break;
          case ZONE_ATTR_FS_ALLOWED:
                  err = zone_set_fs_allowed(zone, (const char *)buf);
                  break;
!         case ZONE_ATTR_PMCAP_NOVER:
!                 err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
                  break;
+         case ZONE_ATTR_PMCAP_PAGEOUT:
+                 err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
+                 break;
+         case ZONE_ATTR_PG_FLT_DELAY:
+                 err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
+                 break;
+         case ZONE_ATTR_RSS:
+                 err = zone_set_rss(zone, (const uint64_t *)buf);
+                 break;
          case ZONE_ATTR_SCHED_CLASS:
                  err = zone_set_sched_class(zone, (const char *)buf);
                  break;
          case ZONE_ATTR_HOSTID:
                  if (bufsize == sizeof (zone->zone_hostid)) {
*** 5720,5729 ****
--- 6114,6139 ----
                          break;
                  }
                  err = zone_set_network(zoneid, zbuf);
                  kmem_free(zbuf, bufsize);
                  break;
+         case ZONE_ATTR_APP_SVC_CT:
+                 if (bufsize != sizeof (boolean_t)) {
+                         err = EINVAL;
+                 } else {
+                         zone->zone_setup_app_contract = (boolean_t)buf;
+                         err = 0;
+                 }
+                 break;
+         case ZONE_ATTR_SCHED_FIXEDHI:
+                 if (bufsize != sizeof (boolean_t)) {
+                         err = EINVAL;
+                 } else {
+                         zone->zone_fixed_hipri = (boolean_t)buf;
+                         err = 0;
+                 }
+                 break;
          default:
                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
                  else
                          err = EINVAL;
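
Note that for the two new boolean attributes the kernel casts the buffer
pointer itself to boolean_t instead of doing a copyin, so the caller
passes the value in place of an address while still supplying bufsize ==
sizeof (boolean_t), and both attributes must be set before the zone
boots.  A hypothetical caller (sketch only, not part of this change):

	#include <sys/types.h>
	#include <sys/zone.h>

	static int
	enable_boolean_attrs(zoneid_t zid)
	{
		/* The value rides in the pointer argument by design. */
		if (zone_setattr(zid, ZONE_ATTR_SCHED_FIXEDHI,
		    (void *)(uintptr_t)B_TRUE, sizeof (boolean_t)) == -1)
			return (-1);
		return (zone_setattr(zid, ZONE_ATTR_APP_SVC_CT,
		    (void *)(uintptr_t)B_TRUE, sizeof (boolean_t)));
	}
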
*** 6523,6532 ****
--- 6933,6943 ----
          zone_namelen = strlen(zone->zone_name) + 1;
          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
          bcopy(zone->zone_name, zone_name, zone_namelen);
          zoneid = zone->zone_id;
          uniqid = zone->zone_uniqid;
+         arg.status = zone->zone_init_status;
          /*
           * zoneadmd may be down, but at least we can empty out the zone.
           * We can ignore the return value of zone_empty() since we're called
           * from a kernel thread and know we won't be delivered any signals.
           */
*** 6763,6782 ****
          mutex_exit(&zone_status_lock);
          mutex_exit(&zonehash_lock);
  }
  
  /*
!  * Returns true if the named dataset is visible in the current zone.
   * The 'write' parameter is set to 1 if the dataset is also writable.
   */
  int
! zone_dataset_visible(const char *dataset, int *write)
  {
          static int zfstype = -1;
          zone_dataset_t *zd;
          size_t len;
-         zone_t *zone = curproc->p_zone;
          const char *name = NULL;
          vfs_t *vfsp = NULL;
  
          if (dataset[0] == '\0')
                  return (0);
--- 7174,7192 ----
          mutex_exit(&zone_status_lock);
          mutex_exit(&zonehash_lock);
  }
  
  /*
!  * Returns true if the named dataset is visible in the specified zone.
   * The 'write' parameter is set to 1 if the dataset is also writable.
   */
  int
! zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
  {
          static int zfstype = -1;
          zone_dataset_t *zd;
          size_t len;
          const char *name = NULL;
          vfs_t *vfsp = NULL;
  
          if (dataset[0] == '\0')
                  return (0);
*** 6840,6850 ****
          }
  
          vfs_list_read_lock();
          vfsp = zone->zone_vfslist;
          do {
!                 ASSERT(vfsp);
                  if (vfsp->vfs_fstype == zfstype) {
                          name = refstr_value(vfsp->vfs_resource);
  
                          /*
                           * Check if we have an exact match.
--- 7250,7261 ----
          }
  
          vfs_list_read_lock();
          vfsp = zone->zone_vfslist;
          do {
!                 if (vfsp == NULL)
!                         break;
                  if (vfsp->vfs_fstype == zfstype) {
                          name = refstr_value(vfsp->vfs_resource);
  
                          /*
                           * Check if we have an exact match.
*** 6876,6885 ****
--- 7287,7308 ----
  
          vfs_list_unlock();
          return (0);
  }
  
+ /*
+  * Returns true if the named dataset is visible in the current zone.
+  * The 'write' parameter is set to 1 if the dataset is also writable.
+  */
+ int
+ zone_dataset_visible(const char *dataset, int *write)
+ {
+         zone_t *zone = curproc->p_zone;
+ 
+         return (zone_dataset_visible_inzone(zone, dataset, write));
+ }
+ 
  /*
   * zone_find_by_any_path() -
   *
   * kernel-private routine similar to zone_find_by_path(), but which
   * effectively compares against zone paths rather than zonerootpath