Print this page
Remove most KEBE comments and accompanying unused code or variables/fields.
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
Mismerge in zone.c frees stats that aren't there.
Mismerge in zone.c create zone mcap kstats too many times
OS-338 Kstat counters to show "slow" VFS operations
OS-5189 lx dev enumeration can deadlock with zfs
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5187 improve /proc/diskstat handling
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-5179 flatten zvol entries for /dev and /proc/partitions
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Undo merge damage from zone kstats
OS-4915 want FX high priority zone configuration option
OS-4925 ps pri shows misleading value for zone in RT scheduling class
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4781 would like to be able to add CT_PR_EV_EXIT to fatal event set of current contract
OS-4017 would like zfs-io-priority values > 1024
OS-3820 lxbrand ptrace(2): the next generation
OS-3685 lxbrand PTRACE_O_TRACEFORK race condition
OS-3834 lxbrand 64-bit strace(1) reports 64-bit process as using x32 ABI
OS-3794 lxbrand panic on init signal death
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3776 project rctls should be in sync with zone rctls
OS-3429 Expose zone's init exit status
OS-3342 dlmgmtd needs to be mindful of lock ordering
OS-2608 dlmgmtd needs to record zone identifiers
OS-3492 zone_free asserts to its destruction when dlmgmtd has fallen
OS-3494 zoneadmd tears down networking too soon when boot fails
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
OS-11 rcapd behaves poorly when under extreme load
OS-399 zone phys. mem. cap should be a rctl and have associated kstat

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/os/zone.c
          +++ new/usr/src/uts/common/os/zone.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright 2015, Joyent Inc. All rights reserved.
       24 + * Copyright 2016, Joyent Inc.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Zones
  29   29   *
  30   30   *   A zone is a named collection of processes, namespace constraints,
  31   31   *   and other system resources which comprise a secure and manageable
  32   32   *   application containment facility.
  33   33   *
  34   34   *   Zones (represented by the reference counted zone_t) are tracked in
↓ open down ↓ 208 lines elided ↑ open up ↑
 243  243  #include <sys/sunddi.h>
 244  244  #include <sys/nvpair.h>
 245  245  #include <sys/rctl.h>
 246  246  #include <sys/fss.h>
 247  247  #include <sys/brand.h>
 248  248  #include <sys/zone.h>
 249  249  #include <net/if.h>
 250  250  #include <sys/cpucaps.h>
 251  251  #include <vm/seg.h>
 252  252  #include <sys/mac.h>
      253 +#include <sys/rt.h>
      254 +#include <sys/fx.h>
 253  255  
 254  256  /*
 255  257   * This constant specifies the number of seconds that threads waiting for
 256  258   * subsystems to release a zone's general-purpose references will wait before
 257  259   * they log the zone's reference counts.  The constant's value shouldn't
 258  260   * be so small that reference counts are unnecessarily reported for zones
 259  261   * whose references are slowly released.  On the other hand, it shouldn't be so
 260  262   * large that users reboot their systems out of frustration over hung zones
 261  263   * before the system logs the zones' reference counts.
 262  264   */
↓ open down ↓ 100 lines elided ↑ open up ↑
 363  365          "VFS",          /* ZONE_REF_VFS */
 364  366          "IPC"           /* ZONE_REF_IPC */
 365  367  };
 366  368  
 367  369  /*
 368  370   * This isn't static so lint doesn't complain.
 369  371   */
 370  372  rctl_hndl_t rc_zone_cpu_shares;
 371  373  rctl_hndl_t rc_zone_locked_mem;
 372  374  rctl_hndl_t rc_zone_max_swap;
      375 +rctl_hndl_t rc_zone_phys_mem;
 373  376  rctl_hndl_t rc_zone_max_lofi;
 374  377  rctl_hndl_t rc_zone_cpu_cap;
 375  378  rctl_hndl_t rc_zone_zfs_io_pri;
 376  379  rctl_hndl_t rc_zone_nlwps;
 377  380  rctl_hndl_t rc_zone_nprocs;
 378  381  rctl_hndl_t rc_zone_shmmax;
 379  382  rctl_hndl_t rc_zone_shmmni;
 380  383  rctl_hndl_t rc_zone_semmni;
 381  384  rctl_hndl_t rc_zone_msgmni;
 382  385  
↓ open down ↓ 1352 lines elided ↑ open up ↑
1735 1738  
1736 1739  static rctl_ops_t zone_max_swap_ops = {
1737 1740          rcop_no_action,
1738 1741          zone_max_swap_usage,
1739 1742          zone_max_swap_set,
1740 1743          zone_max_swap_test
1741 1744  };
1742 1745  
1743 1746  /*ARGSUSED*/
1744 1747  static rctl_qty_t
     1748 +zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
     1749 +{
     1750 +        rctl_qty_t q;
     1751 +        zone_t *z = p->p_zone;
     1752 +
     1753 +        ASSERT(MUTEX_HELD(&p->p_lock));
     1754 +        /* No additional lock because not enforced in the kernel */
     1755 +        q = z->zone_phys_mem;
     1756 +        return (q);
     1757 +}
     1758 +
     1759 +/*ARGSUSED*/
     1760 +static int
     1761 +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
     1762 +    rctl_qty_t nv)
     1763 +{
     1764 +        ASSERT(MUTEX_HELD(&p->p_lock));
     1765 +        ASSERT(e->rcep_t == RCENTITY_ZONE);
     1766 +        if (e->rcep_p.zone == NULL)
     1767 +                return (0);
     1768 +        e->rcep_p.zone->zone_phys_mem_ctl = nv;
     1769 +        return (0);
     1770 +}
     1771 +
     1772 +static rctl_ops_t zone_phys_mem_ops = {
     1773 +        rcop_no_action,
     1774 +        zone_phys_mem_usage,
     1775 +        zone_phys_mem_set,
     1776 +        rcop_no_test
     1777 +};
     1778 +
     1779 +/*ARGSUSED*/
     1780 +static rctl_qty_t
1745 1781  zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1746 1782  {
1747 1783          rctl_qty_t q;
1748 1784          zone_t *z = p->p_zone;
1749 1785  
1750 1786          ASSERT(MUTEX_HELD(&p->p_lock));
1751 1787          mutex_enter(&z->zone_rctl_lock);
1752 1788          q = z->zone_max_lofi;
1753 1789          mutex_exit(&z->zone_rctl_lock);
1754 1790          return (q);
↓ open down ↓ 73 lines elided ↑ open up ↑
1828 1864  
1829 1865          if (rw == KSTAT_WRITE)
1830 1866                  return (EACCES);
1831 1867  
1832 1868          zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1833 1869          zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1834 1870          return (0);
1835 1871  }
1836 1872  
1837 1873  static int
     1874 +zone_physmem_kstat_update(kstat_t *ksp, int rw)
     1875 +{
     1876 +        zone_t *zone = ksp->ks_private;
     1877 +        zone_kstat_t *zk = ksp->ks_data;
     1878 +
     1879 +        if (rw == KSTAT_WRITE)
     1880 +                return (EACCES);
     1881 +
     1882 +        zk->zk_usage.value.ui64 = zone->zone_phys_mem;
     1883 +        zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl;
     1884 +        return (0);
     1885 +}
     1886 +
     1887 +static int
1838 1888  zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1839 1889  {
1840 1890          zone_t *zone = ksp->ks_private;
1841 1891          zone_kstat_t *zk = ksp->ks_data;
1842 1892  
1843 1893          if (rw == KSTAT_WRITE)
1844 1894                  return (EACCES);
1845 1895  
1846 1896          zk->zk_usage.value.ui64 = zone->zone_nprocs;
1847 1897          zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
↓ open down ↓ 33 lines elided ↑ open up ↑
1881 1931          kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1882 1932          kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1883 1933          kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1884 1934          kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1885 1935          ksp->ks_update = updatefunc;
1886 1936          ksp->ks_private = zone;
1887 1937          kstat_install(ksp);
1888 1938          return (ksp);
1889 1939  }
1890 1940  
     1941 +static int
     1942 +zone_vfs_kstat_update(kstat_t *ksp, int rw)
     1943 +{
     1944 +        zone_t *zone = ksp->ks_private;
     1945 +        zone_vfs_kstat_t *zvp = ksp->ks_data;
     1946 +        kstat_io_t *kiop = &zone->zone_vfs_rwstats;
1891 1947  
     1948 +        if (rw == KSTAT_WRITE)
     1949 +                return (EACCES);
     1950 +
     1951 +        /*
     1952 +         * Extract the VFS statistics from the kstat_io_t structure used by
     1953 +         * kstat_runq_enter() and related functions.  Since the slow ops
     1954 +         * counters are updated directly by the VFS layer, there's no need to
     1955 +         * copy those statistics here.
     1956 +         *
     1957 +         * Note that kstat_runq_enter() and the related functions use
     1958 +         * gethrtime_unscaled(), so scale the time here.
     1959 +         */
     1960 +        zvp->zv_nread.value.ui64 = kiop->nread;
     1961 +        zvp->zv_reads.value.ui64 = kiop->reads;
     1962 +        zvp->zv_rtime.value.ui64 = kiop->rtime;
     1963 +        zvp->zv_rcnt.value.ui64 = kiop->rcnt;
     1964 +        zvp->zv_rlentime.value.ui64 = kiop->rlentime;
     1965 +        zvp->zv_nwritten.value.ui64 = kiop->nwritten;
     1966 +        zvp->zv_writes.value.ui64 = kiop->writes;
     1967 +        zvp->zv_wtime.value.ui64 = kiop->wtime;
     1968 +        zvp->zv_wcnt.value.ui64 = kiop->wcnt;
     1969 +        zvp->zv_wlentime.value.ui64 = kiop->wlentime;
     1970 +
     1971 +        scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64);
     1972 +        scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64);
     1973 +        scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64);
     1974 +        scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64);
     1975 +
     1976 +        return (0);
     1977 +}
     1978 +
     1979 +static kstat_t *
     1980 +zone_vfs_kstat_create(zone_t *zone)
     1981 +{
     1982 +        kstat_t *ksp;
     1983 +        zone_vfs_kstat_t *zvp;
     1984 +
     1985 +        if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id,
     1986 +            zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED,
     1987 +            sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t),
     1988 +            KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
     1989 +                return (NULL);
     1990 +
     1991 +        if (zone->zone_id != GLOBAL_ZONEID)
     1992 +                kstat_zone_add(ksp, GLOBAL_ZONEID);
     1993 +
     1994 +        zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP);
     1995 +        ksp->ks_data_size += strlen(zone->zone_name) + 1;
     1996 +        ksp->ks_lock = &zone->zone_vfs_lock;
     1997 +        zone->zone_vfs_stats = zvp;
     1998 +
     1999 +        /* The kstat "name" field is not large enough for a full zonename */
     2000 +        kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING);
     2001 +        kstat_named_setstr(&zvp->zv_zonename, zone->zone_name);
     2002 +        kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64);
     2003 +        kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64);
     2004 +        kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64);
     2005 +        kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64);
     2006 +        kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64);
     2007 +        kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64);
     2008 +        kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64);
     2009 +        kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64);
     2010 +        kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64);
     2011 +        kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64);
     2012 +        kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64);
     2013 +        kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64);
     2014 +        kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64);
     2015 +        kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64);
     2016 +        kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64);
     2017 +        kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64);
     2018 +
     2019 +        ksp->ks_update = zone_vfs_kstat_update;
     2020 +        ksp->ks_private = zone;
     2021 +
     2022 +        kstat_install(ksp);
     2023 +        return (ksp);
     2024 +}
     2025 +
1892 2026  static int
     2027 +zone_zfs_kstat_update(kstat_t *ksp, int rw)
     2028 +{
     2029 +        zone_t *zone = ksp->ks_private;
     2030 +        zone_zfs_kstat_t *zzp = ksp->ks_data;
     2031 +        kstat_io_t *kiop = &zone->zone_zfs_rwstats;
     2032 +
     2033 +        if (rw == KSTAT_WRITE)
     2034 +                return (EACCES);
     2035 +
     2036 +        /*
     2037 +         * Extract the ZFS statistics from the kstat_io_t structure used by
     2038 +         * kstat_runq_enter() and related functions.  Since the I/O throttle
     2039 +         * counters are updated directly by the ZFS layer, there's no need to
     2040 +         * copy those statistics here.
     2041 +         *
     2042 +         * Note that kstat_runq_enter() and the related functions use
     2043 +         * gethrtime_unscaled(), so scale the time here.
     2044 +         */
     2045 +        zzp->zz_nread.value.ui64 = kiop->nread;
     2046 +        zzp->zz_reads.value.ui64 = kiop->reads;
     2047 +        zzp->zz_rtime.value.ui64 = kiop->rtime;
     2048 +        zzp->zz_rlentime.value.ui64 = kiop->rlentime;
     2049 +        zzp->zz_nwritten.value.ui64 = kiop->nwritten;
     2050 +        zzp->zz_writes.value.ui64 = kiop->writes;
     2051 +
     2052 +        scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64);
     2053 +        scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64);
     2054 +
     2055 +        return (0);
     2056 +}
     2057 +
     2058 +static kstat_t *
     2059 +zone_zfs_kstat_create(zone_t *zone)
     2060 +{
     2061 +        kstat_t *ksp;
     2062 +        zone_zfs_kstat_t *zzp;
     2063 +
     2064 +        if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id,
     2065 +            zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED,
     2066 +            sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t),
     2067 +            KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL)
     2068 +                return (NULL);
     2069 +
     2070 +        if (zone->zone_id != GLOBAL_ZONEID)
     2071 +                kstat_zone_add(ksp, GLOBAL_ZONEID);
     2072 +
     2073 +        zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP);
     2074 +        ksp->ks_data_size += strlen(zone->zone_name) + 1;
     2075 +        ksp->ks_lock = &zone->zone_zfs_lock;
     2076 +        zone->zone_zfs_stats = zzp;
     2077 +
     2078 +        /* The kstat "name" field is not large enough for a full zonename */
     2079 +        kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING);
     2080 +        kstat_named_setstr(&zzp->zz_zonename, zone->zone_name);
     2081 +        kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64);
     2082 +        kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64);
     2083 +        kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64);
     2084 +        kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64);
     2085 +        kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64);
     2086 +        kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64);
     2087 +        kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64);
     2088 +
     2089 +        ksp->ks_update = zone_zfs_kstat_update;
     2090 +        ksp->ks_private = zone;
     2091 +
     2092 +        kstat_install(ksp);
     2093 +        return (ksp);
     2094 +}
     2095 +
     2096 +static int
1893 2097  zone_mcap_kstat_update(kstat_t *ksp, int rw)
1894 2098  {
1895 2099          zone_t *zone = ksp->ks_private;
1896 2100          zone_mcap_kstat_t *zmp = ksp->ks_data;
1897 2101  
1898 2102          if (rw == KSTAT_WRITE)
1899 2103                  return (EACCES);
1900 2104  
     2105 +        zmp->zm_rss.value.ui64 = zone->zone_phys_mem;
     2106 +        zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl;
     2107 +        zmp->zm_swap.value.ui64 = zone->zone_max_swap;
     2108 +        zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl;
     2109 +        zmp->zm_nover.value.ui64 = zone->zone_mcap_nover;
     2110 +        zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout;
1901 2111          zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin;
1902 2112          zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin;
1903 2113          zmp->zm_execpgin.value.ui64 = zone->zone_execpgin;
1904 2114          zmp->zm_fspgin.value.ui64 = zone->zone_fspgin;
1905 2115          zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail;
     2116 +        zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle;
     2117 +        zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec;
1906 2118  
1907 2119          return (0);
1908 2120  }
1909 2121  
1910 2122  static kstat_t *
1911 2123  zone_mcap_kstat_create(zone_t *zone)
1912 2124  {
1913 2125          kstat_t *ksp;
1914 2126          zone_mcap_kstat_t *zmp;
1915 2127  
↓ open down ↓ 7 lines elided ↑ open up ↑
1923 2135                  kstat_zone_add(ksp, GLOBAL_ZONEID);
1924 2136  
1925 2137          zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP);
1926 2138          ksp->ks_data_size += strlen(zone->zone_name) + 1;
1927 2139          ksp->ks_lock = &zone->zone_mcap_lock;
1928 2140          zone->zone_mcap_stats = zmp;
1929 2141  
1930 2142          /* The kstat "name" field is not large enough for a full zonename */
1931 2143          kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING);
1932 2144          kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
     2145 +        kstat_named_setstr(&zmp->zm_zonename, zone->zone_name);
     2146 +        kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64);
     2147 +        kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64);
     2148 +        kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64);
     2149 +        kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64);
     2150 +        kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64);
     2151 +        kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64);
1933 2152          kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64);
1934 2153          kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64);
1935 2154          kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64);
1936 2155          kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64);
1937 2156          kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail",
1938 2157              KSTAT_DATA_UINT64);
     2158 +        kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle",
     2159 +            KSTAT_DATA_UINT64);
     2160 +        kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec",
     2161 +            KSTAT_DATA_UINT64);
1939 2162  
1940 2163          ksp->ks_update = zone_mcap_kstat_update;
1941 2164          ksp->ks_private = zone;
1942 2165  
1943 2166          kstat_install(ksp);
1944 2167          return (ksp);
1945 2168  }
1946 2169  
1947 2170  static int
1948 2171  zone_misc_kstat_update(kstat_t *ksp, int rw)
↓ open down ↓ 79 lines elided ↑ open up ↑
2028 2251          return (ksp);
2029 2252  }
2030 2253  
2031 2254  static void
2032 2255  zone_kstat_create(zone_t *zone)
2033 2256  {
2034 2257          zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
2035 2258              "lockedmem", zone_lockedmem_kstat_update);
2036 2259          zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
2037 2260              "swapresv", zone_swapresv_kstat_update);
     2261 +        zone->zone_physmem_kstat = zone_kstat_create_common(zone,
     2262 +            "physicalmem", zone_physmem_kstat_update);
2038 2263          zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
2039 2264              "nprocs", zone_nprocs_kstat_update);
2040 2265  
     2266 +        if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) {
     2267 +                zone->zone_vfs_stats = kmem_zalloc(
     2268 +                    sizeof (zone_vfs_kstat_t), KM_SLEEP);
     2269 +        }
     2270 +
2041 2271          if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) {
2042 2272                  zone->zone_mcap_stats = kmem_zalloc(
2043 2273                      sizeof (zone_mcap_kstat_t), KM_SLEEP);
2044 2274          }
2045 2275  
2046 2276          if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) {
2047 2277                  zone->zone_misc_stats = kmem_zalloc(
2048 2278                      sizeof (zone_misc_kstat_t), KM_SLEEP);
2049 2279          }
     2280 +
2050 2281  }
2051 2282  
2052 2283  static void
2053 2284  zone_kstat_delete_common(kstat_t **pkstat, size_t datasz)
2054 2285  {
2055 2286          void *data;
2056 2287  
2057 2288          if (*pkstat != NULL) {
2058 2289                  data = (*pkstat)->ks_data;
2059 2290                  kstat_delete(*pkstat);
↓ open down ↓ 2 lines elided ↑ open up ↑
2062 2293          }
2063 2294  }
2064 2295  
2065 2296  static void
2066 2297  zone_kstat_delete(zone_t *zone)
2067 2298  {
2068 2299          zone_kstat_delete_common(&zone->zone_lockedmem_kstat,
2069 2300              sizeof (zone_kstat_t));
2070 2301          zone_kstat_delete_common(&zone->zone_swapresv_kstat,
2071 2302              sizeof (zone_kstat_t));
     2303 +        zone_kstat_delete_common(&zone->zone_physmem_kstat,
     2304 +            sizeof (zone_kstat_t));
2072 2305          zone_kstat_delete_common(&zone->zone_nprocs_kstat,
2073 2306              sizeof (zone_kstat_t));
     2307 +
     2308 +        zone_kstat_delete_common(&zone->zone_vfs_ksp,
     2309 +            sizeof (zone_vfs_kstat_t));
2074 2310          zone_kstat_delete_common(&zone->zone_mcap_ksp,
2075 2311              sizeof (zone_mcap_kstat_t));
2076 2312          zone_kstat_delete_common(&zone->zone_misc_ksp,
2077 2313              sizeof (zone_misc_kstat_t));
     2314 +
2078 2315  }
2079 2316  
2080 2317  /*
2081 2318   * Called very early on in boot to initialize the ZSD list so that
2082 2319   * zone_key_create() can be called before zone_init().  It also initializes
2083 2320   * portions of zone0 which may be used before zone_init() is called.  The
2084 2321   * variable "global_zone" will be set when zone0 is fully initialized by
2085 2322   * zone_init().
2086 2323   */
2087 2324  void
↓ open down ↓ 13 lines elided ↑ open up ↑
2101 2338          mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
2102 2339          zone0.zone_shares = 1;
2103 2340          zone0.zone_nlwps = 0;
2104 2341          zone0.zone_nlwps_ctl = INT_MAX;
2105 2342          zone0.zone_nprocs = 0;
2106 2343          zone0.zone_nprocs_ctl = INT_MAX;
2107 2344          zone0.zone_locked_mem = 0;
2108 2345          zone0.zone_locked_mem_ctl = UINT64_MAX;
2109 2346          ASSERT(zone0.zone_max_swap == 0);
2110 2347          zone0.zone_max_swap_ctl = UINT64_MAX;
     2348 +        zone0.zone_phys_mem = 0;
     2349 +        zone0.zone_phys_mem_ctl = UINT64_MAX;
2111 2350          zone0.zone_max_lofi = 0;
2112 2351          zone0.zone_max_lofi_ctl = UINT64_MAX;
2113 2352          zone0.zone_shmmax = 0;
2114 2353          zone0.zone_ipc.ipcq_shmmni = 0;
2115 2354          zone0.zone_ipc.ipcq_semmni = 0;
2116 2355          zone0.zone_ipc.ipcq_msgmni = 0;
2117 2356          zone0.zone_name = GLOBAL_ZONENAME;
2118 2357          zone0.zone_nodename = utsname.nodename;
2119 2358          zone0.zone_domain = srpc_domain;
2120 2359          zone0.zone_hostid = HW_INVALID_HOSTID;
↓ open down ↓ 3 lines elided ↑ open up ↑
2124 2363          zone0.zone_status = ZONE_IS_RUNNING;
2125 2364          zone0.zone_rootpath = "/";
2126 2365          zone0.zone_rootpathlen = 2;
2127 2366          zone0.zone_psetid = ZONE_PS_INVAL;
2128 2367          zone0.zone_ncpus = 0;
2129 2368          zone0.zone_ncpus_online = 0;
2130 2369          zone0.zone_proc_initpid = 1;
2131 2370          zone0.zone_initname = initname;
2132 2371          zone0.zone_lockedmem_kstat = NULL;
2133 2372          zone0.zone_swapresv_kstat = NULL;
     2373 +        zone0.zone_physmem_kstat = NULL;
2134 2374          zone0.zone_nprocs_kstat = NULL;
2135 2375          zone0.zone_zfs_io_pri = 1;
2136 2376  
2137 2377          zone0.zone_stime = 0;
2138 2378          zone0.zone_utime = 0;
2139 2379          zone0.zone_wtime = 0;
2140 2380  
2141 2381          list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
2142 2382              offsetof(zone_ref_t, zref_linkage));
2143 2383          list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
↓ open down ↓ 96 lines elided ↑ open up ↑
2240 2480  
2241 2481          rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
2242 2482              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
2243 2483              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
2244 2484              RCTL_GLOBAL_INFINITE,
2245 2485              MAXCAP, MAXCAP, &zone_cpu_cap_ops);
2246 2486  
2247 2487          rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority",
2248 2488              RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
2249 2489              RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
2250      -            1024, 1024, &zone_zfs_io_pri_ops);
     2490 +            16384, 16384, &zone_zfs_io_pri_ops);
2251 2491  
2252 2492          rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
2253 2493              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2254 2494              INT_MAX, INT_MAX, &zone_lwps_ops);
2255 2495  
2256 2496          rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
2257 2497              RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
2258 2498              INT_MAX, INT_MAX, &zone_procs_ops);
2259 2499  
2260 2500          /*
↓ open down ↓ 32 lines elided ↑ open up ↑
2293 2533          rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
2294 2534              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2295 2535              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2296 2536              &zone_locked_mem_ops);
2297 2537  
2298 2538          rc_zone_max_swap = rctl_register("zone.max-swap",
2299 2539              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
2300 2540              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2301 2541              &zone_max_swap_ops);
2302 2542  
     2543 +        rc_zone_phys_mem = rctl_register("zone.max-physical-memory",
     2544 +            RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
     2545 +            RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
     2546 +            &zone_phys_mem_ops);
     2547 +
2303 2548          rc_zone_max_lofi = rctl_register("zone.max-lofi",
2304 2549              RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
2305 2550              RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
2306 2551              &zone_max_lofi_ops);
2307 2552  
2308 2553          /*
2309 2554           * Initialize the ``global zone''.
2310 2555           */
2311 2556          set = rctl_set_create();
2312 2557          gp = rctl_set_init_prealloc(RCENTITY_ZONE);
↓ open down ↓ 1 lines elided ↑ open up ↑
2314 2559          e.rcep_p.zone = &zone0;
2315 2560          e.rcep_t = RCENTITY_ZONE;
2316 2561          zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
2317 2562              gp);
2318 2563  
2319 2564          zone0.zone_nlwps = p0.p_lwpcnt;
2320 2565          zone0.zone_nprocs = 1;
2321 2566          zone0.zone_ntasks = 1;
2322 2567          mutex_exit(&p0.p_lock);
2323 2568          zone0.zone_restart_init = B_TRUE;
     2569 +        zone0.zone_reboot_on_init_exit = B_FALSE;
     2570 +        zone0.zone_init_status = -1;
2324 2571          zone0.zone_brand = &native_brand;
2325 2572          rctl_prealloc_destroy(gp);
2326 2573          /*
2327 2574           * pool_default hasn't been initialized yet, so we let pool_init()
2328 2575           * take care of making sure the global zone is in the default pool.
2329 2576           */
2330 2577  
2331 2578          /*
2332 2579           * Initialize global zone kstats
2333 2580           */
↓ open down ↓ 59 lines elided ↑ open up ↑
2393 2640              EVCH_CREAT);
2394 2641  
2395 2642          if (res)
2396 2643                  panic("Sysevent_evc_bind failed during zone setup.\n");
2397 2644  
2398 2645  }
2399 2646  
2400 2647  static void
2401 2648  zone_free(zone_t *zone)
2402 2649  {
     2650 +        zone_dl_t *zdl;
     2651 +
2403 2652          ASSERT(zone != global_zone);
2404 2653          ASSERT(zone->zone_ntasks == 0);
2405 2654          ASSERT(zone->zone_nlwps == 0);
2406 2655          ASSERT(zone->zone_nprocs == 0);
2407 2656          ASSERT(zone->zone_cred_ref == 0);
2408 2657          ASSERT(zone->zone_kcred == NULL);
2409 2658          ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2410 2659              zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2411 2660          ASSERT(list_is_empty(&zone->zone_ref_list));
2412 2661  
↓ open down ↓ 8 lines elided ↑ open up ↑
2421 2670          if (zone_status_get(zone) == ZONE_IS_DEAD) {
2422 2671                  ASSERT(zone->zone_ref == 0);
2423 2672                  mutex_enter(&zone_deathrow_lock);
2424 2673                  list_remove(&zone_deathrow, zone);
2425 2674                  mutex_exit(&zone_deathrow_lock);
2426 2675          }
2427 2676  
2428 2677          list_destroy(&zone->zone_ref_list);
2429 2678          zone_free_zsd(zone);
2430 2679          zone_free_datasets(zone);
     2680 +
     2681 +        /*
     2682 +         * While dlmgmtd should have removed all of these, it could have left
     2683 +         * something behind or crashed. In which case it's not safe for us to
     2684 +         * assume that the list is empty which list_destroy() will ASSERT. We
     2685 +         * clean up for our userland comrades which may have crashed, or worse,
     2686 +         * been disabled by SMF.
     2687 +         */
     2688 +        while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) {
     2689 +                if (zdl->zdl_net != NULL)
     2690 +                        nvlist_free(zdl->zdl_net);
     2691 +                kmem_free(zdl, sizeof (zone_dl_t));
     2692 +        }
2431 2693          list_destroy(&zone->zone_dl_list);
2432 2694  
2433 2695          if (zone->zone_rootvp != NULL)
2434 2696                  VN_RELE(zone->zone_rootvp);
2435 2697          if (zone->zone_rootpath)
2436 2698                  kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2437 2699          if (zone->zone_name != NULL)
2438 2700                  kmem_free(zone->zone_name, ZONENAME_MAX);
2439 2701          if (zone->zone_slabel != NULL)
2440 2702                  label_rele(zone->zone_slabel);
↓ open down ↓ 115 lines elided ↑ open up ↑
2556 2818          mutex_enter(&zone_status_lock);
2557 2819  
2558 2820          /* Re-Branding is not allowed and the zone can't be booted yet */
2559 2821          if ((ZONE_IS_BRANDED(zone)) ||
2560 2822              (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2561 2823                  mutex_exit(&zone_status_lock);
2562 2824                  brand_unregister_zone(bp);
2563 2825                  return (EINVAL);
2564 2826          }
2565 2827  
2566      -        /* set up the brand specific data */
     2828 +        /*
     2829 +         * Set up the brand specific data.
     2830 +         * Note that it's possible that the hook has to drop the
     2831 +         * zone_status_lock and reacquire it before returning so we can't
     2832 +         * assume the lock has been held the entire time.
     2833 +         */
2567 2834          zone->zone_brand = bp;
2568      -        ZBROP(zone)->b_init_brand_data(zone);
     2835 +        ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock);
2569 2836  
2570 2837          mutex_exit(&zone_status_lock);
2571 2838          return (0);
2572 2839  }
2573 2840  
2574 2841  static int
2575 2842  zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2576 2843  {
2577 2844          char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2578 2845          int err = 0;
↓ open down ↓ 25 lines elided ↑ open up ↑
2604 2871                  return (err);   /* EFAULT or ENAMETOOLONG */
2605 2872  
2606 2873          if (zone->zone_initname != NULL)
2607 2874                  strfree(zone->zone_initname);
2608 2875  
2609 2876          zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2610 2877          (void) strcpy(zone->zone_initname, initname);
2611 2878          return (0);
2612 2879  }
2613 2880  
     2881 +/*
     2882 + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used
     2883 + * to provide the physical memory capping kstats.  Since physical memory
     2884 + * capping is currently implemented in userland, that code uses the setattr
     2885 + * entry point to increment the kstats.  We always simply increment nover
     2886 + * every time that setattr is called and we always add in the input value
     2887 + * to zone_mcap_pagedout every time that is called.
     2888 + */
     2889 +/*ARGSUSED*/
2614 2890  static int
2615      -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
     2891 +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover)
2616 2892  {
2617      -        uint64_t mcap;
2618      -        int err = 0;
     2893 +        zone->zone_mcap_nover++;
2619 2894  
2620      -        if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2621      -                zone->zone_phys_mcap = mcap;
     2895 +        return (0);
     2896 +}
2622 2897  
     2898 +static int
     2899 +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout)
     2900 +{
     2901 +        uint64_t pageout;
     2902 +        int err;
     2903 +
     2904 +        if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0)
     2905 +                zone->zone_mcap_pagedout += pageout;
     2906 +
2623 2907          return (err);
2624 2908  }
2625 2909  
     2910 +/*
     2911 + * The zone_set_page_fault_delay function is used to set the number of usecs
     2912 + * to throttle page faults.  This is normally 0 but can be set to a non-0 value
     2913 + * by the user-land memory capping code when the zone is over its physcial
     2914 + * memory cap.
     2915 + */
2626 2916  static int
     2917 +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay)
     2918 +{
     2919 +        uint32_t dusec;
     2920 +        int err;
     2921 +
     2922 +        if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0)
     2923 +                zone->zone_pg_flt_delay = dusec;
     2924 +
     2925 +        return (err);
     2926 +}
     2927 +
     2928 +/*
     2929 + * The zone_set_rss function is used to set the zone's RSS when we do the
     2930 + * fast, approximate calculation in user-land.
     2931 + */
     2932 +static int
     2933 +zone_set_rss(zone_t *zone, const uint64_t *prss)
     2934 +{
     2935 +        uint64_t rss;
     2936 +        int err;
     2937 +
     2938 +        if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0)
     2939 +                zone->zone_phys_mem = rss;
     2940 +
     2941 +        return (err);
     2942 +}
     2943 +
     2944 +static int
2627 2945  zone_set_sched_class(zone_t *zone, const char *new_class)
2628 2946  {
2629 2947          char sched_class[PC_CLNMSZ];
2630 2948          id_t classid;
2631 2949          int err;
2632 2950  
2633 2951          ASSERT(zone != global_zone);
2634 2952          if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2635 2953                  return (err);   /* EFAULT or ENAMETOOLONG */
2636 2954  
↓ open down ↓ 1129 lines elided ↑ open up ↑
3766 4084          zone_t *z = p->p_zone;
3767 4085  
3768 4086          ASSERT(!INGLOBALZONE(curproc));
3769 4087  
3770 4088          /*
3771 4089           * For all purposes (ZONE_ATTR_INITPID and restart_init),
3772 4090           * storing just the pid of init is sufficient.
3773 4091           */
3774 4092          z->zone_proc_initpid = p->p_pid;
3775 4093  
     4094 +        if (z->zone_setup_app_contract == B_TRUE) {
     4095 +                /*
     4096 +                 * Normally a process cannot modify its own contract, but we're
     4097 +                 * just starting the zone's init process and its contract is
     4098 +                 * always initialized from the sys_process_tmpl template, so
     4099 +                 * this is the simplest way to setup init's contract to kill
     4100 +                 * the process if any other process in the contract exits.
     4101 +                 */
     4102 +                p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT;
     4103 +        }
     4104 +
3776 4105          /*
3777 4106           * We maintain zone_boot_err so that we can return the cause of the
3778 4107           * failure back to the caller of the zone_boot syscall.
3779 4108           */
3780 4109          p->p_zone->zone_boot_err = start_init_common();
3781 4110  
3782 4111          /*
3783 4112           * We will prevent booting zones from becoming running zones if the
3784 4113           * global zone is shutting down.
3785 4114           */
↓ open down ↓ 8 lines elided ↑ open up ↑
3794 4123                          zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3795 4124                  }
3796 4125                  mutex_exit(&zone_status_lock);
3797 4126                  /* It's gone bad, dispose of the process */
3798 4127                  if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3799 4128                          mutex_enter(&p->p_lock);
3800 4129                          ASSERT(p->p_flag & SEXITLWPS);
3801 4130                          lwp_exit();
3802 4131                  }
3803 4132          } else {
     4133 +                id_t cid = curthread->t_cid;
     4134 +
3804 4135                  if (zone_status_get(z) == ZONE_IS_BOOTING)
3805 4136                          zone_status_set(z, ZONE_IS_RUNNING);
3806 4137                  mutex_exit(&zone_status_lock);
     4138 +
     4139 +                mutex_enter(&class_lock);
     4140 +                ASSERT(cid < loaded_classes);
     4141 +                if (strcmp(sclass[cid].cl_name, "FX") == 0 &&
     4142 +                    z->zone_fixed_hipri) {
     4143 +                        /*
     4144 +                         * If the zone is using FX then by default all
     4145 +                         * processes start at the lowest priority and stay
     4146 +                         * there. We provide a mechanism for the zone to
     4147 +                         * indicate that it should run at "high priority". In
     4148 +                         * this case we setup init to run at the highest FX
     4149 +                         * priority (which is one level higher than the
     4150 +                         * non-fixed scheduling classes can use).
     4151 +                         */
     4152 +                        pcparms_t pcparms;
     4153 +
     4154 +                        pcparms.pc_cid = cid;
     4155 +                        ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI;
     4156 +                        ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim =
     4157 +                            FXMAXUPRI;
     4158 +                        ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags =
     4159 +                            FX_DOUPRILIM | FX_DOUPRI;
     4160 +
     4161 +                        mutex_enter(&pidlock);
     4162 +                        mutex_enter(&curproc->p_lock);
     4163 +
     4164 +                        (void) parmsset(&pcparms, curthread);
     4165 +
     4166 +                        mutex_exit(&curproc->p_lock);
     4167 +                        mutex_exit(&pidlock);
     4168 +                } else if (strcmp(sclass[cid].cl_name, "RT") == 0) {
     4169 +                        /*
     4170 +                         * zsched always starts the init lwp at priority
     4171 +                         * minclsyspri - 1. This priority gets set in t_pri and
     4172 +                         * is invalid for RT, but RT never uses t_pri. However
     4173 +                         * t_pri is used by procfs, so we always see processes
     4174 +                         * within an RT zone with an invalid priority value.
     4175 +                         * We fix that up now.
     4176 +                         */
     4177 +                        curthread->t_pri = RTGPPRIO0;
     4178 +                }
     4179 +                mutex_exit(&class_lock);
     4180 +
3807 4181                  /* cause the process to return to userland. */
3808 4182                  lwp_rtt();
3809 4183          }
3810 4184  }
3811 4185  
/*
 * Argument block handed to the zsched() kernel process when it is created
 * via newproc() during zone_create(): the zone it will service and the
 * rctl settings (as an nvlist) that zsched applies during zone setup.
 */
struct zsched_arg {
	zone_t *zone;		/* zone this zsched instance belongs to */
	nvlist_t *nvlist;	/* parsed zone/project rctl name/value pairs */
};
3816 4190  
↓ open down ↓ 21 lines elided ↑ open up ↑
3838 4212          kproject_t *pj;
3839 4213  
3840 4214          nvlist_t *nvl = za->nvlist;
3841 4215          nvpair_t *nvp = NULL;
3842 4216  
3843 4217          bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
3844 4218          bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
3845 4219          PTOU(pp)->u_argc = 0;
3846 4220          PTOU(pp)->u_argv = NULL;
3847 4221          PTOU(pp)->u_envp = NULL;
     4222 +        PTOU(pp)->u_commpagep = NULL;
3848 4223          closeall(P_FINFO(pp));
3849 4224  
3850 4225          /*
3851 4226           * We are this zone's "zsched" process.  As the zone isn't generally
3852 4227           * visible yet we don't need to grab any locks before initializing its
3853 4228           * zone_proc pointer.
3854 4229           */
3855 4230          zone_hold(zone);  /* this hold is released by zone_destroy() */
3856 4231          zone->zone_zsched = pp;
3857 4232          mutex_enter(&pp->p_lock);
↓ open down ↓ 422 lines elided ↑ open up ↑
4280 4655          }
4281 4656          while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4282 4657                  rctl_dict_entry_t *rde;
4283 4658                  rctl_hndl_t hndl;
4284 4659                  nvlist_t **nvlarray;
4285 4660                  uint_t i, nelem;
4286 4661                  char *name;
4287 4662  
4288 4663                  error = EINVAL;
4289 4664                  name = nvpair_name(nvp);
4290      -                if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
4291      -                    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
     4665 +                if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 &&
     4666 +                    strncmp(name, "project.", sizeof ("project.") - 1) != 0) ||
     4667 +                    nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
4292 4668                          goto out;
4293 4669                  }
4294 4670                  if ((hndl = rctl_hndl_lookup(name)) == -1) {
4295 4671                          goto out;
4296 4672                  }
4297 4673                  rde = rctl_dict_lookup_hndl(hndl);
4298 4674                  error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
4299 4675                  ASSERT(error == 0);
4300 4676                  for (i = 0; i < nelem; i++) {
4301 4677                          if (error = nvlist2rctlval(nvlarray[i], &rv))
↓ open down ↓ 127 lines elided ↑ open up ↑
4429 4805  
4430 4806          zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
4431 4807          zoneid = zone->zone_id = id_alloc(zoneid_space);
4432 4808          zone->zone_status = ZONE_IS_UNINITIALIZED;
4433 4809          zone->zone_pool = pool_default;
4434 4810          zone->zone_pool_mod = gethrtime();
4435 4811          zone->zone_psetid = ZONE_PS_INVAL;
4436 4812          zone->zone_ncpus = 0;
4437 4813          zone->zone_ncpus_online = 0;
4438 4814          zone->zone_restart_init = B_TRUE;
     4815 +        zone->zone_reboot_on_init_exit = B_FALSE;
     4816 +        zone->zone_init_status = -1;
4439 4817          zone->zone_brand = &native_brand;
4440 4818          zone->zone_initname = NULL;
4441 4819          mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
4442 4820          mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
4443 4821          mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
4444 4822          cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
4445 4823          list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
4446 4824              offsetof(zone_ref_t, zref_linkage));
4447 4825          list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
4448 4826              offsetof(struct zsd_entry, zsd_linkage));
↓ open down ↓ 41 lines elided ↑ open up ↑
4490 4868              kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4491 4869          (void) strcpy(zone->zone_initname, zone_default_initname);
4492 4870          zone->zone_nlwps = 0;
4493 4871          zone->zone_nlwps_ctl = INT_MAX;
4494 4872          zone->zone_nprocs = 0;
4495 4873          zone->zone_nprocs_ctl = INT_MAX;
4496 4874          zone->zone_locked_mem = 0;
4497 4875          zone->zone_locked_mem_ctl = UINT64_MAX;
4498 4876          zone->zone_max_swap = 0;
4499 4877          zone->zone_max_swap_ctl = UINT64_MAX;
     4878 +        zone->zone_phys_mem = 0;
     4879 +        zone->zone_phys_mem_ctl = UINT64_MAX;
4500 4880          zone->zone_max_lofi = 0;
4501 4881          zone->zone_max_lofi_ctl = UINT64_MAX;
4502 4882          zone->zone_lockedmem_kstat = NULL;
4503 4883          zone->zone_swapresv_kstat = NULL;
     4884 +        zone->zone_physmem_kstat = NULL;
4504 4885          zone->zone_zfs_io_pri = 1;
4505 4886  
4506 4887          /*
4507 4888           * Zsched initializes the rctls.
4508 4889           */
4509 4890          zone->zone_rctls = NULL;
4510 4891  
4511 4892          if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4512 4893                  zone_free(zone);
4513 4894                  return (zone_create_error(error, 0, extended_error));
↓ open down ↓ 135 lines elided ↑ open up ↑
4649 5030           * newproc() is successful.
4650 5031           */
4651 5032          list_insert_tail(&zone_active, zone);
4652 5033          mutex_exit(&zonehash_lock);
4653 5034  
4654 5035          zarg.zone = zone;
4655 5036          zarg.nvlist = rctls;
4656 5037          /*
4657 5038           * The process, task, and project rctls are probably wrong;
4658 5039           * we need an interface to get the default values of all rctls,
4659      -         * and initialize zsched appropriately.  I'm not sure that that
4660      -         * makes much of a difference, though.
     5040 +         * and initialize zsched appropriately. However, we allow zoneadmd
     5041 +         * to pass down both zone and project rctls for the zone's init.
4661 5042           */
4662 5043          error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4663 5044          if (error != 0) {
4664 5045                  /*
4665 5046                   * We need to undo all globally visible state.
4666 5047                   */
4667 5048                  mutex_enter(&zonehash_lock);
4668 5049                  list_remove(&zone_active, zone);
4669 5050                  if (zone->zone_flags & ZF_HASHED_LABEL) {
4670 5051                          ASSERT(zone->zone_slabel != NULL);
↓ open down ↓ 879 lines elided ↑ open up ↑
5550 5931                          outstr = zone->zone_bootargs;
5551 5932                  size = strlen(outstr) + 1;
5552 5933                  if (bufsize > size)
5553 5934                          bufsize = size;
5554 5935                  if (buf != NULL) {
5555 5936                          err = copyoutstr(outstr, buf, bufsize, NULL);
5556 5937                          if (err != 0 && err != ENAMETOOLONG)
5557 5938                                  error = EFAULT;
5558 5939                  }
5559 5940                  break;
5560      -        case ZONE_ATTR_PHYS_MCAP:
5561      -                size = sizeof (zone->zone_phys_mcap);
5562      -                if (bufsize > size)
5563      -                        bufsize = size;
5564      -                if (buf != NULL &&
5565      -                    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
5566      -                        error = EFAULT;
5567      -                break;
5568 5941          case ZONE_ATTR_SCHED_CLASS:
5569 5942                  mutex_enter(&class_lock);
5570 5943  
5571 5944                  if (zone->zone_defaultcid >= loaded_classes)
5572 5945                          outstr = "";
5573 5946                  else
5574 5947                          outstr = sclass[zone->zone_defaultcid].cl_name;
5575 5948                  size = strlen(outstr) + 1;
5576 5949                  if (bufsize > size)
5577 5950                          bufsize = size;
↓ open down ↓ 34 lines elided ↑ open up ↑
5612 5985                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5613 5986                  if (copyin(buf, zbuf, bufsize) != 0) {
5614 5987                          error = EFAULT;
5615 5988                  } else {
5616 5989                          error = zone_get_network(zoneid, zbuf);
5617 5990                          if (error == 0 && copyout(zbuf, buf, bufsize) != 0)
5618 5991                                  error = EFAULT;
5619 5992                  }
5620 5993                  kmem_free(zbuf, bufsize);
5621 5994                  break;
     5995 +        case ZONE_ATTR_SCHED_FIXEDHI:
     5996 +                size = sizeof (boolean_t);
     5997 +                if (bufsize > size)
     5998 +                        bufsize = size;
     5999 +
     6000 +                if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf,
     6001 +                    bufsize) != 0)
     6002 +                        error = EFAULT;
     6003 +                break;
5622 6004          default:
5623 6005                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
5624 6006                          size = bufsize;
5625 6007                          error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
5626 6008                  } else {
5627 6009                          error = EINVAL;
5628 6010                  }
5629 6011          }
5630 6012          zone_rele(zone);
5631 6013  
↓ open down ↓ 11 lines elided ↑ open up ↑
5643 6025  {
5644 6026          zone_t *zone;
5645 6027          zone_status_t zone_status;
5646 6028          int err = -1;
5647 6029          zone_net_data_t *zbuf;
5648 6030  
5649 6031          if (secpolicy_zone_config(CRED()) != 0)
5650 6032                  return (set_errno(EPERM));
5651 6033  
5652 6034          /*
5653      -         * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5654      -         * global zone.
     6035 +         * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT
     6036 +         * attributes can be set on the global zone.
5655 6037           */
5656      -        if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
     6038 +        if (zoneid == GLOBAL_ZONEID &&
     6039 +            attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) {
5657 6040                  return (set_errno(EINVAL));
5658 6041          }
5659 6042  
5660 6043          mutex_enter(&zonehash_lock);
5661 6044          if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5662 6045                  mutex_exit(&zonehash_lock);
5663 6046                  return (set_errno(EINVAL));
5664 6047          }
5665 6048          zone_hold(zone);
5666 6049          mutex_exit(&zonehash_lock);
5667 6050  
5668 6051          /*
5669 6052           * At present most attributes can only be set on non-running,
5670 6053           * non-global zones.
5671 6054           */
5672 6055          zone_status = zone_status_get(zone);
5673      -        if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
     6056 +        if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT &&
     6057 +            attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS &&
     6058 +            zone_status > ZONE_IS_READY) {
5674 6059                  err = EINVAL;
5675 6060                  goto done;
5676 6061          }
5677 6062  
5678 6063          switch (attr) {
5679 6064          case ZONE_ATTR_INITNAME:
5680 6065                  err = zone_set_initname(zone, (const char *)buf);
5681 6066                  break;
5682 6067          case ZONE_ATTR_INITNORESTART:
5683 6068                  zone->zone_restart_init = B_FALSE;
↓ open down ↓ 1 lines elided ↑ open up ↑
5685 6070                  break;
5686 6071          case ZONE_ATTR_BOOTARGS:
5687 6072                  err = zone_set_bootargs(zone, (const char *)buf);
5688 6073                  break;
5689 6074          case ZONE_ATTR_BRAND:
5690 6075                  err = zone_set_brand(zone, (const char *)buf);
5691 6076                  break;
5692 6077          case ZONE_ATTR_FS_ALLOWED:
5693 6078                  err = zone_set_fs_allowed(zone, (const char *)buf);
5694 6079                  break;
5695      -        case ZONE_ATTR_PHYS_MCAP:
5696      -                err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
     6080 +        case ZONE_ATTR_PMCAP_NOVER:
     6081 +                err = zone_set_mcap_nover(zone, (const uint64_t *)buf);
5697 6082                  break;
     6083 +        case ZONE_ATTR_PMCAP_PAGEOUT:
     6084 +                err = zone_set_mcap_pageout(zone, (const uint64_t *)buf);
     6085 +                break;
     6086 +        case ZONE_ATTR_PG_FLT_DELAY:
     6087 +                err = zone_set_page_fault_delay(zone, (const uint32_t *)buf);
     6088 +                break;
     6089 +        case ZONE_ATTR_RSS:
     6090 +                err = zone_set_rss(zone, (const uint64_t *)buf);
     6091 +                break;
5698 6092          case ZONE_ATTR_SCHED_CLASS:
5699 6093                  err = zone_set_sched_class(zone, (const char *)buf);
5700 6094                  break;
5701 6095          case ZONE_ATTR_HOSTID:
5702 6096                  if (bufsize == sizeof (zone->zone_hostid)) {
5703 6097                          if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5704 6098                                  err = 0;
5705 6099                          else
5706 6100                                  err = EFAULT;
5707 6101                  } else {
↓ open down ↓ 7 lines elided ↑ open up ↑
5715 6109                  }
5716 6110                  zbuf = kmem_alloc(bufsize, KM_SLEEP);
5717 6111                  if (copyin(buf, zbuf, bufsize) != 0) {
5718 6112                          kmem_free(zbuf, bufsize);
5719 6113                          err = EFAULT;
5720 6114                          break;
5721 6115                  }
5722 6116                  err = zone_set_network(zoneid, zbuf);
5723 6117                  kmem_free(zbuf, bufsize);
5724 6118                  break;
     6119 +        case ZONE_ATTR_APP_SVC_CT:
     6120 +                if (bufsize != sizeof (boolean_t)) {
     6121 +                        err = EINVAL;
     6122 +                } else {
     6123 +                        zone->zone_setup_app_contract = (boolean_t)buf;
     6124 +                        err = 0;
     6125 +                }
     6126 +                break;
     6127 +        case ZONE_ATTR_SCHED_FIXEDHI:
     6128 +                if (bufsize != sizeof (boolean_t)) {
     6129 +                        err = EINVAL;
     6130 +                } else {
     6131 +                        zone->zone_fixed_hipri = (boolean_t)buf;
     6132 +                        err = 0;
     6133 +                }
     6134 +                break;
5725 6135          default:
5726 6136                  if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5727 6137                          err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5728 6138                  else
5729 6139                          err = EINVAL;
5730 6140          }
5731 6141  
5732 6142  done:
5733 6143          zone_rele(zone);
5734 6144          ASSERT(err != -1);
↓ open down ↓ 783 lines elided ↑ open up ↑
6518 6928  
6519 6929          zone = zargp->zone;
6520 6930          arg = zargp->arg;
6521 6931          kmem_free(zargp, sizeof (*zargp));
6522 6932  
6523 6933          zone_namelen = strlen(zone->zone_name) + 1;
6524 6934          zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
6525 6935          bcopy(zone->zone_name, zone_name, zone_namelen);
6526 6936          zoneid = zone->zone_id;
6527 6937          uniqid = zone->zone_uniqid;
     6938 +        arg.status = zone->zone_init_status;
6528 6939          /*
6529 6940           * zoneadmd may be down, but at least we can empty out the zone.
6530 6941           * We can ignore the return value of zone_empty() since we're called
6531 6942           * from a kernel thread and know we won't be delivered any signals.
6532 6943           */
6533 6944          ASSERT(curproc == &p0);
6534 6945          (void) zone_empty(zone);
6535 6946          ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
6536 6947          zone_rele(zone);
6537 6948  
↓ open down ↓ 220 lines elided ↑ open up ↑
6758 7169          for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6759 7170              current_zonep = list_next(&zone_active, current_zonep)) {
6760 7171                  if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6761 7172                          zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6762 7173          }
6763 7174          mutex_exit(&zone_status_lock);
6764 7175          mutex_exit(&zonehash_lock);
6765 7176  }
6766 7177  
6767 7178  /*
6768      - * Returns true if the named dataset is visible in the current zone.
     7179 + * Returns true if the named dataset is visible in the specified zone.
6769 7180   * The 'write' parameter is set to 1 if the dataset is also writable.
6770 7181   */
6771 7182  int
6772      -zone_dataset_visible(const char *dataset, int *write)
     7183 +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write)
6773 7184  {
6774 7185          static int zfstype = -1;
6775 7186          zone_dataset_t *zd;
6776 7187          size_t len;
6777      -        zone_t *zone = curproc->p_zone;
6778 7188          const char *name = NULL;
6779 7189          vfs_t *vfsp = NULL;
6780 7190  
6781 7191          if (dataset[0] == '\0')
6782 7192                  return (0);
6783 7193  
6784 7194          /*
6785 7195           * Walk the list once, looking for datasets which match exactly, or
6786 7196           * specify a dataset underneath an exported dataset.  If found, return
6787 7197           * true and note that it is writable.
↓ open down ↓ 47 lines elided ↑ open up ↑
6835 7245           */
6836 7246          if (zfstype == -1) {
6837 7247                  struct vfssw *vswp = vfs_getvfssw("zfs");
6838 7248                  zfstype = vswp - vfssw;
6839 7249                  vfs_unrefvfssw(vswp);
6840 7250          }
6841 7251  
6842 7252          vfs_list_read_lock();
6843 7253          vfsp = zone->zone_vfslist;
6844 7254          do {
6845      -                ASSERT(vfsp);
     7255 +                if (vfsp == NULL)
     7256 +                        break;
6846 7257                  if (vfsp->vfs_fstype == zfstype) {
6847 7258                          name = refstr_value(vfsp->vfs_resource);
6848 7259  
6849 7260                          /*
6850 7261                           * Check if we have an exact match.
6851 7262                           */
6852 7263                          if (strcmp(dataset, name) == 0) {
6853 7264                                  vfs_list_unlock();
6854 7265                                  if (write)
6855 7266                                          *write = 0;
↓ open down ↓ 15 lines elided ↑ open up ↑
6871 7282                                  return (1);
6872 7283                          }
6873 7284                  }
6874 7285                  vfsp = vfsp->vfs_zone_next;
6875 7286          } while (vfsp != zone->zone_vfslist);
6876 7287  
6877 7288          vfs_list_unlock();
6878 7289          return (0);
6879 7290  }
6880 7291  
     7292 +/*
     7293 + * Returns true if the named dataset is visible in the current zone.
     7294 + * The 'write' parameter is set to 1 if the dataset is also writable.
     7295 + */
     7296 +int
     7297 +zone_dataset_visible(const char *dataset, int *write)
     7298 +{
     7299 +        zone_t *zone = curproc->p_zone;
     7300 +
     7301 +        return (zone_dataset_visible_inzone(zone, dataset, write));
     7302 +}
     7303 +
6881 7304  /*
6882 7305   * zone_find_by_any_path() -
6883 7306   *
6884 7307   * kernel-private routine similar to zone_find_by_path(), but which
6885 7308   * effectively compares against zone paths rather than zonerootpath
6886 7309   * (i.e., the last component of zonerootpaths, which should be "root/",
6887 7310   * are not compared.)  This is done in order to accurately identify all
6888 7311   * paths, whether zone-visible or not, including those which are parallel
6889 7312   * to /root/, such as /dev/, /home/, etc...
6890 7313   *
↓ open down ↓ 428 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX