OS-5078 illumos#6514 broke vm_usage and lx proc
OS-2969 vm_getusage syscall accurate zone RSS is overcounting
OS-3088 need a lighterweight page invalidation mechanism for zone memcap
OS-881 To workaround OS-580 add support to only invalidate mappings from a single process
OS-750 improve RUSAGESYS_GETVMUSAGE for zoneadmd
OS-399 zone phys. mem. cap should be a rctl and have associated kstat

--- old/usr/src/uts/common/vm/vm_usage.c
+++ new/usr/src/uts/common/vm/vm_usage.c
... 17 lines elided ...
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   */
  26   26  
  27   27  /*
       28 + * Copyright 2016, Joyent, Inc.
       29 + */
       30 +
       31 +/*
  28   32   * vm_usage
  29   33   *
  30   34   * This file implements the getvmusage() private system call.
  31   35   * getvmusage() counts the amount of resident memory pages and swap
  32   36   * reserved by the specified process collective. A "process collective" is
  33   37   * the set of processes owned by a particular zone, project, task, or user.
  34   38   *
  35   39   * rss and swap are counted so that for a given process collective, a page is
  36   40   * only counted once.  For example, this means that if multiple processes in
  37   41   * the same project map the same page, then the project will only be charged
... 473 lines elided ...
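
For reference, a minimal userland sketch of the interface this file implements. This is hedged: it assumes the private getvmusage(2) entry point and the vmusage_t layout from <sys/vm_usage.h>, and it is not part of this change.

    #include <sys/types.h>
    #include <sys/vm_usage.h>
    #include <stdio.h>

    /*
     * Print RSS and swap reserved for the caller's zone, accepting
     * cached results up to 5 seconds old (the "age" argument).
     */
    int
    main(void)
    {
            vmusage_t res[32];
            size_t nres = 32;
            size_t i;

            if (getvmusage(VMUSAGE_ZONE, 5, res, &nres) != 0) {
                    perror("getvmusage");
                    return (1);
            }
            for (i = 0; i < nres && i < 32; i++) {
                    (void) printf("zone %d: rss %llu swap %llu\n",
                        (int)res[i].vmu_zoneid,
                        (u_longlong_t)res[i].vmu_rss_all,
                        (u_longlong_t)res[i].vmu_swap_all);
            }
            return (0);
    }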
 511  515                  vmu_data.vmu_free_zones =
 512  516                      vmu_data.vmu_free_zones->vmz_next;
 513  517                  zone->vmz_next = NULL;
 514  518                  zone->vmz_zone = NULL;
 515  519          } else {
 516  520                  zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
 517  521          }
 518  522  
 519  523          zone->vmz_id = id;
 520  524  
 521      -        if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
      525 +        if ((vmu_data.vmu_calc_flags &
      526 +            (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
 522  527                  zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 523  528  
 524  529          if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
 525  530              VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
 526  531                  zone->vmz_projects_hash = mod_hash_create_idhash(
 527  532                      "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 528  533  
 529  534          if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
 530  535              != 0 && zone->vmz_tasks_hash == NULL)
 531  536                  zone->vmz_tasks_hash = mod_hash_create_idhash(
... 379 lines elided ...
 911  916          for (;;) {
 912  917                  if (incore == B_TRUE)
 913  918                          next->vmb_type = VMUSAGE_BOUND_INCORE;
 914  919  
 915  920                  if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 916  921                          if (next == *last)
 917  922                                  break;
 918  923                          next = AVL_NEXT(tree, next);
 919  924                          continue;
 920  925                  }
      926 +
      927 +                ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
 921  928                  bound_type = next->vmb_type;
 922  929                  index = next->vmb_start;
 923  930                  while (index <= next->vmb_end) {
 924  931  
 925  932                          /*
 926  933                           * These are used to determine how much to increment
 927  934                           * index when a large page is found.
 928  935                           */
 929  936                          page_t *page;
 930  937                          pgcnt_t pgcnt = 1;
 931  938                          uint_t pgshft;
 932  939                          pgcnt_t pgmsk;
 933  940  
 934  941                          ap = anon_get_ptr(amp->ahp, index);
 935  942                          if (ap != NULL)
 936  943                                  swap_xlate(ap, &vn, &off);
 937  944  
 938  945                          if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
 939  946                              (page = page_exists(vn, off)) != NULL) {
 940      -                                page_type = VMUSAGE_BOUND_INCORE;
      947 +                                if (PP_ISFREE(page))
      948 +                                        page_type = VMUSAGE_BOUND_NOT_INCORE;
      949 +                                else
      950 +                                        page_type = VMUSAGE_BOUND_INCORE;
 941  951                                  if (page->p_szc > 0) {
 942  952                                          pgcnt = page_get_pagecnt(page->p_szc);
 943  953                                          pgshft = page_get_shift(page->p_szc);
 944  954                                          pgmsk = (0x1 << (pgshft - PAGESHIFT))
 945  955                                              - 1;
 946  956                                  }
 947  957                          } else {
 948  958                                  page_type = VMUSAGE_BOUND_NOT_INCORE;
 949  959                          }
      960 +
 950  961                          if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 951  962                                  next->vmb_type = page_type;
      963 +                                bound_type = page_type;
 952  964                          } else if (next->vmb_type != page_type) {
 953  965                                  /*
 954  966                                   * If current bound type does not match page
 955  967                                   * type, need to split off new bound.
 956  968                                   */
 957  969                                  tmp = vmu_alloc_bound();
 958  970                                  tmp->vmb_type = page_type;
 959  971                                  tmp->vmb_start = index;
 960  972                                  tmp->vmb_end = next->vmb_end;
 961  973                                  avl_insert_here(tree, tmp, next, AVL_AFTER);
... 40 lines elided ...
1002 1014                  if (vnode->v_pages == NULL)
1003 1015                          next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004 1016  
1005 1017                  if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006 1018                          if (next == *last)
1007 1019                                  break;
1008 1020                          next = AVL_NEXT(tree, next);
1009 1021                          continue;
1010 1022                  }
1011 1023  
     1024 +                ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1012 1025                  bound_type = next->vmb_type;
1013 1026                  index = next->vmb_start;
1014 1027                  while (index <= next->vmb_end) {
1015 1028  
1016 1029                          /*
1017 1030                           * These are used to determine how much to increment
1018 1031                           * index when a large page is found.
1019 1032                           */
1020 1033                          page_t *page;
1021 1034                          pgcnt_t pgcnt = 1;
1022 1035                          uint_t pgshft;
1023 1036                          pgcnt_t pgmsk;
1024 1037  
1025 1038                          if (vnode->v_pages != NULL &&
1026 1039                              (page = page_exists(vnode, ptob(index))) != NULL) {
1027      -                                page_type = VMUSAGE_BOUND_INCORE;
     1040 +                                if (PP_ISFREE(page))
     1041 +                                        page_type = VMUSAGE_BOUND_NOT_INCORE;
     1042 +                                else
     1043 +                                        page_type = VMUSAGE_BOUND_INCORE;
1028 1044                                  if (page->p_szc > 0) {
1029 1045                                          pgcnt = page_get_pagecnt(page->p_szc);
1030 1046                                          pgshft = page_get_shift(page->p_szc);
1031 1047                                          pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032 1048                                              - 1;
1033 1049                                  }
1034 1050                          } else {
1035 1051                                  page_type = VMUSAGE_BOUND_NOT_INCORE;
1036 1052                          }
     1053 +
1037 1054                          if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038 1055                                  next->vmb_type = page_type;
     1056 +                                bound_type = page_type;
1039 1057                          } else if (next->vmb_type != page_type) {
1040 1058                                  /*
1041 1059                                   * If current bound type does not match page
1042 1060                                   * type, need to split off new bound.
1043 1061                                   */
1044 1062                                  tmp = vmu_alloc_bound();
1045 1063                                  tmp->vmb_type = page_type;
1046 1064                                  tmp->vmb_start = index;
1047 1065                                  tmp->vmb_end = next->vmb_end;
1048 1066                                  avl_insert_here(tree, tmp, next, AVL_AFTER);
... 248 lines elided ...
1297 1315                                   * or at page after end of mapping.
1298 1316                                   */
1299 1317                                  p_index += pgcnt;
1300 1318                                  s_index += pgcnt;
1301 1319                          } else {
1302 1320                                  p_index++;
1303 1321                                  s_index++;
1304 1322                          }
1305 1323  
1306 1324                          /*
     1325 +                         * Pages on the free list aren't counted for the rss.
     1326 +                         */
     1327 +                        if (PP_ISFREE(page))
     1328 +                                continue;
     1329 +
     1330 +                        /*
1307 1331                           * Assume anon structs with a refcnt
1308 1332                           * of 1 are not COW shared, so there
1309 1333                           * is no reason to track them per entity.
1310 1334                           */
1311 1335                          if (cnt == 1) {
1312 1336                                  panon += pgcnt;
1313 1337                                  continue;
1314 1338                          }
1315 1339                          for (entity = vmu_entities; entity != NULL;
1316 1340                              entity = entity->vme_next_calc) {
... 137 lines elided ...
1454 1478          struct seg *seg;
1455 1479          int ret;
1456 1480  
1457 1481          /* Figure out which entities are being computed */
1458 1482          if ((vmu_data.vmu_system) != NULL) {
1459 1483                  tmp = vmu_data.vmu_system;
1460 1484                  tmp->vme_next_calc = entities;
1461 1485                  entities = tmp;
1462 1486          }
1463 1487          if (vmu_data.vmu_calc_flags &
1464      -            (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1465      -            VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
     1488 +            (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
     1489 +            VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
     1490 +            VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1466 1491              VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1467 1492              VMUSAGE_ALL_EUSERS)) {
1468 1493                  ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1469 1494                      (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1470 1495                      (mod_hash_val_t *)&zone);
1471 1496                  if (ret != 0) {
1472 1497                          zone = vmu_alloc_zone(p->p_zone->zone_id);
1473 1498                          ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1474 1499                              (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1475 1500                              (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
... 256 lines elided ...
1732 1757          ASSERT(cache->vmc_refcnt > 0);
1733 1758          cache->vmc_refcnt--;
1734 1759          if (cache->vmc_refcnt == 0) {
1735 1760                  kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1736 1761                      cache->vmc_nresults);
1737 1762                  kmem_free(cache, sizeof (vmu_cache_t));
1738 1763          }
1739 1764  }
1740 1765  
1741 1766  /*
     1767 + * When new data is calculated, update the phys_mem rctl usage value in the
     1768 + * zones.
     1769 + */
     1770 +static void
     1771 +vmu_update_zone_rctls(vmu_cache_t *cache)
     1772 +{
     1773 +        vmusage_t       *rp;
     1774 +        size_t          i = 0;
     1775 +        zone_t          *zp;
     1776 +
     1777 +        for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
     1778 +                if (rp->vmu_type == VMUSAGE_ZONE &&
     1779 +                    rp->vmu_zoneid != ALL_ZONES) {
     1780 +                        if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
     1781 +                                zp->zone_phys_mem = rp->vmu_rss_all;
     1782 +                                zone_rele(zp);
     1783 +                        }
     1784 +                }
     1785 +        }
     1786 +}
     1787 +
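The zone_phys_mem value maintained above is what the per-zone physical memory rctl and kstat consumers ultimately read (OS-399). A hedged userland sketch of one such consumer follows; it assumes the SmartOS "memory_cap" kstat module with an "rss" named value, plumbing that is outside this diff.

    #include <sys/types.h>
    #include <kstat.h>
    #include <stdio.h>

    /* Print a zone's RSS as surfaced through the memory_cap kstat. */
    static int
    print_zone_rss(const char *zonename)
    {
            kstat_ctl_t *kc;
            kstat_t *ksp;
            kstat_named_t *kn;
            int ret = -1;

            if ((kc = kstat_open()) == NULL)
                    return (-1);
            ksp = kstat_lookup(kc, "memory_cap", -1, (char *)zonename);
            if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
                (kn = kstat_data_lookup(ksp, "rss")) != NULL) {
                    (void) printf("%s rss: %llu\n", zonename,
                        (u_longlong_t)kn->value.ui64);
                    ret = 0;
            }
            (void) kstat_close(kc);
            return (ret);
    }
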
     1788 +/*
 1742 1789   * Copy out the cached results to a caller.  Inspect the caller's flags
1743 1790   * and zone to determine which cached results should be copied.
1744 1791   */
1745 1792  static int
1746 1793  vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1747      -    uint_t flags, int cpflg)
     1794 +    uint_t flags, id_t req_zone_id, int cpflg)
1748 1795  {
1749 1796          vmusage_t *result, *out_result;
1750 1797          vmusage_t dummy;
1751 1798          size_t i, count = 0;
1752 1799          size_t bufsize;
1753 1800          int ret = 0;
1754 1801          uint_t types = 0;
1755 1802  
1756 1803          if (nres != NULL) {
1757 1804                  if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1758 1805                          return (set_errno(EFAULT));
1759 1806          } else {
1760 1807                  bufsize = 0;
1761 1808          }
1762 1809  
1763 1810          /* figure out what results the caller is interested in. */
1764 1811          if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1765 1812                  types |= VMUSAGE_SYSTEM;
1766      -        if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
     1813 +        if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1767 1814                  types |= VMUSAGE_ZONE;
1768 1815          if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1769 1816              VMUSAGE_COL_PROJECTS))
1770 1817                  types |= VMUSAGE_PROJECTS;
1771 1818          if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1772 1819                  types |= VMUSAGE_TASKS;
1773 1820          if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1774 1821                  types |= VMUSAGE_RUSERS;
1775 1822          if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1776 1823                  types |= VMUSAGE_EUSERS;
... 42 lines elided ...
1819 1866                              (flags & VMUSAGE_COL_PROJECTS) == 0)
1820 1867                                  continue;
1821 1868                          if (result->vmu_type == VMUSAGE_EUSERS &&
1822 1869                              (flags & VMUSAGE_COL_EUSERS) == 0)
1823 1870                                  continue;
1824 1871                          if (result->vmu_type == VMUSAGE_RUSERS &&
1825 1872                              (flags & VMUSAGE_COL_RUSERS) == 0)
1826 1873                                  continue;
1827 1874                  }
1828 1875  
1829      -                /* Skip "other zone" results if not requested */
1830      -                if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1831      -                        if (result->vmu_type == VMUSAGE_ZONE &&
1832      -                            (flags & VMUSAGE_ALL_ZONES) == 0)
     1876 +                if (result->vmu_type == VMUSAGE_ZONE &&
     1877 +                    flags & VMUSAGE_A_ZONE) {
     1878 +                        /* Skip non-requested zone results */
     1879 +                        if (result->vmu_zoneid != req_zone_id)
1833 1880                                  continue;
1834      -                        if (result->vmu_type == VMUSAGE_PROJECTS &&
1835      -                            (flags & (VMUSAGE_ALL_PROJECTS |
1836      -                            VMUSAGE_COL_PROJECTS)) == 0)
1837      -                                continue;
1838      -                        if (result->vmu_type == VMUSAGE_TASKS &&
1839      -                            (flags & VMUSAGE_ALL_TASKS) == 0)
1840      -                                continue;
1841      -                        if (result->vmu_type == VMUSAGE_RUSERS &&
1842      -                            (flags & (VMUSAGE_ALL_RUSERS |
1843      -                            VMUSAGE_COL_RUSERS)) == 0)
1844      -                                continue;
1845      -                        if (result->vmu_type == VMUSAGE_EUSERS &&
1846      -                            (flags & (VMUSAGE_ALL_EUSERS |
1847      -                            VMUSAGE_COL_EUSERS)) == 0)
1848      -                                continue;
     1881 +                } else {
     1882 +                        /* Skip "other zone" results if not requested */
     1883 +                        if (result->vmu_zoneid != curproc->p_zone->zone_id) {
     1884 +                                if (result->vmu_type == VMUSAGE_ZONE &&
     1885 +                                    (flags & VMUSAGE_ALL_ZONES) == 0)
     1886 +                                        continue;
     1887 +                                if (result->vmu_type == VMUSAGE_PROJECTS &&
     1888 +                                    (flags & (VMUSAGE_ALL_PROJECTS |
     1889 +                                    VMUSAGE_COL_PROJECTS)) == 0)
     1890 +                                        continue;
     1891 +                                if (result->vmu_type == VMUSAGE_TASKS &&
     1892 +                                    (flags & VMUSAGE_ALL_TASKS) == 0)
     1893 +                                        continue;
     1894 +                                if (result->vmu_type == VMUSAGE_RUSERS &&
     1895 +                                    (flags & (VMUSAGE_ALL_RUSERS |
     1896 +                                    VMUSAGE_COL_RUSERS)) == 0)
     1897 +                                        continue;
     1898 +                                if (result->vmu_type == VMUSAGE_EUSERS &&
     1899 +                                    (flags & (VMUSAGE_ALL_EUSERS |
     1900 +                                    VMUSAGE_COL_EUSERS)) == 0)
     1901 +                                        continue;
     1902 +                        }
1849 1903                  }
1850 1904                  count++;
1851 1905                  if (out_result != NULL) {
1852 1906                          if (bufsize < count) {
1853 1907                                  ret = set_errno(EOVERFLOW);
1854 1908                          } else {
1855 1909                                  if (ddi_copyout(result, out_result,
1856 1910                                      sizeof (vmusage_t), cpflg))
1857 1911                                          return (set_errno(EFAULT));
1858 1912                                  out_result++;
... 35 lines elided ...
1894 1948   */
1895 1949  int
1896 1950  vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1897 1951  {
1898 1952          vmu_entity_t *entity;
1899 1953          vmusage_t *result;
1900 1954          int ret = 0;
1901 1955          int cacherecent = 0;
1902 1956          hrtime_t now;
1903 1957          uint_t flags_orig;
     1958 +        id_t req_zone_id;
1904 1959  
1905 1960          /*
1906 1961           * Non-global zones cannot request system wide and/or collated
1907      -         * results, or the system result, so munge the flags accordingly.
     1962 +         * results, or the system result, or usage of another zone, so munge
     1963 +         * the flags accordingly.
1908 1964           */
1909 1965          flags_orig = flags;
1910 1966          if (curproc->p_zone != global_zone) {
1911 1967                  if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1912 1968                          flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1913 1969                          flags |= VMUSAGE_PROJECTS;
1914 1970                  }
1915 1971                  if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1916 1972                          flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1917 1973                          flags |= VMUSAGE_RUSERS;
1918 1974                  }
1919 1975                  if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1920 1976                          flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1921 1977                          flags |= VMUSAGE_EUSERS;
1922 1978                  }
1923 1979                  if (flags & VMUSAGE_SYSTEM) {
1924 1980                          flags &= ~VMUSAGE_SYSTEM;
1925 1981                          flags |= VMUSAGE_ZONE;
1926 1982                  }
     1983 +                if (flags & VMUSAGE_A_ZONE) {
     1984 +                        flags &= ~VMUSAGE_A_ZONE;
     1985 +                        flags |= VMUSAGE_ZONE;
     1986 +                }
1927 1987          }
1928 1988  
1929 1989          /* Check for unknown flags */
1930 1990          if ((flags & (~VMUSAGE_MASK)) != 0)
1931 1991                  return (set_errno(EINVAL));
1932 1992  
1933 1993          /* Check for no flags */
1934 1994          if ((flags & VMUSAGE_MASK) == 0)
1935 1995                  return (set_errno(EINVAL));
1936 1996  
     1997 +        /* If requesting results for a specific zone, get the zone ID */
     1998 +        if (flags & VMUSAGE_A_ZONE) {
     1999 +                size_t bufsize;
     2000 +                vmusage_t zreq;
     2001 +
     2002 +                if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
     2003 +                        return (set_errno(EFAULT));
     2004 +                /* Requested zone ID is passed in buf, so 0 len not allowed */
     2005 +                if (bufsize == 0)
     2006 +                        return (set_errno(EINVAL));
     2007 +                if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
     2008 +                        return (set_errno(EFAULT));
     2009 +                req_zone_id = zreq.vmu_id;
     2010 +        }
     2011 +
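To make the convention above concrete: a hedged sketch of a global-zone caller asking for one specific zone's usage, passing the target zone ID in vmu_id of the first vmusage_t in buf, exactly as it is read back here. VMUSAGE_A_ZONE is introduced by this change; the helper name is illustrative.

    #include <sys/types.h>
    #include <sys/vm_usage.h>

    /* Fetch the RSS of the zone identified by zid (global zone only). */
    static int
    zone_rss(id_t zid, uint64_t *rssp)
    {
            vmusage_t res[4];
            size_t nres = 4;

            res[0].vmu_id = zid;    /* consumed by the VMUSAGE_A_ZONE path */
            if (getvmusage(VMUSAGE_A_ZONE, 5, res, &nres) != 0 || nres < 1)
                    return (-1);
            *rssp = res[0].vmu_rss_all;
            return (0);
    }
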
1937 2012          mutex_enter(&vmu_data.vmu_lock);
1938 2013          now = gethrtime();
1939 2014  
1940 2015  start:
1941 2016          if (vmu_data.vmu_cache != NULL) {
1942 2017  
1943 2018                  vmu_cache_t *cache;
1944 2019  
1945 2020                  if ((vmu_data.vmu_cache->vmc_timestamp +
1946 2021                      ((hrtime_t)age * NANOSEC)) > now)
1947 2022                          cacherecent = 1;
1948 2023  
1949 2024                  if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1950 2025                      cacherecent == 1) {
1951 2026                          cache = vmu_data.vmu_cache;
1952 2027                          vmu_cache_hold(cache);
1953 2028                          mutex_exit(&vmu_data.vmu_lock);
1954 2029  
1955 2030                          ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1956      -                            cpflg);
     2031 +                            req_zone_id, cpflg);
1957 2032                          mutex_enter(&vmu_data.vmu_lock);
1958 2033                          vmu_cache_rele(cache);
1959 2034                          if (vmu_data.vmu_pending_waiters > 0)
1960 2035                                  cv_broadcast(&vmu_data.vmu_cv);
1961 2036                          mutex_exit(&vmu_data.vmu_lock);
1962 2037                          return (ret);
1963 2038                  }
1964 2039                  /*
1965 2040                   * If the cache is recent, it is likely that there are other
1966 2041                   * consumers of vm_getusage running, so add their flags to the
... 35 lines elided ...
2002 2077                  vmu_cache_hold(cache);
2003 2078  
2004 2079                  vmu_data.vmu_calc_flags = 0;
2005 2080                  vmu_data.vmu_calc_thread = NULL;
2006 2081  
2007 2082                  if (vmu_data.vmu_pending_waiters > 0)
2008 2083                          cv_broadcast(&vmu_data.vmu_cv);
2009 2084  
2010 2085                  mutex_exit(&vmu_data.vmu_lock);
2011 2086  
     2087 +                /* update zone's phys. mem. rctl usage */
     2088 +                vmu_update_zone_rctls(cache);
2012 2089                  /* copy cache */
2013      -                ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
     2090 +                ret = vmu_copyout_results(cache, buf, nres, flags_orig,
     2091 +                    req_zone_id, cpflg);
2014 2092                  mutex_enter(&vmu_data.vmu_lock);
2015 2093                  vmu_cache_rele(cache);
2016 2094                  mutex_exit(&vmu_data.vmu_lock);
2017 2095  
2018 2096                  return (ret);
2019 2097          }
2020 2098          vmu_data.vmu_pending_flags |= flags;
2021 2099          vmu_data.vmu_pending_waiters++;
2022 2100          while (vmu_data.vmu_calc_thread != NULL) {
2023 2101                  if (cv_wait_sig(&vmu_data.vmu_cv,
2024 2102                      &vmu_data.vmu_lock) == 0) {
2025 2103                          vmu_data.vmu_pending_waiters--;
2026 2104                          mutex_exit(&vmu_data.vmu_lock);
2027 2105                          return (set_errno(EINTR));
2028 2106                  }
2029 2107          }
2030 2108          vmu_data.vmu_pending_waiters--;
2031 2109          goto start;
2032 2110  }
     2111 +
     2112 +#if defined(__x86)
     2113 +/*
     2114 + * Attempt to invalidate all of the pages in the mapping for the given process.
     2115 + */
     2116 +static void
     2117 +map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
     2118 +{
     2119 +        page_t          *pp;
     2120 +        size_t          psize;
     2121 +        u_offset_t      off;
     2122 +        caddr_t         eaddr;
     2123 +        struct vnode    *vp;
     2124 +        struct segvn_data *svd;
     2125 +        struct hat      *victim_hat;
     2126 +
     2127 +        ASSERT((addr + size) <= (seg->s_base + seg->s_size));
     2128 +
     2129 +        victim_hat = p->p_as->a_hat;
     2130 +        svd = (struct segvn_data *)seg->s_data;
     2131 +        vp = svd->vp;
     2132 +        psize = page_get_pagesize(seg->s_szc);
     2133 +
     2134 +        off = svd->offset + (uintptr_t)(addr - seg->s_base);
     2135 +
     2136 +        for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
     2137 +                pp = page_lookup_nowait(vp, off, SE_SHARED);
     2138 +
     2139 +                if (pp != NULL) {
     2140 +                        /* following logic based on pvn_getdirty() */
     2141 +
     2142 +                        if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
     2143 +                                page_unlock(pp);
     2144 +                                continue;
     2145 +                        }
     2146 +
     2147 +                        page_io_lock(pp);
     2148 +                        hat_page_inval(pp, 0, victim_hat);
     2149 +                        page_io_unlock(pp);
     2150 +
     2151 +                        /*
     2152 +                         * For B_INVALCURONLY-style handling we let
     2153 +                         * page_release call VN_DISPOSE if no one else is using
     2154 +                         * the page.
     2155 +                         *
     2156 +                         * A hat_ismod() check would be useless because:
      2157 +                         * (1) we are not holding the SE_EXCL lock
     2158 +                         * (2) we've not unloaded _all_ translations
     2159 +                         *
     2160 +                         * Let page_release() do the heavy-lifting.
     2161 +                         */
     2162 +                        (void) page_release(pp, 1);
     2163 +                }
     2164 +        }
     2165 +}
     2166 +
     2167 +/*
     2168 + * vm_map_inval()
     2169 + *
     2170 + * Invalidate as many pages as possible within the given mapping for the given
     2171 + * process. addr is expected to be the base address of the mapping and size is
     2172 + * the length of the mapping. In some cases a mapping will encompass an
     2173 + * entire segment, but at least for anon or stack mappings, these will be
     2174 + * regions within a single large segment. Thus, the invalidation is oriented
     2175 + * around a single mapping and not an entire segment.
     2176 + *
     2177 + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
     2178 + * this code is only applicable to x86.
     2179 + */
     2180 +int
     2181 +vm_map_inval(pid_t pid, caddr_t addr, size_t size)
     2182 +{
     2183 +        int ret;
     2184 +        int error = 0;
     2185 +        proc_t *p;              /* target proc */
     2186 +        struct as *as;          /* target proc's address space */
     2187 +        struct seg *seg;        /* working segment */
     2188 +
     2189 +        if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
     2190 +                return (set_errno(EPERM));
     2191 +
     2192 +        /* If not a valid mapping address, return an error */
     2193 +        if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
     2194 +                return (set_errno(EINVAL));
     2195 +
     2196 +again:
     2197 +        mutex_enter(&pidlock);
     2198 +        p = prfind(pid);
     2199 +        if (p == NULL) {
     2200 +                mutex_exit(&pidlock);
     2201 +                return (set_errno(ESRCH));
     2202 +        }
     2203 +
     2204 +        mutex_enter(&p->p_lock);
     2205 +        mutex_exit(&pidlock);
     2206 +
     2207 +        if (panicstr != NULL) {
     2208 +                mutex_exit(&p->p_lock);
     2209 +                return (0);
     2210 +        }
     2211 +
     2212 +        as = p->p_as;
     2213 +
     2214 +        /*
     2215 +         * Try to set P_PR_LOCK - prevents process "changing shape"
     2216 +         * - blocks fork
     2217 +         * - blocks sigkill
     2218 +         * - cannot be a system proc
     2219 +         * - must be fully created proc
     2220 +         */
     2221 +        ret = sprtrylock_proc(p);
     2222 +        if (ret == -1) {
     2223 +                /* Process in invalid state */
     2224 +                mutex_exit(&p->p_lock);
     2225 +                return (set_errno(ESRCH));
     2226 +        }
     2227 +
     2228 +        if (ret == 1) {
     2229 +                /*
     2230 +                 * P_PR_LOCK is already set. Wait and try again. This also
     2231 +                 * drops p_lock so p may no longer be valid since the proc may
     2232 +                 * have exited.
     2233 +                 */
     2234 +                sprwaitlock_proc(p);
     2235 +                goto again;
     2236 +        }
     2237 +
     2238 +        /* P_PR_LOCK is now set */
     2239 +        mutex_exit(&p->p_lock);
     2240 +
     2241 +        AS_LOCK_ENTER(as, RW_READER);
     2242 +        if ((seg = as_segat(as, addr)) == NULL) {
     2243 +                AS_LOCK_EXIT(as);
     2244 +                mutex_enter(&p->p_lock);
     2245 +                sprunlock(p);
     2246 +                return (set_errno(ENOMEM));
     2247 +        }
     2248 +
     2249 +        /*
     2250 +         * The invalidation behavior only makes sense for vnode-backed segments.
     2251 +         */
     2252 +        if (seg->s_ops != &segvn_ops) {
     2253 +                AS_LOCK_EXIT(as);
     2254 +                mutex_enter(&p->p_lock);
     2255 +                sprunlock(p);
     2256 +                return (0);
     2257 +        }
     2258 +
     2259 +        /*
      2260 +         * If the mapping is out of bounds of the segment, return an error.
     2261 +         */
     2262 +        if ((addr + size) > (seg->s_base + seg->s_size)) {
     2263 +                AS_LOCK_EXIT(as);
     2264 +                mutex_enter(&p->p_lock);
     2265 +                sprunlock(p);
     2266 +                return (set_errno(EINVAL));
     2267 +        }
     2268 +
     2269 +        /*
     2270 +         * Don't use MS_INVALCURPROC flag here since that would eventually
     2271 +         * initiate hat invalidation based on curthread. Since we're doing this
     2272 +         * on behalf of a different process, that would erroneously invalidate
     2273 +         * our own process mappings.
     2274 +         */
     2275 +        error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
     2276 +        if (error == 0) {
     2277 +                /*
     2278 +                 * Since we didn't invalidate during the sync above, we now
     2279 +                 * try to invalidate all of the pages in the mapping.
     2280 +                 */
     2281 +                map_inval(p, seg, addr, size);
     2282 +        }
     2283 +        AS_LOCK_EXIT(as);
     2284 +
     2285 +        mutex_enter(&p->p_lock);
     2286 +        sprunlock(p);
     2287 +
     2288 +        if (error)
     2289 +                (void) set_errno(error);
     2290 +        return (error);
     2291 +}
     2292 +#endif
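
Finally, a hedged in-kernel sketch of how a consumer (e.g. the zone memory capper) might drive vm_map_inval(); the plumbing that actually calls it is not part of this diff, and the helper shown is hypothetical.

    /*
     * Hypothetical caller: invalidate one page-aligned mapping of a
     * capped zone's process. Failures (target exited, mapping changed)
     * are non-fatal to a capper, so they are simply skipped.
     */
    static void
    capper_invalidate(pid_t pid, caddr_t base, size_t len)
    {
            ASSERT(((uintptr_t)base & PAGEOFFSET) == 0);

            if (vm_map_inval(pid, base, len) != 0) {
                    /* e.g. ESRCH or EINVAL; nothing useful to do here */
            }
    }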