8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * vm_usage
  29  *
  30  * This file implements the getvmusage() private system call.
  31  * getvmusage() counts the number of resident memory pages and the swap
  32  * reserved by the specified process collective. A "process collective" is
  33  * the set of processes owned by a particular zone, project, task, or user.
  34  *
  35  * rss and swap are counted so that for a given process collective, a page is
  36  * only counted once.  For example, this means that if multiple processes in
  37  * the same project map the same page, then the project will only be charged
  38  * once for that page.  On the other hand, if two processes in different
  39  * projects map the same page, then both projects will be charged
  40  * for the page.
  41  *
  42  * The vm_getusage() calculation is implemented so that the first thread
  43  * performs the rss/swap counting. Other callers will wait for that thread to
  44  * finish, copying the results.  This enables multiple rcapds and prstats to
  45  * consume data from the same calculation.  The results are also cached so that
  46  * a caller interested in recent results can just copy them instead of starting
  47  * a new calculation. The caller passes the maximum age (in seconds) of the
 
 
 501  * Allocate a zone entity, and hashes for tracking visited vm objects
 502  * for projects, tasks, and users within that zone.
 503  */
 504 static vmu_zone_t *
 505 vmu_alloc_zone(id_t id)
 506 {
 507         vmu_zone_t *zone;
 508 
 509         if (vmu_data.vmu_free_zones != NULL) {
 510                 zone = vmu_data.vmu_free_zones;
 511                 vmu_data.vmu_free_zones =
 512                     vmu_data.vmu_free_zones->vmz_next;
 513                 zone->vmz_next = NULL;
 514                 zone->vmz_zone = NULL;
 515         } else {
 516                 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
 517         }
 518 
 519         zone->vmz_id = id;
 520 
 521         if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
 522                 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 523 
 524         if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
 525             VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
 526                 zone->vmz_projects_hash = mod_hash_create_idhash(
 527                     "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 528 
 529         if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
 530             != 0 && zone->vmz_tasks_hash == NULL)
 531                 zone->vmz_tasks_hash = mod_hash_create_idhash(
 532                     "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 533 
 534         if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
 535             != 0 && zone->vmz_rusers_hash == NULL)
 536                 zone->vmz_rusers_hash = mod_hash_create_idhash(
 537                     "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 538 
 539         if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
 540             != 0 && zone->vmz_eusers_hash == NULL)
 541                 zone->vmz_eusers_hash = mod_hash_create_idhash(
 
 901         pgcnt_t index;
 902         short bound_type;
 903         short page_type;
 904         vnode_t *vn;
 905         anoff_t off;
 906         struct anon *ap;
 907 
 908         next = *first;
 909         /* Shared anon slots don't change once set. */
 910         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 911         for (;;) {
 912                 if (incore == B_TRUE)
 913                         next->vmb_type = VMUSAGE_BOUND_INCORE;
 914 
 915                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 916                         if (next == *last)
 917                                 break;
 918                         next = AVL_NEXT(tree, next);
 919                         continue;
 920                 }
 921                 bound_type = next->vmb_type;
 922                 index = next->vmb_start;
 923                 while (index <= next->vmb_end) {
 924 
 925                         /*
 926                          * These are used to determine how much to increment
 927                          * index when a large page is found.
 928                          */
 929                         page_t *page;
 930                         pgcnt_t pgcnt = 1;
 931                         uint_t pgshft;
 932                         pgcnt_t pgmsk;
 933 
 934                         ap = anon_get_ptr(amp->ahp, index);
 935                         if (ap != NULL)
 936                                 swap_xlate(ap, &vn, &off);
 937 
 938                         if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
 939                             (page = page_exists(vn, off)) != NULL) {
 940                                 page_type = VMUSAGE_BOUND_INCORE;
 941                                 if (page->p_szc > 0) {
 942                                         pgcnt = page_get_pagecnt(page->p_szc);
 943                                         pgshft = page_get_shift(page->p_szc);
 944                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
 945                                             - 1;
 946                                 }
 947                         } else {
 948                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
 949                         }
 950                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 951                                 next->vmb_type = page_type;
 952                         } else if (next->vmb_type != page_type) {
 953                                 /*
 954                                  * If current bound type does not match page
 955                                  * type, need to split off new bound.
 956                                  */
 957                                 tmp = vmu_alloc_bound();
 958                                 tmp->vmb_type = page_type;
 959                                 tmp->vmb_start = index;
 960                                 tmp->vmb_end = next->vmb_end;
 961                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
 962                                 next->vmb_end = index - 1;
 963                                 if (*last == next)
 964                                         *last = tmp;
 965                                 next = tmp;
 966                         }
 967                         if (pgcnt > 1) {
 968                                 /*
 969                                  * If inside large page, jump to next large
 970                                  * page
 971                                  */
 
 992     vmu_bound_t **first, vmu_bound_t **last)
 993 {
 994         vmu_bound_t *next;
 995         vmu_bound_t *tmp;
 996         pgcnt_t index;
 997         short bound_type;
 998         short page_type;
 999 
1000         next = *first;
1001         for (;;) {
1002                 if (vnode->v_pages == NULL)
1003                         next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004 
1005                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006                         if (next == *last)
1007                                 break;
1008                         next = AVL_NEXT(tree, next);
1009                         continue;
1010                 }
1011 
1012                 bound_type = next->vmb_type;
1013                 index = next->vmb_start;
1014                 while (index <= next->vmb_end) {
1015 
1016                         /*
1017                          * These are used to determine how much to increment
1018                          * index when a large page is found.
1019                          */
1020                         page_t *page;
1021                         pgcnt_t pgcnt = 1;
1022                         uint_t pgshft;
1023                         pgcnt_t pgmsk;
1024 
1025                         if (vnode->v_pages != NULL &&
1026                             (page = page_exists(vnode, ptob(index))) != NULL) {
1027                                 page_type = VMUSAGE_BOUND_INCORE;
1028                                 if (page->p_szc > 0) {
1029                                         pgcnt = page_get_pagecnt(page->p_szc);
1030                                         pgshft = page_get_shift(page->p_szc);
1031                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032                                             - 1;
1033                                 }
1034                         } else {
1035                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
1036                         }
1037                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038                                 next->vmb_type = page_type;
1039                         } else if (next->vmb_type != page_type) {
1040                                 /*
1041                                  * If current bound type does not match page
1042                                  * type, need to split off new bound.
1043                                  */
1044                                 tmp = vmu_alloc_bound();
1045                                 tmp->vmb_type = page_type;
1046                                 tmp->vmb_start = index;
1047                                 tmp->vmb_end = next->vmb_end;
1048                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
1049                                 next->vmb_end = index - 1;
1050                                 if (*last == next)
1051                                         *last = tmp;
1052                                 next = tmp;
1053                         }
1054                         if (pgcnt > 1) {
1055                                 /*
1056                                  * If inside large page, jump to next large
1057                                  * page
1058                                  */
 
1287                                         pgend = p_end;
1288 
1289                                 /*
1290                                  * Compute number of pages from large page
1291                                  * which are mapped.
1292                                  */
1293                                 pgcnt = pgend - p_index + 1;
1294 
1295                                 /*
1296                          * Point indices at page after large page,
1297                                  * or at page after end of mapping.
1298                                  */
1299                                 p_index += pgcnt;
1300                                 s_index += pgcnt;
1301                         } else {
1302                                 p_index++;
1303                                 s_index++;
1304                         }
1305 
1306                         /*
1307                          * Assume anon structs with a refcnt
1308                          * of 1 are not COW shared, so there
1309                          * is no reason to track them per entity.
1310                          */
1311                         if (cnt == 1) {
1312                                 panon += pgcnt;
1313                                 continue;
1314                         }
1315                         for (entity = vmu_entities; entity != NULL;
1316                             entity = entity->vme_next_calc) {
1317 
1318                                 result = &entity->vme_result;
1319                                 /*
1320                                  * Track COW anons per entity so
1321                                  * they are not double counted.
1322                                  */
1323                                 if (vmu_find_insert_anon(entity->vme_anon_hash,
1324                                     (caddr_t)ap) == 0)
1325                                         continue;
1326 
 
1444  * which are relative to the process.  Then calculate each segment
1445  * in the process's address space for each relevant entity.
1446  */
1447 static void
1448 vmu_calculate_proc(proc_t *p)
1449 {
1450         vmu_entity_t *entities = NULL;
1451         vmu_zone_t *zone;
1452         vmu_entity_t *tmp;
1453         struct as *as;
1454         struct seg *seg;
1455         int ret;
1456 
1457         /* Figure out which entities are being computed */
1458         if ((vmu_data.vmu_system) != NULL) {
1459                 tmp = vmu_data.vmu_system;
1460                 tmp->vme_next_calc = entities;
1461                 entities = tmp;
1462         }
1463         if (vmu_data.vmu_calc_flags &
1464             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1465             VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1466             VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1467             VMUSAGE_ALL_EUSERS)) {
1468                 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1469                     (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1470                     (mod_hash_val_t *)&zone);
1471                 if (ret != 0) {
1472                         zone = vmu_alloc_zone(p->p_zone->zone_id);
1473                         ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1474                             (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1475                             (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1476                         ASSERT(ret == 0);
1477                 }
1478                 if (zone->vmz_zone != NULL) {
1479                         tmp = zone->vmz_zone;
1480                         tmp->vme_next_calc = entities;
1481                         entities = tmp;
1482                 }
1483                 if (vmu_data.vmu_calc_flags &
1484                     (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1485                         tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
 
1722         cache->vmc_refcnt++;
1723 }
1724 
1725 /*
1726  * free cache data
1727  */
1728 static void
1729 vmu_cache_rele(vmu_cache_t *cache)
1730 {
1731         ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1732         ASSERT(cache->vmc_refcnt > 0);
1733         cache->vmc_refcnt--;
1734         if (cache->vmc_refcnt == 0) {
1735                 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1736                     cache->vmc_nresults);
1737                 kmem_free(cache, sizeof (vmu_cache_t));
1738         }
1739 }
1740 
1741 /*
1742  * Copy out the cached results to a caller.  Inspect the caller's flags
1743  * and zone to determine which cached results should be copied.
1744  */
1745 static int
1746 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1747     uint_t flags, int cpflg)
1748 {
1749         vmusage_t *result, *out_result;
1750         vmusage_t dummy;
1751         size_t i, count = 0;
1752         size_t bufsize;
1753         int ret = 0;
1754         uint_t types = 0;
1755 
1756         if (nres != NULL) {
1757                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1758                         return (set_errno(EFAULT));
1759         } else {
1760                 bufsize = 0;
1761         }
1762 
1763         /* figure out what results the caller is interested in. */
1764         if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1765                 types |= VMUSAGE_SYSTEM;
1766         if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1767                 types |= VMUSAGE_ZONE;
1768         if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1769             VMUSAGE_COL_PROJECTS))
1770                 types |= VMUSAGE_PROJECTS;
1771         if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1772                 types |= VMUSAGE_TASKS;
1773         if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1774                 types |= VMUSAGE_RUSERS;
1775         if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1776                 types |= VMUSAGE_EUSERS;
1777 
1778         /* count results for current zone */
1779         out_result = buf;
1780         for (result = cache->vmc_results, i = 0;
1781             i < cache->vmc_nresults; result++, i++) {
1782 
1783                 /* Do not return "other-zone" results to non-global zones */
1784                 if (curproc->p_zone != global_zone &&
1785                     curproc->p_zone->zone_id != result->vmu_zoneid)
1786                         continue;
 
1809                         }
1810                 }
1811 
1812                 /* Skip results that do not match requested type */
1813                 if ((result->vmu_type & types) == 0)
1814                         continue;
1815 
1816                 /* Skip collated results if not requested */
1817                 if (result->vmu_zoneid == ALL_ZONES) {
1818                         if (result->vmu_type == VMUSAGE_PROJECTS &&
1819                             (flags & VMUSAGE_COL_PROJECTS) == 0)
1820                                 continue;
1821                         if (result->vmu_type == VMUSAGE_EUSERS &&
1822                             (flags & VMUSAGE_COL_EUSERS) == 0)
1823                                 continue;
1824                         if (result->vmu_type == VMUSAGE_RUSERS &&
1825                             (flags & VMUSAGE_COL_RUSERS) == 0)
1826                                 continue;
1827                 }
1828 
1829                 /* Skip "other zone" results if not requested */
1830                 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1831                         if (result->vmu_type == VMUSAGE_ZONE &&
1832                             (flags & VMUSAGE_ALL_ZONES) == 0)
1833                                 continue;
1834                         if (result->vmu_type == VMUSAGE_PROJECTS &&
1835                             (flags & (VMUSAGE_ALL_PROJECTS |
1836                             VMUSAGE_COL_PROJECTS)) == 0)
1837                                 continue;
1838                         if (result->vmu_type == VMUSAGE_TASKS &&
1839                             (flags & VMUSAGE_ALL_TASKS) == 0)
1840                                 continue;
1841                         if (result->vmu_type == VMUSAGE_RUSERS &&
1842                             (flags & (VMUSAGE_ALL_RUSERS |
1843                             VMUSAGE_COL_RUSERS)) == 0)
1844                                 continue;
1845                         if (result->vmu_type == VMUSAGE_EUSERS &&
1846                             (flags & (VMUSAGE_ALL_EUSERS |
1847                             VMUSAGE_COL_EUSERS)) == 0)
1848                                 continue;
1849                 }
1850                 count++;
1851                 if (out_result != NULL) {
1852                         if (bufsize < count) {
1853                                 ret = set_errno(EOVERFLOW);
1854                         } else {
1855                                 if (ddi_copyout(result, out_result,
1856                                     sizeof (vmusage_t), cpflg))
1857                                         return (set_errno(EFAULT));
1858                                 out_result++;
1859                         }
1860                 }
1861         }
1862         if (nres != NULL)
1863                 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1864                         return (set_errno(EFAULT));
1865 
1866         return (ret);
1867 }
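
/*
 * Illustrative userland sketch (not part of this file): how a consumer
 * copes with the EOVERFLOW behavior implemented above.  When the buffer
 * is too small, vmu_copyout_results() keeps counting and still writes
 * the full result count back through nres, so the caller can resize and
 * retry.  This assumes the getvmusage(2) libc wrapper and the vmusage_t
 * layout from <sys/vm_usage.h>.
 */
#include <sys/vm_usage.h>
#include <errno.h>
#include <stdlib.h>

static vmusage_t *
all_zone_usage(size_t *countp)
{
        size_t nres = 8;                /* initial guess */
        vmusage_t *buf, *nbuf;

        if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
                return (NULL);
        /* accept cached results up to 30 seconds old */
        while (getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) != 0) {
                if (errno != EOVERFLOW) {       /* EINVAL, EFAULT, EINTR */
                        free(buf);
                        return (NULL);
                }
                /* nres now holds the count the kernel tried to return */
                if ((nbuf = realloc(buf, nres * sizeof (vmusage_t))) == NULL) {
                        free(buf);
                        return (NULL);
                }
                buf = nbuf;
        }
        *countp = nres;
        return (buf);
}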
1868 
1869 /*
 
1884  *      nres:   Set to number of vmusage_t structures pointed to by buf
1885  *              before calling vm_getusage().
1886  *              On a return of 0 (success) or EOVERFLOW, it is set to the
1887  *              number of result structures returned or attempted to return.
1888  *
1889  * returns 0 on success, -1 on failure:
1890  *      EINTR (interrupted)
1891  *      EOVERFLOW (nres too small; set to the value needed for success)
1892  *      EINVAL (flags invalid)
1893  *      EFAULT (bad address for buf or nres)
1894  */
1895 int
1896 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1897 {
1898         vmu_entity_t *entity;
1899         vmusage_t *result;
1900         int ret = 0;
1901         int cacherecent = 0;
1902         hrtime_t now;
1903         uint_t flags_orig;
1904 
1905         /*
1906          * Non-global zones cannot request system-wide or collated
1907          * results, or the system result, so munge the flags accordingly.
1908          */
1909         flags_orig = flags;
1910         if (curproc->p_zone != global_zone) {
1911                 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1912                         flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1913                         flags |= VMUSAGE_PROJECTS;
1914                 }
1915                 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1916                         flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1917                         flags |= VMUSAGE_RUSERS;
1918                 }
1919                 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1920                         flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1921                         flags |= VMUSAGE_EUSERS;
1922                 }
1923                 if (flags & VMUSAGE_SYSTEM) {
1924                         flags &= ~VMUSAGE_SYSTEM;
1925                         flags |= VMUSAGE_ZONE;
1926                 }
1927         }
1928 
1929         /* Check for unknown flags */
1930         if ((flags & (~VMUSAGE_MASK)) != 0)
1931                 return (set_errno(EINVAL));
1932 
1933         /* Check for no flags */
1934         if ((flags & VMUSAGE_MASK) == 0)
1935                 return (set_errno(EINVAL));
1936 
1937         mutex_enter(&vmu_data.vmu_lock);
1938         now = gethrtime();
1939 
1940 start:
1941         if (vmu_data.vmu_cache != NULL) {
1942 
1943                 vmu_cache_t *cache;
1944 
1945                 if ((vmu_data.vmu_cache->vmc_timestamp +
1946                     ((hrtime_t)age * NANOSEC)) > now)
1947                         cacherecent = 1;
1948 
1949                 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1950                     cacherecent == 1) {
1951                         cache = vmu_data.vmu_cache;
1952                         vmu_cache_hold(cache);
1953                         mutex_exit(&vmu_data.vmu_lock);
1954 
1955                         ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1956                             cpflg);
1957                         mutex_enter(&vmu_data.vmu_lock);
1958                         vmu_cache_rele(cache);
1959                         if (vmu_data.vmu_pending_waiters > 0)
1960                                 cv_broadcast(&vmu_data.vmu_cv);
1961                         mutex_exit(&vmu_data.vmu_lock);
1962                         return (ret);
1963                 }
1964                 /*
1965                  * If the cache is recent, it is likely that there are other
1966                  * consumers of vm_getusage running, so add their flags to the
1967                  * desired flags for the calculation.
1968                  */
1969                 if (cacherecent == 1)
1970                         flags = vmu_data.vmu_cache->vmc_flags | flags;
1971         }
1972         if (vmu_data.vmu_calc_thread == NULL) {
1973 
1974                 vmu_cache_t *cache;
1975 
1976                 vmu_data.vmu_calc_thread = curthread;
 
1992                     vmu_cache_alloc(vmu_data.vmu_nentities,
1993                     vmu_data.vmu_calc_flags);
1994 
1995                 result = cache->vmc_results;
1996                 for (entity = vmu_data.vmu_entities; entity != NULL;
1997                     entity = entity->vme_next) {
1998                         *result = entity->vme_result;
1999                         result++;
2000                 }
2001                 cache->vmc_timestamp = gethrtime();
2002                 vmu_cache_hold(cache);
2003 
2004                 vmu_data.vmu_calc_flags = 0;
2005                 vmu_data.vmu_calc_thread = NULL;
2006 
2007                 if (vmu_data.vmu_pending_waiters > 0)
2008                         cv_broadcast(&vmu_data.vmu_cv);
2009 
2010                 mutex_exit(&vmu_data.vmu_lock);
2011 
2012                 /* copy cache */
2013                 ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
2014                 mutex_enter(&vmu_data.vmu_lock);
2015                 vmu_cache_rele(cache);
2016                 mutex_exit(&vmu_data.vmu_lock);
2017 
2018                 return (ret);
2019         }
2020         vmu_data.vmu_pending_flags |= flags;
2021         vmu_data.vmu_pending_waiters++;
2022         while (vmu_data.vmu_calc_thread != NULL) {
2023                 if (cv_wait_sig(&vmu_data.vmu_cv,
2024                     &vmu_data.vmu_lock) == 0) {
2025                         vmu_data.vmu_pending_waiters--;
2026                         mutex_exit(&vmu_data.vmu_lock);
2027                         return (set_errno(EINTR));
2028                 }
2029         }
2030         vmu_data.vmu_pending_waiters--;
2031         goto start;
2032 }
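
/*
 * Illustrative sketch (not part of this file) of the flag munging
 * above, as seen from a non-global zone.  VMUSAGE_ALL_PROJECTS is
 * silently narrowed to VMUSAGE_PROJECTS, and vmu_copyout_results()
 * filters out "other zone" entries, so only projects in the caller's
 * own zone come back.  This assumes the getvmusage(2) libc wrapper.
 */
#include <sys/vm_usage.h>

static int
my_zone_project_usage(vmusage_t *buf, size_t *nres)
{
        /*
         * Results no older than 10 seconds may be served from the
         * kernel's cache instead of triggering a new calculation.
         */
        return (getvmusage(VMUSAGE_ALL_PROJECTS, 10, buf, nres));
}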
 
 
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2016, Joyent, Inc.
  29  */
  30 
  31 /*
  32  * vm_usage
  33  *
  34  * This file implements the getvmusage() private system call.
  35  * getvmusage() counts the number of resident memory pages and the swap
  36  * reserved by the specified process collective. A "process collective" is
  37  * the set of processes owned by a particular zone, project, task, or user.
  38  *
  39  * rss and swap are counted so that for a given process collective, a page is
  40  * only counted once.  For example, this means that if multiple processes in
  41  * the same project map the same page, then the project will only be charged
  42  * once for that page.  On the other hand, if two processes in different
  43  * projects map the same page, then both projects will be charged
  44  * for the page.
  45  *
  46  * The vm_getusage() calculation is implemented so that the first thread
  47  * performs the rss/swap counting. Other callers will wait for that thread to
  48  * finish, copying the results.  This enables multiple rcapds and prstats to
  49  * consume data from the same calculation.  The results are also cached so that
  50  * a caller interested in recent results can just copy them instead of starting
  51  * a new calculation. The caller passes the maximum age (in seconds) of the
 
 
 505  * Allocate a zone entity, and hashes for tracking visited vm objects
 506  * for projects, tasks, and users within that zone.
 507  */
 508 static vmu_zone_t *
 509 vmu_alloc_zone(id_t id)
 510 {
 511         vmu_zone_t *zone;
 512 
 513         if (vmu_data.vmu_free_zones != NULL) {
 514                 zone = vmu_data.vmu_free_zones;
 515                 vmu_data.vmu_free_zones =
 516                     vmu_data.vmu_free_zones->vmz_next;
 517                 zone->vmz_next = NULL;
 518                 zone->vmz_zone = NULL;
 519         } else {
 520                 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
 521         }
 522 
 523         zone->vmz_id = id;
 524 
 525         if ((vmu_data.vmu_calc_flags &
 526             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
 527                 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 528 
 529         if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
 530             VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
 531                 zone->vmz_projects_hash = mod_hash_create_idhash(
 532                     "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 533 
 534         if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
 535             != 0 && zone->vmz_tasks_hash == NULL)
 536                 zone->vmz_tasks_hash = mod_hash_create_idhash(
 537                     "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 538 
 539         if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
 540             != 0 && zone->vmz_rusers_hash == NULL)
 541                 zone->vmz_rusers_hash = mod_hash_create_idhash(
 542                     "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 543 
 544         if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
 545             != 0 && zone->vmz_eusers_hash == NULL)
 546                 zone->vmz_eusers_hash = mod_hash_create_idhash(
 
 906         pgcnt_t index;
 907         short bound_type;
 908         short page_type;
 909         vnode_t *vn;
 910         anoff_t off;
 911         struct anon *ap;
 912 
 913         next = *first;
 914         /* Shared anon slots don't change once set. */
 915         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 916         for (;;) {
 917                 if (incore == B_TRUE)
 918                         next->vmb_type = VMUSAGE_BOUND_INCORE;
 919 
 920                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 921                         if (next == *last)
 922                                 break;
 923                         next = AVL_NEXT(tree, next);
 924                         continue;
 925                 }
 926 
 927                 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
 928                 bound_type = next->vmb_type;
 929                 index = next->vmb_start;
 930                 while (index <= next->vmb_end) {
 931 
 932                         /*
 933                          * These are used to determine how much to increment
 934                          * index when a large page is found.
 935                          */
 936                         page_t *page;
 937                         pgcnt_t pgcnt = 1;
 938                         uint_t pgshft;
 939                         pgcnt_t pgmsk;
 940 
 941                         ap = anon_get_ptr(amp->ahp, index);
 942                         if (ap != NULL)
 943                                 swap_xlate(ap, &vn, &off);
 944 
 945                         if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
 946                             (page = page_exists(vn, off)) != NULL) {
 947                                 if (PP_ISFREE(page))
 948                                         page_type = VMUSAGE_BOUND_NOT_INCORE;
 949                                 else
 950                                         page_type = VMUSAGE_BOUND_INCORE;
 951                                 if (page->p_szc > 0) {
 952                                         pgcnt = page_get_pagecnt(page->p_szc);
 953                                         pgshft = page_get_shift(page->p_szc);
 954                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
 955                                             - 1;
 956                                 }
 957                         } else {
 958                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
 959                         }
 960 
 961                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 962                                 next->vmb_type = page_type;
 963                                 bound_type = page_type;
 964                         } else if (next->vmb_type != page_type) {
 965                                 /*
 966                                  * If current bound type does not match page
 967                                  * type, need to split off new bound.
 968                                  */
 969                                 tmp = vmu_alloc_bound();
 970                                 tmp->vmb_type = page_type;
 971                                 tmp->vmb_start = index;
 972                                 tmp->vmb_end = next->vmb_end;
 973                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
 974                                 next->vmb_end = index - 1;
 975                                 if (*last == next)
 976                                         *last = tmp;
 977                                 next = tmp;
 978                         }
 979                         if (pgcnt > 1) {
 980                                 /*
 981                                  * If inside large page, jump to next large
 982                                  * page
 983                                  */
 
1004     vmu_bound_t **first, vmu_bound_t **last)
1005 {
1006         vmu_bound_t *next;
1007         vmu_bound_t *tmp;
1008         pgcnt_t index;
1009         short bound_type;
1010         short page_type;
1011 
1012         next = *first;
1013         for (;;) {
1014                 if (vnode->v_pages == NULL)
1015                         next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1016 
1017                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1018                         if (next == *last)
1019                                 break;
1020                         next = AVL_NEXT(tree, next);
1021                         continue;
1022                 }
1023 
1024                 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1025                 bound_type = next->vmb_type;
1026                 index = next->vmb_start;
1027                 while (index <= next->vmb_end) {
1028 
1029                         /*
1030                          * These are used to determine how much to increment
1031                          * index when a large page is found.
1032                          */
1033                         page_t *page;
1034                         pgcnt_t pgcnt = 1;
1035                         uint_t pgshft;
1036                         pgcnt_t pgmsk;
1037 
1038                         if (vnode->v_pages != NULL &&
1039                             (page = page_exists(vnode, ptob(index))) != NULL) {
1040                                 if (PP_ISFREE(page))
1041                                         page_type = VMUSAGE_BOUND_NOT_INCORE;
1042                                 else
1043                                         page_type = VMUSAGE_BOUND_INCORE;
1044                                 if (page->p_szc > 0) {
1045                                         pgcnt = page_get_pagecnt(page->p_szc);
1046                                         pgshft = page_get_shift(page->p_szc);
1047                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
1048                                             - 1;
1049                                 }
1050                         } else {
1051                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
1052                         }
1053 
1054                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1055                                 next->vmb_type = page_type;
1056                                 bound_type = page_type;
1057                         } else if (next->vmb_type != page_type) {
1058                                 /*
1059                                  * If current bound type does not match page
1060                                  * type, need to split off new bound.
1061                                  */
1062                                 tmp = vmu_alloc_bound();
1063                                 tmp->vmb_type = page_type;
1064                                 tmp->vmb_start = index;
1065                                 tmp->vmb_end = next->vmb_end;
1066                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
1067                                 next->vmb_end = index - 1;
1068                                 if (*last == next)
1069                                         *last = tmp;
1070                                 next = tmp;
1071                         }
1072                         if (pgcnt > 1) {
1073                                 /*
1074                                  * If inside large page, jump to next large
1075                                  * page
1076                                  */
 
1305                                         pgend = p_end;
1306 
1307                                 /*
1308                                  * Compute number of pages from large page
1309                                  * which are mapped.
1310                                  */
1311                                 pgcnt = pgend - p_index + 1;
1312 
1313                                 /*
1314                          * Point indices at page after large page,
1315                                  * or at page after end of mapping.
1316                                  */
1317                                 p_index += pgcnt;
1318                                 s_index += pgcnt;
1319                         } else {
1320                                 p_index++;
1321                                 s_index++;
1322                         }
1323 
1324                         /*
1325                          * Pages on the free list aren't counted for the rss.
1326                          */
1327                         if (PP_ISFREE(page))
1328                                 continue;
1329 
1330                         /*
1331                          * Assume anon structs with a refcnt
1332                          * of 1 are not COW shared, so there
1333                          * is no reason to track them per entity.
1334                          */
1335                         if (cnt == 1) {
1336                                 panon += pgcnt;
1337                                 continue;
1338                         }
1339                         for (entity = vmu_entities; entity != NULL;
1340                             entity = entity->vme_next_calc) {
1341 
1342                                 result = &entity->vme_result;
1343                                 /*
1344                                  * Track COW anons per entity so
1345                                  * they are not double counted.
1346                                  */
1347                                 if (vmu_find_insert_anon(entity->vme_anon_hash,
1348                                     (caddr_t)ap) == 0)
1349                                         continue;
1350 
 
1468  * which are relative to the process.  Then calculate each segment
1469  * in the process's address space for each relevant entity.
1470  */
1471 static void
1472 vmu_calculate_proc(proc_t *p)
1473 {
1474         vmu_entity_t *entities = NULL;
1475         vmu_zone_t *zone;
1476         vmu_entity_t *tmp;
1477         struct as *as;
1478         struct seg *seg;
1479         int ret;
1480 
1481         /* Figure out which entities are being computed */
1482         if ((vmu_data.vmu_system) != NULL) {
1483                 tmp = vmu_data.vmu_system;
1484                 tmp->vme_next_calc = entities;
1485                 entities = tmp;
1486         }
1487         if (vmu_data.vmu_calc_flags &
1488             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1489             VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1490             VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1491             VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1492             VMUSAGE_ALL_EUSERS)) {
1493                 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1494                     (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1495                     (mod_hash_val_t *)&zone);
1496                 if (ret != 0) {
1497                         zone = vmu_alloc_zone(p->p_zone->zone_id);
1498                         ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1499                             (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1500                             (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1501                         ASSERT(ret == 0);
1502                 }
1503                 if (zone->vmz_zone != NULL) {
1504                         tmp = zone->vmz_zone;
1505                         tmp->vme_next_calc = entities;
1506                         entities = tmp;
1507                 }
1508                 if (vmu_data.vmu_calc_flags &
1509                     (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1510                         tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
 
1747         cache->vmc_refcnt++;
1748 }
1749 
1750 /*
1751  * free cache data
1752  */
1753 static void
1754 vmu_cache_rele(vmu_cache_t *cache)
1755 {
1756         ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1757         ASSERT(cache->vmc_refcnt > 0);
1758         cache->vmc_refcnt--;
1759         if (cache->vmc_refcnt == 0) {
1760                 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1761                     cache->vmc_nresults);
1762                 kmem_free(cache, sizeof (vmu_cache_t));
1763         }
1764 }
1765 
1766 /*
1767  * When new data is calculated, update the phys_mem rctl usage value in the
1768  * zones.
1769  */
1770 static void
1771 vmu_update_zone_rctls(vmu_cache_t *cache)
1772 {
1773         vmusage_t       *rp;
1774         size_t          i = 0;
1775         zone_t          *zp;
1776 
1777         for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1778                 if (rp->vmu_type == VMUSAGE_ZONE &&
1779                     rp->vmu_zoneid != ALL_ZONES) {
1780                         if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1781                                 zp->zone_phys_mem = rp->vmu_rss_all;
1782                                 zone_rele(zp);
1783                         }
1784                 }
1785         }
1786 }
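
/*
 * Illustrative userland analogue (not part of this file) of the scan
 * above: walk a vmusage_t result array and report per-zone RSS,
 * skipping collated (ALL_ZONES) entries just as vmu_update_zone_rctls()
 * does.  Field names are those used by this file; ALL_ZONES is assumed
 * to come from <sys/zone.h>.
 */
#include <sys/vm_usage.h>
#include <sys/zone.h>   /* ALL_ZONES */
#include <stdio.h>

static void
print_zone_rss(const vmusage_t *results, size_t nresults)
{
        size_t i;

        for (i = 0; i < nresults; i++) {
                const vmusage_t *rp = &results[i];

                if (rp->vmu_type != VMUSAGE_ZONE ||
                    rp->vmu_zoneid == ALL_ZONES)
                        continue;
                (void) printf("zone %d: rss %llu\n", (int)rp->vmu_zoneid,
                    (unsigned long long)rp->vmu_rss_all);
        }
}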
1787 
1788 /*
1789  * Copy out the cached results to a caller.  Inspect the callers flags
1790  * and zone to determine which cached results should be copied.
1791  */
1792 static int
1793 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1794     uint_t flags, id_t req_zone_id, int cpflg)
1795 {
1796         vmusage_t *result, *out_result;
1797         vmusage_t dummy;
1798         size_t i, count = 0;
1799         size_t bufsize;
1800         int ret = 0;
1801         uint_t types = 0;
1802 
1803         if (nres != NULL) {
1804                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1805                         return (set_errno(EFAULT));
1806         } else {
1807                 bufsize = 0;
1808         }
1809 
1810         /* figure out what results the caller is interested in. */
1811         if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1812                 types |= VMUSAGE_SYSTEM;
1813         if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1814                 types |= VMUSAGE_ZONE;
1815         if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1816             VMUSAGE_COL_PROJECTS))
1817                 types |= VMUSAGE_PROJECTS;
1818         if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1819                 types |= VMUSAGE_TASKS;
1820         if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1821                 types |= VMUSAGE_RUSERS;
1822         if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1823                 types |= VMUSAGE_EUSERS;
1824 
1825         /* count results for current zone */
1826         out_result = buf;
1827         for (result = cache->vmc_results, i = 0;
1828             i < cache->vmc_nresults; result++, i++) {
1829 
1830                 /* Do not return "other-zone" results to non-global zones */
1831                 if (curproc->p_zone != global_zone &&
1832                     curproc->p_zone->zone_id != result->vmu_zoneid)
1833                         continue;
 
1856                         }
1857                 }
1858 
1859                 /* Skip results that do not match requested type */
1860                 if ((result->vmu_type & types) == 0)
1861                         continue;
1862 
1863                 /* Skip collated results if not requested */
1864                 if (result->vmu_zoneid == ALL_ZONES) {
1865                         if (result->vmu_type == VMUSAGE_PROJECTS &&
1866                             (flags & VMUSAGE_COL_PROJECTS) == 0)
1867                                 continue;
1868                         if (result->vmu_type == VMUSAGE_EUSERS &&
1869                             (flags & VMUSAGE_COL_EUSERS) == 0)
1870                                 continue;
1871                         if (result->vmu_type == VMUSAGE_RUSERS &&
1872                             (flags & VMUSAGE_COL_RUSERS) == 0)
1873                                 continue;
1874                 }
1875 
1876                 if (result->vmu_type == VMUSAGE_ZONE &&
1877                     flags & VMUSAGE_A_ZONE) {
1878                         /* Skip non-requested zone results */
1879                         if (result->vmu_zoneid != req_zone_id)
1880                                 continue;
1881                 } else {
1882                         /* Skip "other zone" results if not requested */
1883                         if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1884                                 if (result->vmu_type == VMUSAGE_ZONE &&
1885                                     (flags & VMUSAGE_ALL_ZONES) == 0)
1886                                         continue;
1887                                 if (result->vmu_type == VMUSAGE_PROJECTS &&
1888                                     (flags & (VMUSAGE_ALL_PROJECTS |
1889                                     VMUSAGE_COL_PROJECTS)) == 0)
1890                                         continue;
1891                                 if (result->vmu_type == VMUSAGE_TASKS &&
1892                                     (flags & VMUSAGE_ALL_TASKS) == 0)
1893                                         continue;
1894                                 if (result->vmu_type == VMUSAGE_RUSERS &&
1895                                     (flags & (VMUSAGE_ALL_RUSERS |
1896                                     VMUSAGE_COL_RUSERS)) == 0)
1897                                         continue;
1898                                 if (result->vmu_type == VMUSAGE_EUSERS &&
1899                                     (flags & (VMUSAGE_ALL_EUSERS |
1900                                     VMUSAGE_COL_EUSERS)) == 0)
1901                                         continue;
1902                         }
1903                 }
1904                 count++;
1905                 if (out_result != NULL) {
1906                         if (bufsize < count) {
1907                                 ret = set_errno(EOVERFLOW);
1908                         } else {
1909                                 if (ddi_copyout(result, out_result,
1910                                     sizeof (vmusage_t), cpflg))
1911                                         return (set_errno(EFAULT));
1912                                 out_result++;
1913                         }
1914                 }
1915         }
1916         if (nres != NULL)
1917                 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1918                         return (set_errno(EFAULT));
1919 
1920         return (ret);
1921 }
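
/*
 * Illustrative sketch (not part of this file): consuming collated
 * results.  Per the logic above, collated entries carry
 * vmu_zoneid == ALL_ZONES and are only copied out when the matching
 * VMUSAGE_COL_* flag was requested.  This assumes the getvmusage(2)
 * libc wrapper; ALL_ZONES is assumed to come from <sys/zone.h>.
 */
#include <sys/types.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>   /* ALL_ZONES */

static uint64_t
ruser_rss_all_zones(id_t uid, vmusage_t *buf, size_t *nres)
{
        size_t i;

        /* real-user totals collated across all zones */
        if (getvmusage(VMUSAGE_COL_RUSERS, 10, buf, nres) != 0)
                return (0);
        for (i = 0; i < *nres; i++) {
                if (buf[i].vmu_type == VMUSAGE_RUSERS &&
                    buf[i].vmu_zoneid == ALL_ZONES &&
                    buf[i].vmu_id == uid)
                        return (buf[i].vmu_rss_all);
        }
        return (0);
}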
1922 
1923 /*
 
1938  *      nres:   Set to number of vmusage_t structures pointed to by buf
1939  *              before calling vm_getusage().
1940  *              On a return of 0 (success) or EOVERFLOW, it is set to the
1941  *              number of result structures returned or attempted to return.
1942  *
1943  * returns 0 on success, -1 on failure:
1944  *      EINTR (interrupted)
1945  *      EOVERFLOW (nres too small; set to the value needed for success)
1946  *      EINVAL (flags invalid)
1947  *      EFAULT (bad address for buf or nres)
1948  */
1949 int
1950 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1951 {
1952         vmu_entity_t *entity;
1953         vmusage_t *result;
1954         int ret = 0;
1955         int cacherecent = 0;
1956         hrtime_t now;
1957         uint_t flags_orig;
1958         id_t req_zone_id;
1959 
1960         /*
1961          * Non-global zones cannot request system-wide or collated
1962          * results, the system result, or the usage of another zone, so
1963          * munge the flags accordingly.
1964          */
1965         flags_orig = flags;
1966         if (curproc->p_zone != global_zone) {
1967                 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1968                         flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1969                         flags |= VMUSAGE_PROJECTS;
1970                 }
1971                 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1972                         flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1973                         flags |= VMUSAGE_RUSERS;
1974                 }
1975                 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1976                         flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1977                         flags |= VMUSAGE_EUSERS;
1978                 }
1979                 if (flags & VMUSAGE_SYSTEM) {
1980                         flags &= ~VMUSAGE_SYSTEM;
1981                         flags |= VMUSAGE_ZONE;
1982                 }
1983                 if (flags & VMUSAGE_A_ZONE) {
1984                         flags &= ~VMUSAGE_A_ZONE;
1985                         flags |= VMUSAGE_ZONE;
1986                 }
1987         }
1988 
1989         /* Check for unknown flags */
1990         if ((flags & (~VMUSAGE_MASK)) != 0)
1991                 return (set_errno(EINVAL));
1992 
1993         /* Check for no flags */
1994         if ((flags & VMUSAGE_MASK) == 0)
1995                 return (set_errno(EINVAL));
1996 
1997         /* If requesting results for a specific zone, get the zone ID */
1998         if (flags & VMUSAGE_A_ZONE) {
1999                 size_t bufsize;
2000                 vmusage_t zreq;
2001 
2002                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2003                         return (set_errno(EFAULT));
2004                 /* Requested zone ID is passed in buf, so 0 len not allowed */
2005                 if (bufsize == 0)
2006                         return (set_errno(EINVAL));
2007                 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2008                         return (set_errno(EFAULT));
2009                 req_zone_id = zreq.vmu_id;
2010         }
2011 
2012         mutex_enter(&vmu_data.vmu_lock);
2013         now = gethrtime();
2014 
2015 start:
2016         if (vmu_data.vmu_cache != NULL) {
2017 
2018                 vmu_cache_t *cache;
2019 
2020                 if ((vmu_data.vmu_cache->vmc_timestamp +
2021                     ((hrtime_t)age * NANOSEC)) > now)
2022                         cacherecent = 1;
2023 
2024                 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2025                     cacherecent == 1) {
2026                         cache = vmu_data.vmu_cache;
2027                         vmu_cache_hold(cache);
2028                         mutex_exit(&vmu_data.vmu_lock);
2029 
2030                         ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2031                             req_zone_id, cpflg);
2032                         mutex_enter(&vmu_data.vmu_lock);
2033                         vmu_cache_rele(cache);
2034                         if (vmu_data.vmu_pending_waiters > 0)
2035                                 cv_broadcast(&vmu_data.vmu_cv);
2036                         mutex_exit(&vmu_data.vmu_lock);
2037                         return (ret);
2038                 }
2039                 /*
2040                  * If the cache is recent, it is likely that there are other
2041                  * consumers of vm_getusage running, so add their flags to the
2042                  * desired flags for the calculation.
2043                  */
2044                 if (cacherecent == 1)
2045                         flags = vmu_data.vmu_cache->vmc_flags | flags;
2046         }
2047         if (vmu_data.vmu_calc_thread == NULL) {
2048 
2049                 vmu_cache_t *cache;
2050 
2051                 vmu_data.vmu_calc_thread = curthread;
 
2067                     vmu_cache_alloc(vmu_data.vmu_nentities,
2068                     vmu_data.vmu_calc_flags);
2069 
2070                 result = cache->vmc_results;
2071                 for (entity = vmu_data.vmu_entities; entity != NULL;
2072                     entity = entity->vme_next) {
2073                         *result = entity->vme_result;
2074                         result++;
2075                 }
2076                 cache->vmc_timestamp = gethrtime();
2077                 vmu_cache_hold(cache);
2078 
2079                 vmu_data.vmu_calc_flags = 0;
2080                 vmu_data.vmu_calc_thread = NULL;
2081 
2082                 if (vmu_data.vmu_pending_waiters > 0)
2083                         cv_broadcast(&vmu_data.vmu_cv);
2084 
2085                 mutex_exit(&vmu_data.vmu_lock);
2086 
2087                 /* update zone's phys. mem. rctl usage */
2088                 vmu_update_zone_rctls(cache);
2089                 /* copy cache */
2090                 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2091                     req_zone_id, cpflg);
2092                 mutex_enter(&vmu_data.vmu_lock);
2093                 vmu_cache_rele(cache);
2094                 mutex_exit(&vmu_data.vmu_lock);
2095 
2096                 return (ret);
2097         }
2098         vmu_data.vmu_pending_flags |= flags;
2099         vmu_data.vmu_pending_waiters++;
2100         while (vmu_data.vmu_calc_thread != NULL) {
2101                 if (cv_wait_sig(&vmu_data.vmu_cv,
2102                     &vmu_data.vmu_lock) == 0) {
2103                         vmu_data.vmu_pending_waiters--;
2104                         mutex_exit(&vmu_data.vmu_lock);
2105                         return (set_errno(EINTR));
2106                 }
2107         }
2108         vmu_data.vmu_pending_waiters--;
2109         goto start;
2110 }
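
/*
 * Illustrative sketch (not part of this file): requesting one specific
 * zone's usage with VMUSAGE_A_ZONE (non-global zones have it munged to
 * VMUSAGE_ZONE above).  As implemented above, the requested zone ID is
 * passed in through buf[0].vmu_id, so buf must be valid and *nres must
 * be non-zero on entry.  This assumes the getvmusage(2) libc wrapper.
 */
#include <sys/vm_usage.h>
#include <string.h>

static int
one_zone_usage(id_t zoneid, vmusage_t *buf, size_t *nres)
{
        (void) memset(&buf[0], 0, sizeof (vmusage_t));
        buf[0].vmu_id = zoneid;         /* zone whose usage is wanted */
        return (getvmusage(VMUSAGE_A_ZONE, 10, buf, nres));
}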
2111 
2112 #if defined(__x86)
2113 /*
2114  * Attempt to invalidate all of the pages in the mapping for the given process.
2115  */
2116 static void
2117 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2118 {
2119         page_t          *pp;
2120         size_t          psize;
2121         u_offset_t      off;
2122         caddr_t         eaddr;
2123         struct vnode    *vp;
2124         struct segvn_data *svd;
2125         struct hat      *victim_hat;
2126 
2127         ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2128 
2129         victim_hat = p->p_as->a_hat;
2130         svd = (struct segvn_data *)seg->s_data;
2131         vp = svd->vp;
2132         psize = page_get_pagesize(seg->s_szc);
2133 
2134         off = svd->offset + (uintptr_t)(addr - seg->s_base);
2135 
2136         for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2137                 pp = page_lookup_nowait(vp, off, SE_SHARED);
2138 
2139                 if (pp != NULL) {
2140                         /* following logic based on pvn_getdirty() */
2141 
2142                         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2143                                 page_unlock(pp);
2144                                 continue;
2145                         }
2146 
2147                         page_io_lock(pp);
2148                         hat_page_inval(pp, 0, victim_hat);
2149                         page_io_unlock(pp);
2150 
2151                         /*
2152                          * For B_INVALCURONLY-style handling we let
2153                          * page_release call VN_DISPOSE if no one else is using
2154                          * the page.
2155                          *
2156                          * A hat_ismod() check would be useless because:
2157                          * (1) we are not holding the SE_EXCL lock
2158                          * (2) we've not unloaded _all_ translations
2159                          *
2160                          * Let page_release() do the heavy lifting.
2161                          */
2162                         (void) page_release(pp, 1);
2163                 }
2164         }
2165 }
2166 
2167 /*
2168  * vm_map_inval()
2169  *
2170  * Invalidate as many pages as possible within the given mapping for the given
2171  * process. addr is expected to be the base address of the mapping and size is
2172  * the length of the mapping. In some cases a mapping will encompass an
2173  * entire segment, but at least for anon or stack mappings, these will be
2174  * regions within a single large segment. Thus, the invalidation is oriented
2175  * around a single mapping and not an entire segment.
2176  *
2177  * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
2178  * this code is only applicable to x86.
2179  */
2180 int
2181 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2182 {
2183         int ret;
2184         int error = 0;
2185         proc_t *p;              /* target proc */
2186         struct as *as;          /* target proc's address space */
2187         struct seg *seg;        /* working segment */
2188 
2189         if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2190                 return (set_errno(EPERM));
2191 
2192         /* If not a valid mapping address, return an error */
2193         if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2194                 return (set_errno(EINVAL));
2195 
2196 again:
2197         mutex_enter(&pidlock);
2198         p = prfind(pid);
2199         if (p == NULL) {
2200                 mutex_exit(&pidlock);
2201                 return (set_errno(ESRCH));
2202         }
2203 
2204         mutex_enter(&p->p_lock);
2205         mutex_exit(&pidlock);
2206 
2207         if (panicstr != NULL) {
2208                 mutex_exit(&p->p_lock);
2209                 return (0);
2210         }
2211 
2212         as = p->p_as;
2213 
2214         /*
2215          * Try to set P_PR_LOCK - prevents process "changing shape"
2216          * - blocks fork
2217          * - blocks sigkill
2218          * - cannot be a system proc
2219          * - must be fully created proc
2220          */
2221         ret = sprtrylock_proc(p);
2222         if (ret == -1) {
2223                 /* Process in invalid state */
2224                 mutex_exit(&p->p_lock);
2225                 return (set_errno(ESRCH));
2226         }
2227 
2228         if (ret == 1) {
2229                 /*
2230                  * P_PR_LOCK is already set. Wait and try again. This also
2231                  * drops p_lock so p may no longer be valid since the proc may
2232                  * have exited.
2233                  */
2234                 sprwaitlock_proc(p);
2235                 goto again;
2236         }
2237 
2238         /* P_PR_LOCK is now set */
2239         mutex_exit(&p->p_lock);
2240 
2241         AS_LOCK_ENTER(as, RW_READER);
2242         if ((seg = as_segat(as, addr)) == NULL) {
2243                 AS_LOCK_EXIT(as);
2244                 mutex_enter(&p->p_lock);
2245                 sprunlock(p);
2246                 return (set_errno(ENOMEM));
2247         }
2248 
2249         /*
2250          * The invalidation behavior only makes sense for vnode-backed segments.
2251          */
2252         if (seg->s_ops != &segvn_ops) {
2253                 AS_LOCK_EXIT(as);
2254                 mutex_enter(&p->p_lock);
2255                 sprunlock(p);
2256                 return (0);
2257         }
2258 
2259         /*
2260          * If the mapping is out of bounds of the segment, return an error.
2261          */
2262         if ((addr + size) > (seg->s_base + seg->s_size)) {
2263                 AS_LOCK_EXIT(as);
2264                 mutex_enter(&p->p_lock);
2265                 sprunlock(p);
2266                 return (set_errno(EINVAL));
2267         }
2268 
2269         /*
2270          * Don't use MS_INVALCURPROC flag here since that would eventually
2271          * initiate hat invalidation based on curthread. Since we're doing this
2272          * on behalf of a different process, that would erroneously invalidate
2273          * our own process mappings.
2274          */
2275         error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2276         if (error == 0) {
2277                 /*
2278                  * Since we didn't invalidate during the sync above, we now
2279                  * try to invalidate all of the pages in the mapping.
2280                  */
2281                 map_inval(p, seg, addr, size);
2282         }
2283         AS_LOCK_EXIT(as);
2284 
2285         mutex_enter(&p->p_lock);
2286         sprunlock(p);
2287 
2288         if (error)
2289                 (void) set_errno(error);
2290         return (error);
2291 }
2292 #endif