8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * vm_usage
29 *
30 * This file implements the getvmusage() private system call.
31 * getvmusage() counts the amount of resident memory pages and swap
32 * reserved by the specified process collective. A "process collective" is
33 * the set of processes owned by a particular zone, project, task, or user.
34 *
35 * rss and swap are counted so that for a given process collective, a page is
36 * only counted once. For example, this means that if multiple processes in
37 * the same project map the same page, then the project will only be charged
38 * once for that page. On the other hand, if two processes in different
39 * projects map the same page, then both projects will be charged
40 * for the page.
41 *
42 * The vm_getusage() calculation is implemented so that the first thread
43 * performs the rss/swap counting. Other callers wait for that thread to
44 * finish and then copy its results. This enables multiple rcapds and prstats to
45 * consume data from the same calculation. The results are also cached so that
46 * a caller interested in recent results can just copy them instead of starting
47 * a new calculation. The caller passes the maximum age (in seconds) of the
501 * Allocate a zone entity, and hashes for tracking visited vm objects
502 * for projects, tasks, and users within that zone.
503 */
504 static vmu_zone_t *
505 vmu_alloc_zone(id_t id)
506 {
507 vmu_zone_t *zone;
508
509 if (vmu_data.vmu_free_zones != NULL) {
510 zone = vmu_data.vmu_free_zones;
511 vmu_data.vmu_free_zones =
512 vmu_data.vmu_free_zones->vmz_next;
513 zone->vmz_next = NULL;
514 zone->vmz_zone = NULL;
515 } else {
516 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
517 }
518
519 zone->vmz_id = id;
520
521 if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
522 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
523
524 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
525 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
526 zone->vmz_projects_hash = mod_hash_create_idhash(
527 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
528
529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
530 != 0 && zone->vmz_tasks_hash == NULL)
531 zone->vmz_tasks_hash = mod_hash_create_idhash(
532 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533
534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
535 != 0 && zone->vmz_rusers_hash == NULL)
536 zone->vmz_rusers_hash = mod_hash_create_idhash(
537 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538
539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
540 != 0 && zone->vmz_eusers_hash == NULL)
541 zone->vmz_eusers_hash = mod_hash_create_idhash(
901 pgcnt_t index;
902 short bound_type;
903 short page_type;
904 vnode_t *vn;
905 anoff_t off;
906 struct anon *ap;
907
908 next = *first;
909 /* Shared anon slots don't change once set. */
910 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
911 for (;;) {
912 if (incore == B_TRUE)
913 next->vmb_type = VMUSAGE_BOUND_INCORE;
914
915 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
916 if (next == *last)
917 break;
918 next = AVL_NEXT(tree, next);
919 continue;
920 }
921 bound_type = next->vmb_type;
922 index = next->vmb_start;
923 while (index <= next->vmb_end) {
924
925 /*
926 * These are used to determine how much to increment
927 * index when a large page is found.
928 */
929 page_t *page;
930 pgcnt_t pgcnt = 1;
931 uint_t pgshft;
932 pgcnt_t pgmsk;
933
934 ap = anon_get_ptr(amp->ahp, index);
935 if (ap != NULL)
936 swap_xlate(ap, &vn, &off);
937
938 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
939 (page = page_exists(vn, off)) != NULL) {
940 page_type = VMUSAGE_BOUND_INCORE;
941 if (page->p_szc > 0) {
942 pgcnt = page_get_pagecnt(page->p_szc);
943 pgshft = page_get_shift(page->p_szc);
944 pgmsk = (0x1 << (pgshft - PAGESHIFT))
945 - 1;
946 }
947 } else {
948 page_type = VMUSAGE_BOUND_NOT_INCORE;
949 }
950 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
951 next->vmb_type = page_type;
952 } else if (next->vmb_type != page_type) {
953 /*
954 * If current bound type does not match page
955 * type, need to split off new bound.
956 */
957 tmp = vmu_alloc_bound();
958 tmp->vmb_type = page_type;
959 tmp->vmb_start = index;
960 tmp->vmb_end = next->vmb_end;
961 avl_insert_here(tree, tmp, next, AVL_AFTER);
962 next->vmb_end = index - 1;
963 if (*last == next)
964 *last = tmp;
965 next = tmp;
966 }
967 if (pgcnt > 1) {
968 /*
969 * If inside large page, jump to next large
970 * page
971 */
992 vmu_bound_t **first, vmu_bound_t **last)
993 {
994 vmu_bound_t *next;
995 vmu_bound_t *tmp;
996 pgcnt_t index;
997 short bound_type;
998 short page_type;
999
1000 next = *first;
1001 for (;;) {
1002 if (vnode->v_pages == NULL)
1003 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004
1005 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006 if (next == *last)
1007 break;
1008 next = AVL_NEXT(tree, next);
1009 continue;
1010 }
1011
1012 bound_type = next->vmb_type;
1013 index = next->vmb_start;
1014 while (index <= next->vmb_end) {
1015
1016 /*
1017 * These are used to determine how much to increment
1018 * index when a large page is found.
1019 */
1020 page_t *page;
1021 pgcnt_t pgcnt = 1;
1022 uint_t pgshft;
1023 pgcnt_t pgmsk;
1024
1025 if (vnode->v_pages != NULL &&
1026 (page = page_exists(vnode, ptob(index))) != NULL) {
1027 page_type = VMUSAGE_BOUND_INCORE;
1028 if (page->p_szc > 0) {
1029 pgcnt = page_get_pagecnt(page->p_szc);
1030 pgshft = page_get_shift(page->p_szc);
1031 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032 - 1;
1033 }
1034 } else {
1035 page_type = VMUSAGE_BOUND_NOT_INCORE;
1036 }
1037 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038 next->vmb_type = page_type;
1039 } else if (next->vmb_type != page_type) {
1040 /*
1041 * If current bound type does not match page
1042 * type, need to split off new bound.
1043 */
1044 tmp = vmu_alloc_bound();
1045 tmp->vmb_type = page_type;
1046 tmp->vmb_start = index;
1047 tmp->vmb_end = next->vmb_end;
1048 avl_insert_here(tree, tmp, next, AVL_AFTER);
1049 next->vmb_end = index - 1;
1050 if (*last == next)
1051 *last = tmp;
1052 next = tmp;
1053 }
1054 if (pgcnt > 1) {
1055 /*
1056 * If inside large page, jump to next large
1057 * page
1058 */
1287 pgend = p_end;
1288
1289 /*
1290 * Compute number of pages from large page
1291 * which are mapped.
1292 */
1293 pgcnt = pgend - p_index + 1;
1294
1295 /*
1296 * Point indices at page after large page,
1297 * or at page after end of mapping.
1298 */
1299 p_index += pgcnt;
1300 s_index += pgcnt;
1301 } else {
1302 p_index++;
1303 s_index++;
1304 }
1305
1306 /*
1307 * Assume anon structs with a refcnt
1308 * of 1 are not COW shared, so there
1309 * is no reason to track them per entity.
1310 */
1311 if (cnt == 1) {
1312 panon += pgcnt;
1313 continue;
1314 }
1315 for (entity = vmu_entities; entity != NULL;
1316 entity = entity->vme_next_calc) {
1317
1318 result = &entity->vme_result;
1319 /*
1320 * Track COW anons per entity so
1321 * they are not double counted.
1322 */
1323 if (vmu_find_insert_anon(entity->vme_anon_hash,
1324 (caddr_t)ap) == 0)
1325 continue;
1326
1444 * which are relative to the process. Then calculate each segment
1445 * in the process's address space for each relevant entity.
1446 */
1447 static void
1448 vmu_calculate_proc(proc_t *p)
1449 {
1450 vmu_entity_t *entities = NULL;
1451 vmu_zone_t *zone;
1452 vmu_entity_t *tmp;
1453 struct as *as;
1454 struct seg *seg;
1455 int ret;
1456
1457 /* Figure out which entities are being computed */
1458 if ((vmu_data.vmu_system) != NULL) {
1459 tmp = vmu_data.vmu_system;
1460 tmp->vme_next_calc = entities;
1461 entities = tmp;
1462 }
1463 if (vmu_data.vmu_calc_flags &
1464 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1465 VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1466 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1467 VMUSAGE_ALL_EUSERS)) {
1468 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1469 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1470 (mod_hash_val_t *)&zone);
1471 if (ret != 0) {
1472 zone = vmu_alloc_zone(p->p_zone->zone_id);
1473 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1474 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1475 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1476 ASSERT(ret == 0);
1477 }
1478 if (zone->vmz_zone != NULL) {
1479 tmp = zone->vmz_zone;
1480 tmp->vme_next_calc = entities;
1481 entities = tmp;
1482 }
1483 if (vmu_data.vmu_calc_flags &
1484 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1485 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1722 cache->vmc_refcnt++;
1723 }
1724
1725 /*
1726 * free cache data
1727 */
1728 static void
1729 vmu_cache_rele(vmu_cache_t *cache)
1730 {
1731 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1732 ASSERT(cache->vmc_refcnt > 0);
1733 cache->vmc_refcnt--;
1734 if (cache->vmc_refcnt == 0) {
1735 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1736 cache->vmc_nresults);
1737 kmem_free(cache, sizeof (vmu_cache_t));
1738 }
1739 }
1740
1741 /*
1742 * Copy out the cached results to a caller. Inspect the caller's flags
1743 * and zone to determine which cached results should be copied.
1744 */
1745 static int
1746 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1747 uint_t flags, int cpflg)
1748 {
1749 vmusage_t *result, *out_result;
1750 vmusage_t dummy;
1751 size_t i, count = 0;
1752 size_t bufsize;
1753 int ret = 0;
1754 uint_t types = 0;
1755
1756 if (nres != NULL) {
1757 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1758 return (set_errno(EFAULT));
1759 } else {
1760 bufsize = 0;
1761 }
1762
1763 /* figure out what results the caller is interested in. */
1764 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1765 types |= VMUSAGE_SYSTEM;
1766 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1767 types |= VMUSAGE_ZONE;
1768 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1769 VMUSAGE_COL_PROJECTS))
1770 types |= VMUSAGE_PROJECTS;
1771 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1772 types |= VMUSAGE_TASKS;
1773 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1774 types |= VMUSAGE_RUSERS;
1775 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1776 types |= VMUSAGE_EUSERS;
1777
1778 /* count results for current zone */
1779 out_result = buf;
1780 for (result = cache->vmc_results, i = 0;
1781 i < cache->vmc_nresults; result++, i++) {
1782
1783 /* Do not return "other-zone" results to non-global zones */
1784 if (curproc->p_zone != global_zone &&
1785 curproc->p_zone->zone_id != result->vmu_zoneid)
1786 continue;
1809 }
1810 }
1811
1812 /* Skip results that do not match requested type */
1813 if ((result->vmu_type & types) == 0)
1814 continue;
1815
1816 /* Skip collated results if not requested */
1817 if (result->vmu_zoneid == ALL_ZONES) {
1818 if (result->vmu_type == VMUSAGE_PROJECTS &&
1819 (flags & VMUSAGE_COL_PROJECTS) == 0)
1820 continue;
1821 if (result->vmu_type == VMUSAGE_EUSERS &&
1822 (flags & VMUSAGE_COL_EUSERS) == 0)
1823 continue;
1824 if (result->vmu_type == VMUSAGE_RUSERS &&
1825 (flags & VMUSAGE_COL_RUSERS) == 0)
1826 continue;
1827 }
1828
1829 /* Skip "other zone" results if not requested */
1830 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1831 if (result->vmu_type == VMUSAGE_ZONE &&
1832 (flags & VMUSAGE_ALL_ZONES) == 0)
1833 continue;
1834 if (result->vmu_type == VMUSAGE_PROJECTS &&
1835 (flags & (VMUSAGE_ALL_PROJECTS |
1836 VMUSAGE_COL_PROJECTS)) == 0)
1837 continue;
1838 if (result->vmu_type == VMUSAGE_TASKS &&
1839 (flags & VMUSAGE_ALL_TASKS) == 0)
1840 continue;
1841 if (result->vmu_type == VMUSAGE_RUSERS &&
1842 (flags & (VMUSAGE_ALL_RUSERS |
1843 VMUSAGE_COL_RUSERS)) == 0)
1844 continue;
1845 if (result->vmu_type == VMUSAGE_EUSERS &&
1846 (flags & (VMUSAGE_ALL_EUSERS |
1847 VMUSAGE_COL_EUSERS)) == 0)
1848 continue;
1849 }
1850 count++;
1851 if (out_result != NULL) {
1852 if (bufsize < count) {
1853 ret = set_errno(EOVERFLOW);
1854 } else {
1855 if (ddi_copyout(result, out_result,
1856 sizeof (vmusage_t), cpflg))
1857 return (set_errno(EFAULT));
1858 out_result++;
1859 }
1860 }
1861 }
1862 if (nres != NULL)
1863 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1864 return (set_errno(EFAULT));
1865
1866 return (ret);
1867 }
1868
1869 /*
1884 * nres: Set to number of vmusage_t structures pointed to by buf
1885 * before calling vm_getusage().
1886 * On return 0 (success) or EOVERFLOW, it is set to the number of result
1887 * structures returned or attempted to be returned.
1888 *
1889 * returns 0 on success, -1 on failure:
1890 * EINTR (interrupted)
1891 * EOVERFLOW (nres too small for results, nres set to needed value for success)
1892 * EINVAL (flags invalid)
1893 * EFAULT (bad address for buf or nres)
1894 */
1895 int
1896 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1897 {
1898 vmu_entity_t *entity;
1899 vmusage_t *result;
1900 int ret = 0;
1901 int cacherecent = 0;
1902 hrtime_t now;
1903 uint_t flags_orig;
1904
1905 /*
1906 * Non-global zones cannot request system wide and/or collated
1907 * results, or the system result, so munge the flags accordingly.
1908 */
1909 flags_orig = flags;
1910 if (curproc->p_zone != global_zone) {
1911 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1912 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1913 flags |= VMUSAGE_PROJECTS;
1914 }
1915 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1916 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1917 flags |= VMUSAGE_RUSERS;
1918 }
1919 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1920 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1921 flags |= VMUSAGE_EUSERS;
1922 }
1923 if (flags & VMUSAGE_SYSTEM) {
1924 flags &= ~VMUSAGE_SYSTEM;
1925 flags |= VMUSAGE_ZONE;
1926 }
1927 }
1928
1929 /* Check for unknown flags */
1930 if ((flags & (~VMUSAGE_MASK)) != 0)
1931 return (set_errno(EINVAL));
1932
1933 /* Check for no flags */
1934 if ((flags & VMUSAGE_MASK) == 0)
1935 return (set_errno(EINVAL));
1936
1937 mutex_enter(&vmu_data.vmu_lock);
1938 now = gethrtime();
1939
1940 start:
1941 if (vmu_data.vmu_cache != NULL) {
1942
1943 vmu_cache_t *cache;
1944
1945 if ((vmu_data.vmu_cache->vmc_timestamp +
1946 ((hrtime_t)age * NANOSEC)) > now)
1947 cacherecent = 1;
1948
1949 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1950 cacherecent == 1) {
1951 cache = vmu_data.vmu_cache;
1952 vmu_cache_hold(cache);
1953 mutex_exit(&vmu_data.vmu_lock);
1954
1955 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1956 cpflg);
1957 mutex_enter(&vmu_data.vmu_lock);
1958 vmu_cache_rele(cache);
1959 if (vmu_data.vmu_pending_waiters > 0)
1960 cv_broadcast(&vmu_data.vmu_cv);
1961 mutex_exit(&vmu_data.vmu_lock);
1962 return (ret);
1963 }
1964 /*
1965 * If the cache is recent, it is likely that there are other
1966 * consumers of vm_getusage running, so add their flags to the
1967 * desired flags for the calculation.
1968 */
1969 if (cacherecent == 1)
1970 flags = vmu_data.vmu_cache->vmc_flags | flags;
1971 }
1972 if (vmu_data.vmu_calc_thread == NULL) {
1973
1974 vmu_cache_t *cache;
1975
1976 vmu_data.vmu_calc_thread = curthread;
1992 vmu_cache_alloc(vmu_data.vmu_nentities,
1993 vmu_data.vmu_calc_flags);
1994
1995 result = cache->vmc_results;
1996 for (entity = vmu_data.vmu_entities; entity != NULL;
1997 entity = entity->vme_next) {
1998 *result = entity->vme_result;
1999 result++;
2000 }
2001 cache->vmc_timestamp = gethrtime();
2002 vmu_cache_hold(cache);
2003
2004 vmu_data.vmu_calc_flags = 0;
2005 vmu_data.vmu_calc_thread = NULL;
2006
2007 if (vmu_data.vmu_pending_waiters > 0)
2008 cv_broadcast(&vmu_data.vmu_cv);
2009
2010 mutex_exit(&vmu_data.vmu_lock);
2011
2012 /* copy cache */
2013 ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
2014 mutex_enter(&vmu_data.vmu_lock);
2015 vmu_cache_rele(cache);
2016 mutex_exit(&vmu_data.vmu_lock);
2017
2018 return (ret);
2019 }
2020 vmu_data.vmu_pending_flags |= flags;
2021 vmu_data.vmu_pending_waiters++;
2022 while (vmu_data.vmu_calc_thread != NULL) {
2023 if (cv_wait_sig(&vmu_data.vmu_cv,
2024 &vmu_data.vmu_lock) == 0) {
2025 vmu_data.vmu_pending_waiters--;
2026 mutex_exit(&vmu_data.vmu_lock);
2027 return (set_errno(EINTR));
2028 }
2029 }
2030 vmu_data.vmu_pending_waiters--;
2031 goto start;
2032 }
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright 2016, Joyent, Inc.
29 */
30
31 /*
32 * vm_usage
33 *
34 * This file implements the getvmusage() private system call.
35 * getvmusage() counts the amount of resident memory pages and swap
36 * reserved by the specified process collective. A "process collective" is
37 * the set of processes owned by a particular zone, project, task, or user.
38 *
39 * rss and swap are counted so that for a given process collective, a page is
40 * only counted once. For example, this means that if multiple processes in
41 * the same project map the same page, then the project will only be charged
42 * once for that page. On the other hand, if two processes in different
43 * projects map the same page, then both projects will be charged
44 * for the page.
45 *
46 * The vm_getusage() calculation is implemented so that the first thread
47 * performs the rss/swap counting. Other callers wait for that thread to
48 * finish and then copy its results. This enables multiple rcapds and prstats to
49 * consume data from the same calculation. The results are also cached so that
50 * a caller interested in recent results can just copy them instead of starting
51 * a new calculation. The caller passes the maximum age (in seconds) of the
505 * Allocate a zone entity, and hashes for tracking visited vm objects
506 * for projects, tasks, and users within that zone.
507 */
508 static vmu_zone_t *
509 vmu_alloc_zone(id_t id)
510 {
511 vmu_zone_t *zone;
512
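/*
 * Reuse a previously freed vmu_zone_t from the free list when one is
 * available; otherwise allocate a fresh, zeroed structure.
 */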
513 if (vmu_data.vmu_free_zones != NULL) {
514 zone = vmu_data.vmu_free_zones;
515 vmu_data.vmu_free_zones =
516 vmu_data.vmu_free_zones->vmz_next;
517 zone->vmz_next = NULL;
518 zone->vmz_zone = NULL;
519 } else {
520 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
521 }
522
523 zone->vmz_id = id;
524
525 if ((vmu_data.vmu_calc_flags &
526 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
527 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
528
529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
530 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
531 zone->vmz_projects_hash = mod_hash_create_idhash(
532 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533
534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
535 != 0 && zone->vmz_tasks_hash == NULL)
536 zone->vmz_tasks_hash = mod_hash_create_idhash(
537 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538
539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
540 != 0 && zone->vmz_rusers_hash == NULL)
541 zone->vmz_rusers_hash = mod_hash_create_idhash(
542 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
543
544 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
545 != 0 && zone->vmz_eusers_hash == NULL)
546 zone->vmz_eusers_hash = mod_hash_create_idhash(
906 pgcnt_t index;
907 short bound_type;
908 short page_type;
909 vnode_t *vn;
910 anoff_t off;
911 struct anon *ap;
912
913 next = *first;
914 /* Shared anon slots don't change once set. */
915 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
916 for (;;) {
917 if (incore == B_TRUE)
918 next->vmb_type = VMUSAGE_BOUND_INCORE;
919
920 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
921 if (next == *last)
922 break;
923 next = AVL_NEXT(tree, next);
924 continue;
925 }
926
927 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
928 bound_type = next->vmb_type;
929 index = next->vmb_start;
930 while (index <= next->vmb_end) {
931
932 /*
933 * These are used to determine how much to increment
934 * index when a large page is found.
935 */
936 page_t *page;
937 pgcnt_t pgcnt = 1;
938 uint_t pgshft;
939 pgcnt_t pgmsk;
940
941 ap = anon_get_ptr(amp->ahp, index);
942 if (ap != NULL)
943 swap_xlate(ap, &vn, &off);
944
945 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
946 (page = page_exists(vn, off)) != NULL) {
947 if (PP_ISFREE(page))
948 page_type = VMUSAGE_BOUND_NOT_INCORE;
949 else
950 page_type = VMUSAGE_BOUND_INCORE;
951 if (page->p_szc > 0) {
952 pgcnt = page_get_pagecnt(page->p_szc);
953 pgshft = page_get_shift(page->p_szc);
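/*
 * For example, on a platform with 4K base pages, a 2M large page gives
 * pgshft - PAGESHIFT == 9, so pgmsk == 0x1ff and pgcnt == 512.
 */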
954 pgmsk = (0x1 << (pgshft - PAGESHIFT))
955 - 1;
956 }
957 } else {
958 page_type = VMUSAGE_BOUND_NOT_INCORE;
959 }
960
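/*
 * For example, if a bound of unknown type spans pages [10, 20] and
 * pages 10-14 turn out to be resident while page 15 is not, the bound
 * is trimmed to [10, 14] (incore), a new bound [15, 20] is split off
 * below, and the scan continues in the new bound.
 */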
961 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
962 next->vmb_type = page_type;
963 bound_type = page_type;
964 } else if (next->vmb_type != page_type) {
965 /*
966 * If current bound type does not match page
967 * type, need to split off new bound.
968 */
969 tmp = vmu_alloc_bound();
970 tmp->vmb_type = page_type;
971 tmp->vmb_start = index;
972 tmp->vmb_end = next->vmb_end;
973 avl_insert_here(tree, tmp, next, AVL_AFTER);
974 next->vmb_end = index - 1;
975 if (*last == next)
976 *last = tmp;
977 next = tmp;
978 }
979 if (pgcnt > 1) {
980 /*
981 * If inside large page, jump to next large
982 * page
983 */
1004 vmu_bound_t **first, vmu_bound_t **last)
1005 {
1006 vmu_bound_t *next;
1007 vmu_bound_t *tmp;
1008 pgcnt_t index;
1009 short bound_type;
1010 short page_type;
1011
1012 next = *first;
1013 for (;;) {
1014 if (vnode->v_pages == NULL)
1015 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1016
1017 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1018 if (next == *last)
1019 break;
1020 next = AVL_NEXT(tree, next);
1021 continue;
1022 }
1023
1024 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1025 bound_type = next->vmb_type;
1026 index = next->vmb_start;
1027 while (index <= next->vmb_end) {
1028
1029 /*
1030 * These are used to determine how much to increment
1031 * index when a large page is found.
1032 */
1033 page_t *page;
1034 pgcnt_t pgcnt = 1;
1035 uint_t pgshft;
1036 pgcnt_t pgmsk;
1037
1038 if (vnode->v_pages != NULL &&
1039 (page = page_exists(vnode, ptob(index))) != NULL) {
1040 if (PP_ISFREE(page))
1041 page_type = VMUSAGE_BOUND_NOT_INCORE;
1042 else
1043 page_type = VMUSAGE_BOUND_INCORE;
1044 if (page->p_szc > 0) {
1045 pgcnt = page_get_pagecnt(page->p_szc);
1046 pgshft = page_get_shift(page->p_szc);
1047 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1048 - 1;
1049 }
1050 } else {
1051 page_type = VMUSAGE_BOUND_NOT_INCORE;
1052 }
1053
1054 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1055 next->vmb_type = page_type;
1056 bound_type = page_type;
1057 } else if (next->vmb_type != page_type) {
1058 /*
1059 * If current bound type does not match page
1060 * type, need to split off new bound.
1061 */
1062 tmp = vmu_alloc_bound();
1063 tmp->vmb_type = page_type;
1064 tmp->vmb_start = index;
1065 tmp->vmb_end = next->vmb_end;
1066 avl_insert_here(tree, tmp, next, AVL_AFTER);
1067 next->vmb_end = index - 1;
1068 if (*last == next)
1069 *last = tmp;
1070 next = tmp;
1071 }
1072 if (pgcnt > 1) {
1073 /*
1074 * If inside large page, jump to next large
1075 * page
1076 */
1305 pgend = p_end;
1306
1307 /*
1308 * Compute number of pages from large page
1309 * which are mapped.
1310 */
1311 pgcnt = pgend - p_index + 1;
1312
1313 /*
1314 * Point indices at page after large page,
1315 * or at page after end of mapping.
1316 */
1317 p_index += pgcnt;
1318 s_index += pgcnt;
1319 } else {
1320 p_index++;
1321 s_index++;
1322 }
1323
1324 /*
1325 * Pages on the free list aren't counted for the rss.
1326 */
1327 if (PP_ISFREE(page))
1328 continue;
1329
1330 /*
1331 * Assume anon structs with a refcnt
1332 * of 1 are not COW shared, so there
1333 * is no reason to track them per entity.
1334 */
1335 if (cnt == 1) {
1336 panon += pgcnt;
1337 continue;
1338 }
1339 for (entity = vmu_entities; entity != NULL;
1340 entity = entity->vme_next_calc) {
1341
1342 result = &entity->vme_result;
1343 /*
1344 * Track COW anons per entity so
1345 * they are not double counted.
1346 */
1347 if (vmu_find_insert_anon(entity->vme_anon_hash,
1348 (caddr_t)ap) == 0)
1349 continue;
1350
1468 * which are relative to the process. Then calculate each segment
1469 * in the process's address space for each relevant entity.
1470 */
1471 static void
1472 vmu_calculate_proc(proc_t *p)
1473 {
1474 vmu_entity_t *entities = NULL;
1475 vmu_zone_t *zone;
1476 vmu_entity_t *tmp;
1477 struct as *as;
1478 struct seg *seg;
1479 int ret;
1480
1481 /* Figure out which entities are being computed */
1482 if ((vmu_data.vmu_system) != NULL) {
1483 tmp = vmu_data.vmu_system;
1484 tmp->vme_next_calc = entities;
1485 entities = tmp;
1486 }
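/*
 * Entities to be charged for this process are collected on a singly
 * linked list through vme_next_calc; the zone, project, task, and user
 * entities are pushed onto that list below as they are looked up.
 */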
1487 if (vmu_data.vmu_calc_flags &
1488 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1489 VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1490 VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1491 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1492 VMUSAGE_ALL_EUSERS)) {
1493 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1494 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1495 (mod_hash_val_t *)&zone);
1496 if (ret != 0) {
1497 zone = vmu_alloc_zone(p->p_zone->zone_id);
1498 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1499 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1500 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1501 ASSERT(ret == 0);
1502 }
1503 if (zone->vmz_zone != NULL) {
1504 tmp = zone->vmz_zone;
1505 tmp->vme_next_calc = entities;
1506 entities = tmp;
1507 }
1508 if (vmu_data.vmu_calc_flags &
1509 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1510 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1747 cache->vmc_refcnt++;
1748 }
1749
1750 /*
1751 * free cache data
1752 */
1753 static void
1754 vmu_cache_rele(vmu_cache_t *cache)
1755 {
1756 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1757 ASSERT(cache->vmc_refcnt > 0);
1758 cache->vmc_refcnt--;
1759 if (cache->vmc_refcnt == 0) {
1760 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1761 cache->vmc_nresults);
1762 kmem_free(cache, sizeof (vmu_cache_t));
1763 }
1764 }
1765
1766 /*
1767 * When new data is calculated, update the phys_mem rctl usage value in the
1768 * zones.
1769 */
1770 static void
1771 vmu_update_zone_rctls(vmu_cache_t *cache)
1772 {
1773 vmusage_t *rp;
1774 size_t i = 0;
1775 zone_t *zp;
1776
1777 for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1778 if (rp->vmu_type == VMUSAGE_ZONE &&
1779 rp->vmu_zoneid != ALL_ZONES) {
1780 if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1781 zp->zone_phys_mem = rp->vmu_rss_all;
1782 zone_rele(zp);
1783 }
1784 }
1785 }
1786 }
1787
1788 /*
1789 * Copy out the cached results to a caller. Inspect the caller's flags
1790 * and zone to determine which cached results should be copied.
1791 */
1792 static int
1793 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1794 uint_t flags, id_t req_zone_id, int cpflg)
1795 {
1796 vmusage_t *result, *out_result;
1797 vmusage_t dummy;
1798 size_t i, count = 0;
1799 size_t bufsize;
1800 int ret = 0;
1801 uint_t types = 0;
1802
1803 if (nres != NULL) {
1804 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1805 return (set_errno(EFAULT));
1806 } else {
1807 bufsize = 0;
1808 }
1809
1810 /* figure out what results the caller is interested in. */
1811 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1812 types |= VMUSAGE_SYSTEM;
1813 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1814 types |= VMUSAGE_ZONE;
1815 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1816 VMUSAGE_COL_PROJECTS))
1817 types |= VMUSAGE_PROJECTS;
1818 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1819 types |= VMUSAGE_TASKS;
1820 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1821 types |= VMUSAGE_RUSERS;
1822 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1823 types |= VMUSAGE_EUSERS;
1824
1825 /* count results for current zone */
1826 out_result = buf;
1827 for (result = cache->vmc_results, i = 0;
1828 i < cache->vmc_nresults; result++, i++) {
1829
1830 /* Do not return "other-zone" results to non-global zones */
1831 if (curproc->p_zone != global_zone &&
1832 curproc->p_zone->zone_id != result->vmu_zoneid)
1833 continue;
1856 }
1857 }
1858
1859 /* Skip results that do not match requested type */
1860 if ((result->vmu_type & types) == 0)
1861 continue;
1862
1863 /* Skip collated results if not requested */
1864 if (result->vmu_zoneid == ALL_ZONES) {
1865 if (result->vmu_type == VMUSAGE_PROJECTS &&
1866 (flags & VMUSAGE_COL_PROJECTS) == 0)
1867 continue;
1868 if (result->vmu_type == VMUSAGE_EUSERS &&
1869 (flags & VMUSAGE_COL_EUSERS) == 0)
1870 continue;
1871 if (result->vmu_type == VMUSAGE_RUSERS &&
1872 (flags & VMUSAGE_COL_RUSERS) == 0)
1873 continue;
1874 }
1875
1876 if (result->vmu_type == VMUSAGE_ZONE &&
1877 flags & VMUSAGE_A_ZONE) {
1878 /* Skip non-requested zone results */
1879 if (result->vmu_zoneid != req_zone_id)
1880 continue;
1881 } else {
1882 /* Skip "other zone" results if not requested */
1883 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1884 if (result->vmu_type == VMUSAGE_ZONE &&
1885 (flags & VMUSAGE_ALL_ZONES) == 0)
1886 continue;
1887 if (result->vmu_type == VMUSAGE_PROJECTS &&
1888 (flags & (VMUSAGE_ALL_PROJECTS |
1889 VMUSAGE_COL_PROJECTS)) == 0)
1890 continue;
1891 if (result->vmu_type == VMUSAGE_TASKS &&
1892 (flags & VMUSAGE_ALL_TASKS) == 0)
1893 continue;
1894 if (result->vmu_type == VMUSAGE_RUSERS &&
1895 (flags & (VMUSAGE_ALL_RUSERS |
1896 VMUSAGE_COL_RUSERS)) == 0)
1897 continue;
1898 if (result->vmu_type == VMUSAGE_EUSERS &&
1899 (flags & (VMUSAGE_ALL_EUSERS |
1900 VMUSAGE_COL_EUSERS)) == 0)
1901 continue;
1902 }
1903 }
1904 count++;
1905 if (out_result != NULL) {
1906 if (bufsize < count) {
1907 ret = set_errno(EOVERFLOW);
1908 } else {
1909 if (ddi_copyout(result, out_result,
1910 sizeof (vmusage_t), cpflg))
1911 return (set_errno(EFAULT));
1912 out_result++;
1913 }
1914 }
1915 }
1916 if (nres != NULL)
1917 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1918 return (set_errno(EFAULT));
1919
1920 return (ret);
1921 }
1922
1923 /*
1938 * nres: Set to number of vmusage_t structures pointed to by buf
1939 * before calling vm_getusage().
1940 * On return 0 (success) or EOVERFLOW, it is set to the number of result
1941 * structures returned or attempted to be returned.
1942 *
1943 * returns 0 on success, -1 on failure:
1944 * EINTR (interrupted)
1945 * EOVERFLOW (nres too small for results, nres set to needed value for success)
1946 * EINVAL (flags invalid)
1947 * EFAULT (bad address for buf or nres)
1948 */
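A minimal userland sketch of the calling convention described above, assuming the getvmusage(2) wrapper and the VMUSAGE_* flags exported by <sys/vm_usage.h>; print_zone_rss() and the 30-second age are illustrative names and values only. The count-only first call relies on the NULL-buffer handling visible in vmu_copyout_results() above.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
print_zone_rss(void)
{
	vmusage_t *buf;
	size_t nres = 0;
	size_t i;

	/*
	 * Count-only call: a NULL result buffer is tolerated by the
	 * copyout code, which simply reports the number of available
	 * results through nres.
	 */
	if (getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres) != 0)
		return (-1);

	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
		return (-1);

	/* Fetch results that are at most 30 seconds old. */
	if (getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) != 0) {
		free(buf);
		return (-1);
	}

	for (i = 0; i < nres; i++)
		(void) printf("zone %d: rss_all %llu, swap_all %llu\n",
		    (int)buf[i].vmu_id,
		    (unsigned long long)buf[i].vmu_rss_all,
		    (unsigned long long)buf[i].vmu_swap_all);

	free(buf);
	return (0);
}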
1949 int
1950 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1951 {
1952 vmu_entity_t *entity;
1953 vmusage_t *result;
1954 int ret = 0;
1955 int cacherecent = 0;
1956 hrtime_t now;
1957 uint_t flags_orig;
1958 id_t req_zone_id;
1959
1960 /*
1961 * Non-global zones cannot request system wide and/or collated
1962 * results, or the system result, or usage of another zone, so munge
1963 * the flags accordingly.
1964 */
1965 flags_orig = flags;
1966 if (curproc->p_zone != global_zone) {
1967 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1968 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1969 flags |= VMUSAGE_PROJECTS;
1970 }
1971 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1972 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1973 flags |= VMUSAGE_RUSERS;
1974 }
1975 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1976 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1977 flags |= VMUSAGE_EUSERS;
1978 }
1979 if (flags & VMUSAGE_SYSTEM) {
1980 flags &= ~VMUSAGE_SYSTEM;
1981 flags |= VMUSAGE_ZONE;
1982 }
1983 if (flags & VMUSAGE_A_ZONE) {
1984 flags &= ~VMUSAGE_A_ZONE;
1985 flags |= VMUSAGE_ZONE;
1986 }
1987 }
1988
1989 /* Check for unknown flags */
1990 if ((flags & (~VMUSAGE_MASK)) != 0)
1991 return (set_errno(EINVAL));
1992
1993 /* Check for no flags */
1994 if ((flags & VMUSAGE_MASK) == 0)
1995 return (set_errno(EINVAL));
1996
1997 /* If requesting results for a specific zone, get the zone ID */
1998 if (flags & VMUSAGE_A_ZONE) {
1999 size_t bufsize;
2000 vmusage_t zreq;
2001
2002 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2003 return (set_errno(EFAULT));
2004 /* Requested zone ID is passed in buf, so 0 len not allowed */
2005 if (bufsize == 0)
2006 return (set_errno(EINVAL));
2007 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2008 return (set_errno(EFAULT));
2009 req_zone_id = zreq.vmu_id;
2010 }
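/*
 * Illustrative sketch (not part of the original code): from the global
 * zone, a caller asking for one specific zone's usage passes the target
 * zone ID in the first vmusage_t of buf and a nonzero *nres, e.g.
 * (target_zoneid is a placeholder):
 *
 *	vmusage_t zreq = { 0 };
 *	size_t n = 1;
 *
 *	zreq.vmu_id = target_zoneid;
 *	(void) getvmusage(VMUSAGE_A_ZONE, 10, &zreq, &n);
 */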
2011
2012 mutex_enter(&vmu_data.vmu_lock);
2013 now = gethrtime();
2014
2015 start:
2016 if (vmu_data.vmu_cache != NULL) {
2017
2018 vmu_cache_t *cache;
2019
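/*
 * For example, a caller passing age == 5 accepts cached results computed
 * within the last five seconds rather than forcing a new calculation.
 */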
2020 if ((vmu_data.vmu_cache->vmc_timestamp +
2021 ((hrtime_t)age * NANOSEC)) > now)
2022 cacherecent = 1;
2023
2024 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2025 cacherecent == 1) {
2026 cache = vmu_data.vmu_cache;
2027 vmu_cache_hold(cache);
2028 mutex_exit(&vmu_data.vmu_lock);
2029
2030 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2031 req_zone_id, cpflg);
2032 mutex_enter(&vmu_data.vmu_lock);
2033 vmu_cache_rele(cache);
2034 if (vmu_data.vmu_pending_waiters > 0)
2035 cv_broadcast(&vmu_data.vmu_cv);
2036 mutex_exit(&vmu_data.vmu_lock);
2037 return (ret);
2038 }
2039 /*
2040 * If the cache is recent, it is likely that there are other
2041 * consumers of vm_getusage running, so add their flags to the
2042 * desired flags for the calculation.
2043 */
2044 if (cacherecent == 1)
2045 flags = vmu_data.vmu_cache->vmc_flags | flags;
2046 }
2047 if (vmu_data.vmu_calc_thread == NULL) {
2048
2049 vmu_cache_t *cache;
2050
2051 vmu_data.vmu_calc_thread = curthread;
2067 vmu_cache_alloc(vmu_data.vmu_nentities,
2068 vmu_data.vmu_calc_flags);
2069
2070 result = cache->vmc_results;
2071 for (entity = vmu_data.vmu_entities; entity != NULL;
2072 entity = entity->vme_next) {
2073 *result = entity->vme_result;
2074 result++;
2075 }
2076 cache->vmc_timestamp = gethrtime();
2077 vmu_cache_hold(cache);
2078
2079 vmu_data.vmu_calc_flags = 0;
2080 vmu_data.vmu_calc_thread = NULL;
2081
2082 if (vmu_data.vmu_pending_waiters > 0)
2083 cv_broadcast(&vmu_data.vmu_cv);
2084
2085 mutex_exit(&vmu_data.vmu_lock);
2086
2087 /* update zone's phys. mem. rctl usage */
2088 vmu_update_zone_rctls(cache);
2089 /* copy cache */
2090 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2091 req_zone_id, cpflg);
2092 mutex_enter(&vmu_data.vmu_lock);
2093 vmu_cache_rele(cache);
2094 mutex_exit(&vmu_data.vmu_lock);
2095
2096 return (ret);
2097 }
2098 vmu_data.vmu_pending_flags |= flags;
2099 vmu_data.vmu_pending_waiters++;
2100 while (vmu_data.vmu_calc_thread != NULL) {
2101 if (cv_wait_sig(&vmu_data.vmu_cv,
2102 &vmu_data.vmu_lock) == 0) {
2103 vmu_data.vmu_pending_waiters--;
2104 mutex_exit(&vmu_data.vmu_lock);
2105 return (set_errno(EINTR));
2106 }
2107 }
2108 vmu_data.vmu_pending_waiters--;
2109 goto start;
2110 }
2111
2112 #if defined(__x86)
2113 /*
2114 * Attempt to invalidate all of the pages in the mapping for the given process.
2115 */
2116 static void
2117 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2118 {
2119 page_t *pp;
2120 size_t psize;
2121 u_offset_t off;
2122 caddr_t eaddr;
2123 struct vnode *vp;
2124 struct segvn_data *svd;
2125 struct hat *victim_hat;
2126
2127 ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2128
2129 victim_hat = p->p_as->a_hat;
2130 svd = (struct segvn_data *)seg->s_data;
2131 vp = svd->vp;
2132 psize = page_get_pagesize(seg->s_szc);
2133
2134 off = svd->offset + (uintptr_t)(addr - seg->s_base);
2135
2136 for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2137 pp = page_lookup_nowait(vp, off, SE_SHARED);
2138
2139 if (pp != NULL) {
2140 /* following logic based on pvn_getdirty() */
2141
2142 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2143 page_unlock(pp);
2144 continue;
2145 }
2146
2147 page_io_lock(pp);
2148 hat_page_inval(pp, 0, victim_hat);
2149 page_io_unlock(pp);
2150
2151 /*
2152 * For B_INVALCURONLY-style handling we let
2153 * page_release call VN_DISPOSE if no one else is using
2154 * the page.
2155 *
2156 * A hat_ismod() check would be useless because:
2157 * (1) we are not holding the SE_EXCL lock
2158 * (2) we've not unloaded _all_ translations
2159 *
2160 * Let page_release() do the heavy-lifting.
2161 */
2162 (void) page_release(pp, 1);
2163 }
2164 }
2165 }
2166
2167 /*
2168 * vm_map_inval()
2169 *
2170 * Invalidate as many pages as possible within the given mapping for the given
2171 * process. addr is expected to be the base address of the mapping and size is
2172 * the length of the mapping. In some cases a mapping will encompass an
2173 * entire segment, but at least for anon or stack mappings, these will be
2174 * regions within a single large segment. Thus, the invalidation is oriented
2175 * around a single mapping and not an entire segment.
2176 *
2177 * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
2178 * this code is only applicable to x86.
2179 */
2180 int
2181 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2182 {
2183 int ret;
2184 int error = 0;
2185 proc_t *p; /* target proc */
2186 struct as *as; /* target proc's address space */
2187 struct seg *seg; /* working segment */
2188
2189 if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2190 return (set_errno(EPERM));
2191
2192 /* If not a valid mapping address, return an error */
2193 if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2194 return (set_errno(EINVAL));
2195
2196 again:
2197 mutex_enter(&pidlock);
2198 p = prfind(pid);
2199 if (p == NULL) {
2200 mutex_exit(&pidlock);
2201 return (set_errno(ESRCH));
2202 }
2203
2204 mutex_enter(&p->p_lock);
2205 mutex_exit(&pidlock);
2206
2207 if (panicstr != NULL) {
2208 mutex_exit(&p->p_lock);
2209 return (0);
2210 }
2211
2212 as = p->p_as;
2213
2214 /*
2215 * Try to set P_PR_LOCK - prevents process "changing shape"
2216 * - blocks fork
2217 * - blocks sigkill
2218 * - cannot be a system proc
2219 * - must be fully created proc
2220 */
2221 ret = sprtrylock_proc(p);
2222 if (ret == -1) {
2223 /* Process in invalid state */
2224 mutex_exit(&p->p_lock);
2225 return (set_errno(ESRCH));
2226 }
2227
2228 if (ret == 1) {
2229 /*
2230 * P_PR_LOCK is already set. Wait and try again. This also
2231 * drops p_lock so p may no longer be valid since the proc may
2232 * have exited.
2233 */
2234 sprwaitlock_proc(p);
2235 goto again;
2236 }
2237
2238 /* P_PR_LOCK is now set */
2239 mutex_exit(&p->p_lock);
2240
2241 AS_LOCK_ENTER(as, RW_READER);
2242 if ((seg = as_segat(as, addr)) == NULL) {
2243 AS_LOCK_EXIT(as);
2244 mutex_enter(&p->p_lock);
2245 sprunlock(p);
2246 return (set_errno(ENOMEM));
2247 }
2248
2249 /*
2250 * The invalidation behavior only makes sense for vnode-backed segments.
2251 */
2252 if (seg->s_ops != &segvn_ops) {
2253 AS_LOCK_EXIT(as);
2254 mutex_enter(&p->p_lock);
2255 sprunlock(p);
2256 return (0);
2257 }
2258
2259 /*
2260 * If the mapping is out of bounds of the segment, return an error.
2261 */
2262 if ((addr + size) > (seg->s_base + seg->s_size)) {
2263 AS_LOCK_EXIT(as);
2264 mutex_enter(&p->p_lock);
2265 sprunlock(p);
2266 return (set_errno(EINVAL));
2267 }
2268
2269 /*
2270 * Don't use MS_INVALCURPROC flag here since that would eventually
2271 * initiate hat invalidation based on curthread. Since we're doing this
2272 * on behalf of a different process, that would erroneously invalidate
2273 * our own process mappings.
2274 */
2275 error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2276 if (error == 0) {
2277 /*
2278 * Since we didn't invalidate during the sync above, we now
2279 * try to invalidate all of the pages in the mapping.
2280 */
2281 map_inval(p, seg, addr, size);
2282 }
2283 AS_LOCK_EXIT(as);
2284
2285 mutex_enter(&p->p_lock);
2286 sprunlock(p);
2287
2288 if (error)
2289 (void) set_errno(error);
2290 return (error);
2291 }
2292 #endif