OS-5078 illumos#6514 broke vm_usage and lx proc
OS-2969 vm_getusage syscall accurate zone RSS is overcounting
OS-3088 need a lighterweight page invalidation mechanism for zone memcap
OS-881 To workaround OS-580 add support to only invalidate mappings from a single process
OS-750 improve RUSAGESYS_GETVMUSAGE for zoneadmd
OS-399 zone phys. mem. cap should be a rctl and have associated kstat
*** 23,32 ****
--- 23,36 ----
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
+ * Copyright 2016, Joyent, Inc.
+ */
+
+ /*
* vm_usage
*
* This file implements the getvmusage() private system call.
* getvmusage() counts the amount of resident memory pages and swap
* reserved by the specified process collective. A "process collective" is
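For orientation, getvmusage() is also exposed to userland through libc. A minimal sketch of the documented calling pattern from getvmusage(2), reporting RSS and reserved swap for the caller's own zone; this is illustrative only and not part of the diff:

    #include <sys/types.h>
    #include <sys/vm_usage.h>
    #include <stdio.h>

    /*
     * Sketch only: report resident and reserved-swap totals for the
     * caller's own zone.  Error handling is minimal and the 60-second
     * "age" argument is arbitrary.
     */
    int
    print_zone_usage(void)
    {
            vmusage_t res;
            size_t nres = 1;

            if (getvmusage(VMUSAGE_ZONE, 60, &res, &nres) != 0) {
                    perror("getvmusage");
                    return (-1);
            }
            (void) printf("rss=%llu bytes, swap=%llu bytes\n",
                (u_longlong_t)res.vmu_rss_all, (u_longlong_t)res.vmu_swap_all);
            return (0);
    }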
*** 516,526 ****
zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
}
zone->vmz_id = id;
! if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
zone->vmz_projects_hash = mod_hash_create_idhash(
--- 520,531 ----
zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
}
zone->vmz_id = id;
! if ((vmu_data.vmu_calc_flags &
! (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
zone->vmz_projects_hash = mod_hash_create_idhash(
*** 916,925 ****
--- 921,932 ----
if (next == *last)
break;
next = AVL_NEXT(tree, next);
continue;
}
+
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
/*
*** 935,944 ****
--- 942,954 ----
if (ap != NULL)
swap_xlate(ap, &vn, &off);
if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
(page = page_exists(vn, off)) != NULL) {
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
pgmsk = (0x1 << (pgshft - PAGESHIFT))
*** 945,956 ****
--- 955,968 ----
- 1;
}
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
* type, need to split off new bound.
*/
*** 1007,1016 ****
--- 1019,1029 ----
break;
next = AVL_NEXT(tree, next);
continue;
}
+ ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
bound_type = next->vmb_type;
index = next->vmb_start;
while (index <= next->vmb_end) {
/*
*** 1022,1031 ****
--- 1035,1047 ----
uint_t pgshft;
pgcnt_t pgmsk;
if (vnode->v_pages != NULL &&
(page = page_exists(vnode, ptob(index))) != NULL) {
+ if (PP_ISFREE(page))
+ page_type = VMUSAGE_BOUND_NOT_INCORE;
+ else
page_type = VMUSAGE_BOUND_INCORE;
if (page->p_szc > 0) {
pgcnt = page_get_pagecnt(page->p_szc);
pgshft = page_get_shift(page->p_szc);
pgmsk = (0x1 << (pgshft - PAGESHIFT))
*** 1032,1043 ****
--- 1048,1061 ----
- 1;
}
} else {
page_type = VMUSAGE_BOUND_NOT_INCORE;
}
+
if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
next->vmb_type = page_type;
+ bound_type = page_type;
} else if (next->vmb_type != page_type) {
/*
* If current bound type does not match page
* type, need to split off new bound.
*/
*** 1302,1311 ****
--- 1320,1335 ----
p_index++;
s_index++;
}
/*
+ * Pages on the free list aren't counted for the rss.
+ */
+ if (PP_ISFREE(page))
+ continue;
+
+ /*
* Assume anon structs with a refcnt
* of 1 are not COW shared, so there
* is no reason to track them per entity.
*/
if (cnt == 1) {
*** 1459,1470 ****
tmp = vmu_data.vmu_system;
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
! (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
! VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
(mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
(mod_hash_val_t *)&zone);
--- 1483,1495 ----
tmp = vmu_data.vmu_system;
tmp->vme_next_calc = entities;
entities = tmp;
}
if (vmu_data.vmu_calc_flags &
! (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
! VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
! VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
VMUSAGE_ALL_EUSERS)) {
ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
(mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
(mod_hash_val_t *)&zone);
*** 1737,1752 ****
kmem_free(cache, sizeof (vmu_cache_t));
}
}
/*
* Copy out the cached results to a caller. Inspect the callers flags
* and zone to determine which cached results should be copied.
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
! uint_t flags, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
size_t i, count = 0;
size_t bufsize;
--- 1762,1799 ----
kmem_free(cache, sizeof (vmu_cache_t));
}
}
/*
+ * When new data is calculated, update the phys_mem rctl usage value in the
+ * zones.
+ */
+ static void
+ vmu_update_zone_rctls(vmu_cache_t *cache)
+ {
+ vmusage_t *rp;
+ size_t i = 0;
+ zone_t *zp;
+
+ for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
+ if (rp->vmu_type == VMUSAGE_ZONE &&
+ rp->vmu_zoneid != ALL_ZONES) {
+ if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
+ zp->zone_phys_mem = rp->vmu_rss_all;
+ zone_rele(zp);
+ }
+ }
+ }
+ }
+
+ /*
* Copy out the cached results to a caller. Inspect the callers flags
* and zone to determine which cached results should be copied.
*/
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
! uint_t flags, id_t req_zone_id, int cpflg)
{
vmusage_t *result, *out_result;
vmusage_t dummy;
size_t i, count = 0;
size_t bufsize;
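The zone_phys_mem field populated by vmu_update_zone_rctls() is presumably what the new zone physical-memory rctl (OS-399) reports as its usage. A minimal sketch of such an rctl usage callback, assuming a companion change in zone.c with the usual kernel includes; the function name is illustrative, not taken from this diff:

    /*
     * Hypothetical usage callback for the zone physical-memory rctl: report
     * the RSS most recently computed by getvmusage()/vmu_update_zone_rctls().
     */
    /* ARGSUSED */
    static rctl_qty_t
    zone_phys_mem_usage(rctl_t *rctl, struct proc *p)
    {
            ASSERT(MUTEX_HELD(&p->p_lock));
            return (p->p_zone->zone_phys_mem);
    }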
*** 1761,1771 ****
}
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
! if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
types |= VMUSAGE_PROJECTS;
if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
--- 1808,1818 ----
}
/* figure out what results the caller is interested in. */
if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
types |= VMUSAGE_SYSTEM;
! if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
types |= VMUSAGE_ZONE;
if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
VMUSAGE_COL_PROJECTS))
types |= VMUSAGE_PROJECTS;
if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
*** 1824,1833 ****
--- 1871,1886 ----
if (result->vmu_type == VMUSAGE_RUSERS &&
(flags & VMUSAGE_COL_RUSERS) == 0)
continue;
}
+ if (result->vmu_type == VMUSAGE_ZONE &&
+ flags & VMUSAGE_A_ZONE) {
+ /* Skip non-requested zone results */
+ if (result->vmu_zoneid != req_zone_id)
+ continue;
+ } else {
/* Skip "other zone" results if not requested */
if (result->vmu_zoneid != curproc->p_zone->zone_id) {
if (result->vmu_type == VMUSAGE_ZONE &&
(flags & VMUSAGE_ALL_ZONES) == 0)
continue;
*** 1845,1854 ****
--- 1898,1908 ----
if (result->vmu_type == VMUSAGE_EUSERS &&
(flags & (VMUSAGE_ALL_EUSERS |
VMUSAGE_COL_EUSERS)) == 0)
continue;
}
+ }
count++;
if (out_result != NULL) {
if (bufsize < count) {
ret = set_errno(EOVERFLOW);
} else {
*** 1899,1912 ****
vmusage_t *result;
int ret = 0;
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
/*
* Non-global zones cannot request system wide and/or collated
! * results, or the system result, so munge the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
--- 1953,1968 ----
vmusage_t *result;
int ret = 0;
int cacherecent = 0;
hrtime_t now;
uint_t flags_orig;
+ id_t req_zone_id;
/*
* Non-global zones cannot request system wide and/or collated
! * results, or the system result, or usage of another zone, so munge
! * the flags accordingly.
*/
flags_orig = flags;
if (curproc->p_zone != global_zone) {
if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
*** 1922,1941 ****
--- 1978,2016 ----
}
if (flags & VMUSAGE_SYSTEM) {
flags &= ~VMUSAGE_SYSTEM;
flags |= VMUSAGE_ZONE;
}
+ if (flags & VMUSAGE_A_ZONE) {
+ flags &= ~VMUSAGE_A_ZONE;
+ flags |= VMUSAGE_ZONE;
}
+ }
/* Check for unknown flags */
if ((flags & (~VMUSAGE_MASK)) != 0)
return (set_errno(EINVAL));
/* Check for no flags */
if ((flags & VMUSAGE_MASK) == 0)
return (set_errno(EINVAL));
+ /* If requesting results for a specific zone, get the zone ID */
+ if (flags & VMUSAGE_A_ZONE) {
+ size_t bufsize;
+ vmusage_t zreq;
+
+ if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ return (set_errno(EFAULT));
+ /* Requested zone ID is passed in buf, so 0 len not allowed */
+ if (bufsize == 0)
+ return (set_errno(EINVAL));
+ if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ return (set_errno(EFAULT));
+ req_zone_id = zreq.vmu_id;
+ }
+
mutex_enter(&vmu_data.vmu_lock);
now = gethrtime();
start:
if (vmu_data.vmu_cache != NULL) {
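On the caller side of VMUSAGE_A_ZONE, the copyin above means a requesting global-zone process passes the target zone ID in the vmu_id field of the first vmusage_t in buf. A hedged sketch of how a consumer such as zoneadmd might use the new flag; the helper name is illustrative:

    #include <sys/types.h>
    #include <sys/vm_usage.h>

    /*
     * Sketch: fetch the aggregate RSS of one specific zone from the global
     * zone using the new VMUSAGE_A_ZONE flag.  The target zone ID rides in
     * buf[0].vmu_id, matching the copyin logic above.
     */
    static int
    zone_rss_bytes(zoneid_t zid, uint64_t *rssp)
    {
            vmusage_t res;
            size_t nres = 1;

            res.vmu_id = zid;
            if (getvmusage(VMUSAGE_A_ZONE, 60, &res, &nres) != 0)
                    return (-1);
            *rssp = res.vmu_rss_all;
            return (0);
    }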
*** 1951,1961 ****
cache = vmu_data.vmu_cache;
vmu_cache_hold(cache);
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
! cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
cv_broadcast(&vmu_data.vmu_cv);
mutex_exit(&vmu_data.vmu_lock);
--- 2026,2036 ----
cache = vmu_data.vmu_cache;
vmu_cache_hold(cache);
mutex_exit(&vmu_data.vmu_lock);
ret = vmu_copyout_results(cache, buf, nres, flags_orig,
! req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
if (vmu_data.vmu_pending_waiters > 0)
cv_broadcast(&vmu_data.vmu_cv);
mutex_exit(&vmu_data.vmu_lock);
*** 2007,2018 ****
if (vmu_data.vmu_pending_waiters > 0)
cv_broadcast(&vmu_data.vmu_cv);
mutex_exit(&vmu_data.vmu_lock);
/* copy cache */
! ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);
return (ret);
--- 2082,2096 ----
if (vmu_data.vmu_pending_waiters > 0)
cv_broadcast(&vmu_data.vmu_cv);
mutex_exit(&vmu_data.vmu_lock);
+ /* update zone's phys. mem. rctl usage */
+ vmu_update_zone_rctls(cache);
/* copy cache */
! ret = vmu_copyout_results(cache, buf, nres, flags_orig,
! req_zone_id, cpflg);
mutex_enter(&vmu_data.vmu_lock);
vmu_cache_rele(cache);
mutex_exit(&vmu_data.vmu_lock);
return (ret);
*** 2028,2032 ****
--- 2106,2292 ----
}
}
vmu_data.vmu_pending_waiters--;
goto start;
}
+
+ #if defined(__x86)
+ /*
+ * Attempt to invalidate all of the pages in the mapping for the given process.
+ */
+ static void
+ map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
+ {
+ page_t *pp;
+ size_t psize;
+ u_offset_t off;
+ caddr_t eaddr;
+ struct vnode *vp;
+ struct segvn_data *svd;
+ struct hat *victim_hat;
+
+ ASSERT((addr + size) <= (seg->s_base + seg->s_size));
+
+ victim_hat = p->p_as->a_hat;
+ svd = (struct segvn_data *)seg->s_data;
+ vp = svd->vp;
+ psize = page_get_pagesize(seg->s_szc);
+
+ off = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+ for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
+ pp = page_lookup_nowait(vp, off, SE_SHARED);
+
+ if (pp != NULL) {
+ /* following logic based on pvn_getdirty() */
+
+ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ page_unlock(pp);
+ continue;
+ }
+
+ page_io_lock(pp);
+ hat_page_inval(pp, 0, victim_hat);
+ page_io_unlock(pp);
+
+ /*
+ * For B_INVALCURONLY-style handling we let
+ * page_release call VN_DISPOSE if no one else is using
+ * the page.
+ *
+ * A hat_ismod() check would be useless because:
+ * (1) we are not holding the SE_EXCL lock
+ * (2) we've not unloaded _all_ translations
+ *
+ * Let page_release() do the heavy-lifting.
+ */
+ (void) page_release(pp, 1);
+ }
+ }
+ }
+
+ /*
+ * vm_map_inval()
+ *
+ * Invalidate as many pages as possible within the given mapping for the given
+ * process. addr is expected to be the base address of the mapping and size is
+ * the length of the mapping. In some cases a mapping will encompass an
+ * entire segment, but at least for anon or stack mappings, these will be
+ * regions within a single large segment. Thus, the invalidation is oriented
+ * around a single mapping and not an entire segment.
+ *
+ * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
+ * this code is only applicable to x86.
+ */
+ int
+ vm_map_inval(pid_t pid, caddr_t addr, size_t size)
+ {
+ int ret;
+ int error = 0;
+ proc_t *p; /* target proc */
+ struct as *as; /* target proc's address space */
+ struct seg *seg; /* working segment */
+
+ if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
+ return (set_errno(EPERM));
+
+ /* If not a valid mapping address, return an error */
+ if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
+ return (set_errno(EINVAL));
+
+ again:
+ mutex_enter(&pidlock);
+ p = prfind(pid);
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if (panicstr != NULL) {
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+
+ as = p->p_as;
+
+ /*
+ * Try to set P_PR_LOCK - prevents process "changing shape"
+ * - blocks fork
+ * - blocks sigkill
+ * - cannot be a system proc
+ * - must be fully created proc
+ */
+ ret = sprtrylock_proc(p);
+ if (ret == -1) {
+ /* Process in invalid state */
+ mutex_exit(&p->p_lock);
+ return (set_errno(ESRCH));
+ }
+
+ if (ret == 1) {
+ /*
+ * P_PR_LOCK is already set. Wait and try again. This also
+ * drops p_lock so p may no longer be valid since the proc may
+ * have exited.
+ */
+ sprwaitlock_proc(p);
+ goto again;
+ }
+
+ /* P_PR_LOCK is now set */
+ mutex_exit(&p->p_lock);
+
+ AS_LOCK_ENTER(as, RW_READER);
+ if ((seg = as_segat(as, addr)) == NULL) {
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ return (set_errno(ENOMEM));
+ }
+
+ /*
+ * The invalidation behavior only makes sense for vnode-backed segments.
+ */
+ if (seg->s_ops != &segvn_ops) {
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ return (0);
+ }
+
+ /*
+ * If the mapping is out of bounds of the segment, return an error.
+ */
+ if ((addr + size) > (seg->s_base + seg->s_size)) {
+ AS_LOCK_EXIT(as);
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Don't use MS_INVALCURPROC flag here since that would eventually
+ * initiate hat invalidation based on curthread. Since we're doing this
+ * on behalf of a different process, that would erroneously invalidate
+ * our own process mappings.
+ */
+ error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
+ if (error == 0) {
+ /*
+ * Since we didn't invalidate during the sync above, we now
+ * try to invalidate all of the pages in the mapping.
+ */
+ map_inval(p, seg, addr, size);
+ }
+ AS_LOCK_EXIT(as);
+
+ mutex_enter(&p->p_lock);
+ sprunlock(p);
+
+ if (error)
+ (void) set_errno(error);
+ return (error);
+ }
+ #endif
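Since vm_map_inval() rejects addresses that are not page-aligned, an in-kernel caller is expected to align the mapping first. A minimal, hypothetical wrapper sketch; how the (pid, base, len) triple is obtained (for example, by a zone memory-cap enforcement thread walking the victim's mappings) is an assumption outside this change:

    #if defined(__x86)
    /*
     * Hypothetical helper: page-align a mapping description before handing
     * it to vm_map_inval().  The name and the caller are illustrative only.
     */
    static int
    memcap_inval_mapping(pid_t pid, caddr_t base, size_t len)
    {
            caddr_t addr = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
            size_t size = len + (size_t)(base - addr);

            return (vm_map_inval(pid, addr, size));
    }
    #endif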