OS-5078 illumos#6514 broke vm_usage and lx proc
OS-2969 vm_getusage syscall accurate zone RSS is overcounting
OS-3088 need a lighterweight page invalidation mechanism for zone memcap
OS-881 To workaround OS-580 add support to only invalidate mappings from a single process
OS-750 improve RUSAGESYS_GETVMUSAGE for zoneadmd
OS-399 zone phys. mem. cap should be a rctl and have associated kstat

*** 23,32 ****
--- 23,36 ----
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  
  /*
+  * Copyright 2016, Joyent, Inc.
+  */
+ 
+ /*
   * vm_usage
   *
   * This file implements the getvmusage() private system call.
   * getvmusage() counts the amount of resident memory pages and swap
   * reserved by the specified process collective. A "process collective" is
*** 516,526 ****
  		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
  	}
  
  	zone->vmz_id = id;
  
! 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
  		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
  
  	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
  	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
  		zone->vmz_projects_hash = mod_hash_create_idhash(
--- 520,531 ----
  		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
  	}
  
  	zone->vmz_id = id;
  
! 	if ((vmu_data.vmu_calc_flags &
! 	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
  		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
  
  	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
  	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
  		zone->vmz_projects_hash = mod_hash_create_idhash(
*** 916,925 ****
--- 921,932 ----
  			if (next == *last)
  				break;
  			next = AVL_NEXT(tree, next);
  			continue;
  		}
+ 
+ 		ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
  		bound_type = next->vmb_type;
  		index = next->vmb_start;
  		while (index <= next->vmb_end) {
  
  			/*
*** 935,944 ****
--- 942,954 ----
  			if (ap != NULL)
  				swap_xlate(ap, &vn, &off);
  			if (ap != NULL && vn != NULL &&
  			    vn->v_pages != NULL &&
  			    (page = page_exists(vn, off)) != NULL) {
+ 				if (PP_ISFREE(page))
+ 					page_type = VMUSAGE_BOUND_NOT_INCORE;
+ 				else
  					page_type = VMUSAGE_BOUND_INCORE;
  				if (page->p_szc > 0) {
  					pgcnt = page_get_pagecnt(page->p_szc);
  					pgshft = page_get_shift(page->p_szc);
  					pgmsk = (0x1 << (pgshft - PAGESHIFT))
*** 945,956 ****
--- 955,968 ----
  					    - 1;
  				}
  			} else {
  				page_type = VMUSAGE_BOUND_NOT_INCORE;
  			}
+ 
  			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
  				next->vmb_type = page_type;
+ 				bound_type = page_type;
  			} else if (next->vmb_type != page_type) {
  				/*
  				 * If current bound type does not match page
  				 * type, need to split off new bound.
  				 */
*** 1007,1016 ****
--- 1019,1029 ----
  				break;
  			next = AVL_NEXT(tree, next);
  			continue;
  		}
  
+ 		ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
  		bound_type = next->vmb_type;
  		index = next->vmb_start;
  		while (index <= next->vmb_end) {
  
  			/*
*** 1022,1031 ****
--- 1035,1047 ----
  			uint_t pgshft;
  			pgcnt_t pgmsk;
  
  			if (vnode->v_pages != NULL &&
  			    (page = page_exists(vnode, ptob(index))) != NULL) {
+ 				if (PP_ISFREE(page))
+ 					page_type = VMUSAGE_BOUND_NOT_INCORE;
+ 				else
  					page_type = VMUSAGE_BOUND_INCORE;
  				if (page->p_szc > 0) {
  					pgcnt = page_get_pagecnt(page->p_szc);
  					pgshft = page_get_shift(page->p_szc);
  					pgmsk = (0x1 << (pgshft - PAGESHIFT))
*** 1032,1043 ****
--- 1048,1061 ----
  					    - 1;
  				}
  			} else {
  				page_type = VMUSAGE_BOUND_NOT_INCORE;
  			}
+ 
  			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
  				next->vmb_type = page_type;
+ 				bound_type = page_type;
  			} else if (next->vmb_type != page_type) {
  				/*
  				 * If current bound type does not match page
  				 * type, need to split off new bound.
  				 */
*** 1302,1311 ****
--- 1320,1335 ----
  				p_index++;
  				s_index++;
  			}
  
  			/*
+ 			 * Pages on the free list aren't counted for the rss.
+ 			 */
+ 			if (PP_ISFREE(page))
+ 				continue;
+ 
+ 			/*
  			 * Assume anon structs with a refcnt
  			 * of 1 are not COW shared, so there
  			 * is no reason to track them per entity.
  			 */
  			if (cnt == 1) {
*** 1459,1470 ****
  		tmp = vmu_data.vmu_system;
  		tmp->vme_next_calc = entities;
  		entities = tmp;
  	}
  	if (vmu_data.vmu_calc_flags &
! 	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
! 	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
  	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
  	    VMUSAGE_ALL_EUSERS)) {
  		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
  		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
  		    (mod_hash_val_t *)&zone);
--- 1483,1495 ----
  		tmp = vmu_data.vmu_system;
  		tmp->vme_next_calc = entities;
  		entities = tmp;
  	}
  	if (vmu_data.vmu_calc_flags &
! 	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
! 	    VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
! 	    VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
  	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
  	    VMUSAGE_ALL_EUSERS)) {
  		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
  		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
  		    (mod_hash_val_t *)&zone);
*** 1737,1752 ****
  		kmem_free(cache, sizeof (vmu_cache_t));
  	}
  }
  
  /*
   * Copy out the cached results to a caller.  Inspect the callers flags
   * and zone to determine which cached results should be copied.
   */
  static int
  vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
!     uint_t flags, int cpflg)
  {
  	vmusage_t *result, *out_result;
  	vmusage_t dummy;
  	size_t i, count = 0;
  	size_t bufsize;
--- 1762,1799 ----
  		kmem_free(cache, sizeof (vmu_cache_t));
  	}
  }
  
  /*
+  * When new data is calculated, update the phys_mem rctl usage value in the
+  * zones.
+  */
+ static void
+ vmu_update_zone_rctls(vmu_cache_t *cache)
+ {
+ 	vmusage_t *rp;
+ 	size_t i = 0;
+ 	zone_t *zp;
+ 
+ 	for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
+ 		if (rp->vmu_type == VMUSAGE_ZONE &&
+ 		    rp->vmu_zoneid != ALL_ZONES) {
+ 			if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
+ 				zp->zone_phys_mem = rp->vmu_rss_all;
+ 				zone_rele(zp);
+ 			}
+ 		}
+ 	}
+ }
+ 
+ /*
   * Copy out the cached results to a caller.  Inspect the callers flags
   * and zone to determine which cached results should be copied.
   */
  static int
  vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
!     uint_t flags, id_t req_zone_id, int cpflg)
  {
  	vmusage_t *result, *out_result;
  	vmusage_t dummy;
  	size_t i, count = 0;
  	size_t bufsize;
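Note: vmu_update_zone_rctls() is what keeps zone_phys_mem current, so the zone's physical-memory rctl and any kstat built on it (OS-399) can report usage without a fresh address-space scan. A minimal sketch of a userland reader follows; it assumes the SmartOS "memory_cap" kstat module and its "rss" statistic, neither of which appears in this diff (compile with -lkstat):

	#include <kstat.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		kstat_ctl_t *kc;
		kstat_t *ksp;
		kstat_named_t *kn;

		if ((kc = kstat_open()) == NULL) {
			perror("kstat_open");
			return (1);
		}
		/* walk the chain; one memory_cap kstat is assumed per zone */
		for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
			if (strcmp(ksp->ks_module, "memory_cap") != 0)
				continue;
			if (kstat_read(kc, ksp, NULL) == -1)
				continue;
			if ((kn = kstat_data_lookup(ksp, "rss")) != NULL)
				(void) printf("%s rss %llu\n", ksp->ks_name,
				    (u_longlong_t)kn->value.ui64);
		}
		(void) kstat_close(kc);
		return (0);
	}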
*** 1761,1771 ****
  	}
  
  	/* figure out what results the caller is interested in. */
  	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
  		types |= VMUSAGE_SYSTEM;
! 	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
  		types |= VMUSAGE_ZONE;
  	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
  	    VMUSAGE_COL_PROJECTS))
  		types |= VMUSAGE_PROJECTS;
  	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
--- 1808,1818 ----
  	}
  
  	/* figure out what results the caller is interested in. */
  	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
  		types |= VMUSAGE_SYSTEM;
! 	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
  		types |= VMUSAGE_ZONE;
  	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
  	    VMUSAGE_COL_PROJECTS))
  		types |= VMUSAGE_PROJECTS;
  	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
*** 1824,1833 ****
--- 1871,1886 ----
  			if (result->vmu_type == VMUSAGE_RUSERS &&
  			    (flags & VMUSAGE_COL_RUSERS) == 0)
  				continue;
  		}
  
+ 		if (result->vmu_type == VMUSAGE_ZONE &&
+ 		    flags & VMUSAGE_A_ZONE) {
+ 			/* Skip non-requested zone results */
+ 			if (result->vmu_zoneid != req_zone_id)
+ 				continue;
+ 		} else {
  		/* Skip "other zone" results if not requested */
  		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
  			if (result->vmu_type == VMUSAGE_ZONE &&
  			    (flags & VMUSAGE_ALL_ZONES) == 0)
  				continue;
*** 1845,1854 ****
--- 1898,1908 ----
  			if (result->vmu_type == VMUSAGE_EUSERS &&
  			    (flags & (VMUSAGE_ALL_EUSERS |
  			    VMUSAGE_COL_EUSERS)) == 0)
  				continue;
  		}
+ 		}
  		count++;
  		if (out_result != NULL) {
  			if (bufsize < count) {
  				ret = set_errno(EOVERFLOW);
  			} else {
*** 1899,1912 ****
  	vmusage_t *result;
  	int ret = 0;
  	int cacherecent = 0;
  	hrtime_t now;
  	uint_t flags_orig;
  
  	/*
  	 * Non-global zones cannot request system wide and/or collated
! 	 * results, or the system result, so munge the flags accordingly.
  	 */
  	flags_orig = flags;
  	if (curproc->p_zone != global_zone) {
  		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
  			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
--- 1953,1968 ----
  	vmusage_t *result;
  	int ret = 0;
  	int cacherecent = 0;
  	hrtime_t now;
  	uint_t flags_orig;
+ 	id_t req_zone_id;
  
  	/*
  	 * Non-global zones cannot request system wide and/or collated
! 	 * results, or the system result, or usage of another zone, so munge
! 	 * the flags accordingly.
  	 */
  	flags_orig = flags;
  	if (curproc->p_zone != global_zone) {
  		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
  			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
*** 1922,1941 ****
--- 1978,2016 ----
  		}
  		if (flags & VMUSAGE_SYSTEM) {
  			flags &= ~VMUSAGE_SYSTEM;
  			flags |= VMUSAGE_ZONE;
  		}
+ 		if (flags & VMUSAGE_A_ZONE) {
+ 			flags &= ~VMUSAGE_A_ZONE;
+ 			flags |= VMUSAGE_ZONE;
  		}
+ 	}
  
  	/* Check for unknown flags */
  	if ((flags & (~VMUSAGE_MASK)) != 0)
  		return (set_errno(EINVAL));
  
  	/* Check for no flags */
  	if ((flags & VMUSAGE_MASK) == 0)
  		return (set_errno(EINVAL));
  
+ 	/* If requesting results for a specific zone, get the zone ID */
+ 	if (flags & VMUSAGE_A_ZONE) {
+ 		size_t bufsize;
+ 		vmusage_t zreq;
+ 
+ 		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+ 			return (set_errno(EFAULT));
+ 		/* Requested zone ID is passed in buf, so 0 len not allowed */
+ 		if (bufsize == 0)
+ 			return (set_errno(EINVAL));
+ 		if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+ 			return (set_errno(EFAULT));
+ 		req_zone_id = zreq.vmu_id;
+ 	}
+ 
  	mutex_enter(&vmu_data.vmu_lock);
  	now = gethrtime();
  
  start:
  	if (vmu_data.vmu_cache != NULL) {
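With the copyin logic above, a VMUSAGE_A_ZONE caller passes the requested zone ID in through the first vmusage_t of the result buffer, and *nres must be non-zero so the kernel will read it. Below is a minimal sketch of a global-zone consumer using the getvmusage(2) wrapper; note the VMUSAGE_A_ZONE definition itself lives in the sys/vm_usage.h half of this change, which is not shown in this file:

	#include <sys/vm_usage.h>
	#include <stdio.h>
	#include <stdlib.h>

	int
	main(int argc, char *argv[])
	{
		vmusage_t buf[1];
		size_t nres = 1;	/* must be non-zero; buf carries the zone ID in */

		if (argc != 2) {
			(void) fprintf(stderr, "usage: %s zoneid\n", argv[0]);
			return (1);
		}
		buf[0].vmu_id = atoi(argv[1]);	/* zone whose usage we want */

		/* cached results up to 10 seconds old are acceptable */
		if (getvmusage(VMUSAGE_A_ZONE, 10, buf, &nres) != 0) {
			perror("getvmusage");
			return (1);
		}
		(void) printf("zone %d: rss %llu swap %llu\n",
		    (int)buf[0].vmu_zoneid,
		    (u_longlong_t)buf[0].vmu_rss_all,
		    (u_longlong_t)buf[0].vmu_swap_all);
		return (0);
	}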
*** 1951,1961 ****
  			cache = vmu_data.vmu_cache;
  			vmu_cache_hold(cache);
  			mutex_exit(&vmu_data.vmu_lock);
  			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
! 			    cpflg);
  			mutex_enter(&vmu_data.vmu_lock);
  			vmu_cache_rele(cache);
  			if (vmu_data.vmu_pending_waiters > 0)
  				cv_broadcast(&vmu_data.vmu_cv);
  			mutex_exit(&vmu_data.vmu_lock);
--- 2026,2036 ----
  			cache = vmu_data.vmu_cache;
  			vmu_cache_hold(cache);
  			mutex_exit(&vmu_data.vmu_lock);
  			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
! 			    req_zone_id, cpflg);
  			mutex_enter(&vmu_data.vmu_lock);
  			vmu_cache_rele(cache);
  			if (vmu_data.vmu_pending_waiters > 0)
  				cv_broadcast(&vmu_data.vmu_cv);
  			mutex_exit(&vmu_data.vmu_lock);
*** 2007,2018 ****
  
  	if (vmu_data.vmu_pending_waiters > 0)
  		cv_broadcast(&vmu_data.vmu_cv);
  	mutex_exit(&vmu_data.vmu_lock);
  
  	/* copy cache */
! 	ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
  	mutex_enter(&vmu_data.vmu_lock);
  	vmu_cache_rele(cache);
  	mutex_exit(&vmu_data.vmu_lock);
  
  	return (ret);
--- 2082,2096 ----
  
  	if (vmu_data.vmu_pending_waiters > 0)
  		cv_broadcast(&vmu_data.vmu_cv);
  	mutex_exit(&vmu_data.vmu_lock);
  
+ 	/* update zone's phys. mem. rctl usage */
+ 	vmu_update_zone_rctls(cache);
  	/* copy cache */
! 	ret = vmu_copyout_results(cache, buf, nres, flags_orig,
! 	    req_zone_id, cpflg);
  	mutex_enter(&vmu_data.vmu_lock);
  	vmu_cache_rele(cache);
  	mutex_exit(&vmu_data.vmu_lock);
  
  	return (ret);
*** 2028,2032 ****
--- 2106,2292 ----
  		}
  	}
  	vmu_data.vmu_pending_waiters--;
  	goto start;
  }
+ 
+ #if defined(__x86)
+ /*
+  * Attempt to invalidate all of the pages in the mapping for the given process.
+  */
+ static void
+ map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
+ {
+ 	page_t *pp;
+ 	size_t psize;
+ 	u_offset_t off;
+ 	caddr_t eaddr;
+ 	struct vnode *vp;
+ 	struct segvn_data *svd;
+ 	struct hat *victim_hat;
+ 
+ 	ASSERT((addr + size) <= (seg->s_base + seg->s_size));
+ 
+ 	victim_hat = p->p_as->a_hat;
+ 	svd = (struct segvn_data *)seg->s_data;
+ 	vp = svd->vp;
+ 	psize = page_get_pagesize(seg->s_szc);
+ 
+ 	off = svd->offset + (uintptr_t)(addr - seg->s_base);
+ 
+ 	for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
+ 		pp = page_lookup_nowait(vp, off, SE_SHARED);
+ 
+ 		if (pp != NULL) {
+ 			/* following logic based on pvn_getdirty() */
+ 
+ 			if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+ 				page_unlock(pp);
+ 				continue;
+ 			}
+ 
+ 			page_io_lock(pp);
+ 			hat_page_inval(pp, 0, victim_hat);
+ 			page_io_unlock(pp);
+ 
+ 			/*
+ 			 * For B_INVALCURONLY-style handling we let
+ 			 * page_release call VN_DISPOSE if no one else is using
+ 			 * the page.
+ 			 *
+ 			 * A hat_ismod() check would be useless because:
+ 			 * (1) we are not be holding SE_EXCL lock
+ 			 * (2) we've not unloaded _all_ translations
+ 			 *
+ 			 * Let page_release() do the heavy-lifting.
+ 			 */
+ 			(void) page_release(pp, 1);
+ 		}
+ 	}
+ }
+ 
+ /*
+  * vm_map_inval()
+  *
+  * Invalidate as many pages as possible within the given mapping for the given
+  * process. addr is expected to be the base address of the mapping and size is
+  * the length of the mapping. In some cases a mapping will encompass an
+  * entire segment, but at least for anon or stack mappings, these will be
+  * regions within a single large segment. Thus, the invalidation is oriented
+  * around a single mapping and not an entire segment.
+  *
+  * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
+  * this code is only applicable to x86.
+  */
+ int
+ vm_map_inval(pid_t pid, caddr_t addr, size_t size)
+ {
+ 	int ret;
+ 	int error = 0;
+ 	proc_t *p;		/* target proc */
+ 	struct as *as;		/* target proc's address space */
+ 	struct seg *seg;	/* working segment */
+ 
+ 	if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
+ 		return (set_errno(EPERM));
+ 
+ 	/* If not a valid mapping address, return an error */
+ 	if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
+ 		return (set_errno(EINVAL));
+ 
+ again:
+ 	mutex_enter(&pidlock);
+ 	p = prfind(pid);
+ 	if (p == NULL) {
+ 		mutex_exit(&pidlock);
+ 		return (set_errno(ESRCH));
+ 	}
+ 
+ 	mutex_enter(&p->p_lock);
+ 	mutex_exit(&pidlock);
+ 
+ 	if (panicstr != NULL) {
+ 		mutex_exit(&p->p_lock);
+ 		return (0);
+ 	}
+ 
+ 	as = p->p_as;
+ 
+ 	/*
+ 	 * Try to set P_PR_LOCK - prevents process "changing shape"
+ 	 * - blocks fork
+ 	 * - blocks sigkill
+ 	 * - cannot be a system proc
+ 	 * - must be fully created proc
+ 	 */
+ 	ret = sprtrylock_proc(p);
+ 	if (ret == -1) {
+ 		/* Process in invalid state */
+ 		mutex_exit(&p->p_lock);
+ 		return (set_errno(ESRCH));
+ 	}
+ 
+ 	if (ret == 1) {
+ 		/*
+ 		 * P_PR_LOCK is already set. Wait and try again. This also
+ 		 * drops p_lock so p may no longer be valid since the proc may
+ 		 * have exited.
+ 		 */
+ 		sprwaitlock_proc(p);
+ 		goto again;
+ 	}
+ 
+ 	/* P_PR_LOCK is now set */
+ 	mutex_exit(&p->p_lock);
+ 
+ 	AS_LOCK_ENTER(as, RW_READER);
+ 	if ((seg = as_segat(as, addr)) == NULL) {
+ 		AS_LOCK_EXIT(as);
+ 		mutex_enter(&p->p_lock);
+ 		sprunlock(p);
+ 		return (set_errno(ENOMEM));
+ 	}
+ 
+ 	/*
+ 	 * The invalidation behavior only makes sense for vnode-backed segments.
+ 	 */
+ 	if (seg->s_ops != &segvn_ops) {
+ 		AS_LOCK_EXIT(as);
+ 		mutex_enter(&p->p_lock);
+ 		sprunlock(p);
+ 		return (0);
+ 	}
+ 
+ 	/*
+ 	 * If the mapping is out of bounds of the segement return an error.
+ 	 */
+ 	if ((addr + size) > (seg->s_base + seg->s_size)) {
+ 		AS_LOCK_EXIT(as);
+ 		mutex_enter(&p->p_lock);
+ 		sprunlock(p);
+ 		return (set_errno(EINVAL));
+ 	}
+ 
+ 	/*
+ 	 * Don't use MS_INVALCURPROC flag here since that would eventually
+ 	 * initiate hat invalidation based on curthread. Since we're doing this
+ 	 * on behalf of a different process, that would erroneously invalidate
+ 	 * our own process mappings.
+ 	 */
+ 	error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
+ 	if (error == 0) {
+ 		/*
+ 		 * Since we didn't invalidate during the sync above, we now
+ 		 * try to invalidate all of the pages in the mapping.
+ 		 */
+ 		map_inval(p, seg, addr, size);
+ 	}
+ 	AS_LOCK_EXIT(as);
+ 
+ 	mutex_enter(&p->p_lock);
+ 	sprunlock(p);
+ 
+ 	if (error)
+ 		(void) set_errno(error);
+ 	return (error);
+ }
+ #endif
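No caller of vm_map_inval() appears in this file; per OS-3088 it exists to give zone memory capping (zoneadmd in SmartOS) a lighter-weight way to push a process's pages out. A hypothetical sketch of how a global-zone capper might drive it follows, walking /proc/<pid>/map for candidate mappings. inval_mapping() is an invented stand-in for whatever private entry point actually reaches vm_map_inval(), since that plumbing is not part of this diff:

	#include <sys/types.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <procfs.h>

	extern int inval_mapping(pid_t, caddr_t, size_t);	/* hypothetical */

	static void
	pageout_process(pid_t pid)
	{
		char path[64];
		prmap_t map;
		int fd;

		(void) snprintf(path, sizeof (path), "/proc/%d/map", (int)pid);
		if ((fd = open(path, O_RDONLY)) < 0)
			return;

		/* /proc/<pid>/map reads back an array of prmap_t entries */
		while (read(fd, &map, sizeof (map)) == sizeof (map)) {
			/* ISM pages are locked; map_inval() skips locked pages anyway */
			if (map.pr_mflags & MA_ISM)
				continue;
			(void) inval_mapping(pid, (caddr_t)map.pr_vaddr,
			    map.pr_size);
		}
		(void) close(fd);
	}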