OS-5078 illumos#6514 broke vm_usage and lx proc
OS-2969 vm_getusage syscall accurate zone RSS is overcounting
OS-3088 need a lighterweight page invalidation mechanism for zone memcap
OS-881 To work around OS-580, add support to only invalidate mappings from a single process
OS-750 improve RUSAGESYS_GETVMUSAGE for zoneadmd
OS-399 zone phys. mem. cap should be a rctl and have associated kstat
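
For context, here is a minimal userland sketch (not part of the change itself) of how the new VMUSAGE_A_ZONE flag is meant to be driven: the caller must be root in the global zone, and the zone ID of interest is passed in via the vmu_id field of the first buffer element, matching the copyin logic added to vm_getusage() below. The zone ID value, error handling, and header availability are assumptions for illustration; VMUSAGE_A_ZONE is a Joyent-private flag, so it only exists in a patched sys/vm_usage.h.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative only: report the RSS of one specific zone using the
 * VMUSAGE_A_ZONE flag added by this change.  Assumes the build environment
 * has the patched <sys/vm_usage.h> that defines VMUSAGE_A_ZONE and that we
 * are running as root in the global zone.
 */
static int
print_zone_rss(zoneid_t zid)
{
	vmusage_t buf[1];
	size_t nres = 1;

	(void) memset(buf, 0, sizeof (buf));
	buf[0].vmu_id = zid;	/* requested zone ID rides in the buffer */

	/* accept cached results up to 10 seconds old */
	if (getvmusage(VMUSAGE_A_ZONE, 10, buf, &nres) != 0) {
		perror("getvmusage");
		return (-1);
	}

	if (nres >= 1 && buf[0].vmu_type == VMUSAGE_ZONE)
		(void) printf("zone %d rss: %llu bytes\n",
		    (int)buf[0].vmu_zoneid,
		    (unsigned long long)buf[0].vmu_rss_all);
	return (0);
}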

@@ -23,10 +23,14 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
+ * Copyright 2016, Joyent, Inc.
+ */
+
+/*
  * vm_usage
  *
  * This file implements the getvmusage() private system call.
  * getvmusage() counts the amount of resident memory pages and swap
  * reserved by the specified process collective. A "process collective" is

@@ -516,11 +520,12 @@
                 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
         }
 
         zone->vmz_id = id;
 
-        if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
+        if ((vmu_data.vmu_calc_flags &
+            (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
                 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 
         if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
             VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
                 zone->vmz_projects_hash = mod_hash_create_idhash(

@@ -916,10 +921,12 @@
                         if (next == *last)
                                 break;
                         next = AVL_NEXT(tree, next);
                         continue;
                 }
+
+                ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
                 bound_type = next->vmb_type;
                 index = next->vmb_start;
                 while (index <= next->vmb_end) {
 
                         /*

@@ -935,10 +942,13 @@
                         if (ap != NULL)
                                 swap_xlate(ap, &vn, &off);
 
                         if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
                             (page = page_exists(vn, off)) != NULL) {
+                                if (PP_ISFREE(page))
+                                        page_type = VMUSAGE_BOUND_NOT_INCORE;
+                                else
                                 page_type = VMUSAGE_BOUND_INCORE;
                                 if (page->p_szc > 0) {
                                         pgcnt = page_get_pagecnt(page->p_szc);
                                         pgshft = page_get_shift(page->p_szc);
                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))

@@ -945,12 +955,14 @@
                                             - 1;
                                 }
                         } else {
                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
                         }
+
                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
                                 next->vmb_type = page_type;
+                                bound_type = page_type;
                         } else if (next->vmb_type != page_type) {
                                 /*
                                  * If current bound type does not match page
                                  * type, need to split off new bound.
                                  */

@@ -1007,10 +1019,11 @@
                                 break;
                         next = AVL_NEXT(tree, next);
                         continue;
                 }
 
+                ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
                 bound_type = next->vmb_type;
                 index = next->vmb_start;
                 while (index <= next->vmb_end) {
 
                         /*

@@ -1022,10 +1035,13 @@
                         uint_t pgshft;
                         pgcnt_t pgmsk;
 
                         if (vnode->v_pages != NULL &&
                             (page = page_exists(vnode, ptob(index))) != NULL) {
+                                if (PP_ISFREE(page))
+                                        page_type = VMUSAGE_BOUND_NOT_INCORE;
+                                else
                                 page_type = VMUSAGE_BOUND_INCORE;
                                 if (page->p_szc > 0) {
                                         pgcnt = page_get_pagecnt(page->p_szc);
                                         pgshft = page_get_shift(page->p_szc);
                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))

@@ -1032,12 +1048,14 @@
                                             - 1;
                                 }
                         } else {
                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
                         }
+
                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
                                 next->vmb_type = page_type;
+                                bound_type = page_type;
                         } else if (next->vmb_type != page_type) {
                                 /*
                                  * If current bound type does not match page
                                  * type, need to split off new bound.
                                  */

@@ -1302,10 +1320,16 @@
                                 p_index++;
                                 s_index++;
                         }
 
                         /*
+                         * Pages on the free list aren't counted for the rss.
+                         */
+                        if (PP_ISFREE(page))
+                                continue;
+
+                        /*
                          * Assume anon structs with a refcnt
                          * of 1 are not COW shared, so there
                          * is no reason to track them per entity.
                          */
                         if (cnt == 1) {

@@ -1459,12 +1483,13 @@
                 tmp = vmu_data.vmu_system;
                 tmp->vme_next_calc = entities;
                 entities = tmp;
         }
         if (vmu_data.vmu_calc_flags &
-            (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
-            VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
+            (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
+            VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
+            VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
             VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
             VMUSAGE_ALL_EUSERS)) {
                 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
                     (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
                     (mod_hash_val_t *)&zone);

@@ -1737,16 +1762,38 @@
                 kmem_free(cache, sizeof (vmu_cache_t));
         }
 }
 
 /*
+ * When new data is calculated, update the phys_mem rctl usage value in the
+ * zones.
+ */
+static void
+vmu_update_zone_rctls(vmu_cache_t *cache)
+{
+        vmusage_t       *rp;
+        size_t          i = 0;
+        zone_t          *zp;
+
+        for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
+                if (rp->vmu_type == VMUSAGE_ZONE &&
+                    rp->vmu_zoneid != ALL_ZONES) {
+                        if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
+                                zp->zone_phys_mem = rp->vmu_rss_all;
+                                zone_rele(zp);
+                        }
+                }
+        }
+}
+
+/*
  * Copy out the cached results to a caller.  Inspect the callers flags
  * and zone to determine which cached results should be copied.
  */
 static int
 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
-    uint_t flags, int cpflg)
+    uint_t flags, id_t req_zone_id, int cpflg)
 {
         vmusage_t *result, *out_result;
         vmusage_t dummy;
         size_t i, count = 0;
         size_t bufsize;

@@ -1761,11 +1808,11 @@
         }
 
         /* figure out what results the caller is interested in. */
         if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
                 types |= VMUSAGE_SYSTEM;
-        if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
+        if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
                 types |= VMUSAGE_ZONE;
         if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
             VMUSAGE_COL_PROJECTS))
                 types |= VMUSAGE_PROJECTS;
         if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))

@@ -1824,10 +1871,16 @@
                         if (result->vmu_type == VMUSAGE_RUSERS &&
                             (flags & VMUSAGE_COL_RUSERS) == 0)
                                 continue;
                 }
 
+                if (result->vmu_type == VMUSAGE_ZONE &&
+                    flags & VMUSAGE_A_ZONE) {
+                        /* Skip non-requested zone results */
+                        if (result->vmu_zoneid != req_zone_id)
+                                continue;
+                } else {
                 /* Skip "other zone" results if not requested */
                 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
                         if (result->vmu_type == VMUSAGE_ZONE &&
                             (flags & VMUSAGE_ALL_ZONES) == 0)
                                 continue;

@@ -1845,10 +1898,11 @@
                         if (result->vmu_type == VMUSAGE_EUSERS &&
                             (flags & (VMUSAGE_ALL_EUSERS |
                             VMUSAGE_COL_EUSERS)) == 0)
                                 continue;
                 }
+                }
                 count++;
                 if (out_result != NULL) {
                         if (bufsize < count) {
                                 ret = set_errno(EOVERFLOW);
                         } else {

@@ -1899,14 +1953,16 @@
         vmusage_t *result;
         int ret = 0;
         int cacherecent = 0;
         hrtime_t now;
         uint_t flags_orig;
+        id_t req_zone_id;
 
         /*
          * Non-global zones cannot request system wide and/or collated
-         * results, or the system result, so munge the flags accordingly.
+         * results, or the system result, or usage of another zone, so munge
+         * the flags accordingly.
          */
         flags_orig = flags;
         if (curproc->p_zone != global_zone) {
                 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
                         flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);

@@ -1922,20 +1978,39 @@
                 }
                 if (flags & VMUSAGE_SYSTEM) {
                         flags &= ~VMUSAGE_SYSTEM;
                         flags |= VMUSAGE_ZONE;
                 }
+                if (flags & VMUSAGE_A_ZONE) {
+                        flags &= ~VMUSAGE_A_ZONE;
+                        flags |= VMUSAGE_ZONE;
         }
+        }
 
         /* Check for unknown flags */
         if ((flags & (~VMUSAGE_MASK)) != 0)
                 return (set_errno(EINVAL));
 
         /* Check for no flags */
         if ((flags & VMUSAGE_MASK) == 0)
                 return (set_errno(EINVAL));
 
+        /* If requesting results for a specific zone, get the zone ID */
+        if (flags & VMUSAGE_A_ZONE) {
+                size_t bufsize;
+                vmusage_t zreq;
+
+                if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
+                        return (set_errno(EFAULT));
+                /* Requested zone ID is passed in buf, so 0 len not allowed */
+                if (bufsize == 0)
+                        return (set_errno(EINVAL));
+                if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
+                        return (set_errno(EFAULT));
+                req_zone_id = zreq.vmu_id;
+        }
+
         mutex_enter(&vmu_data.vmu_lock);
         now = gethrtime();
 
 start:
         if (vmu_data.vmu_cache != NULL) {

@@ -1951,11 +2026,11 @@
                         cache = vmu_data.vmu_cache;
                         vmu_cache_hold(cache);
                         mutex_exit(&vmu_data.vmu_lock);
 
                         ret = vmu_copyout_results(cache, buf, nres, flags_orig,
-                            cpflg);
+                            req_zone_id, cpflg);
                         mutex_enter(&vmu_data.vmu_lock);
                         vmu_cache_rele(cache);
                         if (vmu_data.vmu_pending_waiters > 0)
                                 cv_broadcast(&vmu_data.vmu_cv);
                         mutex_exit(&vmu_data.vmu_lock);

@@ -2007,12 +2082,15 @@
                 if (vmu_data.vmu_pending_waiters > 0)
                         cv_broadcast(&vmu_data.vmu_cv);
 
                 mutex_exit(&vmu_data.vmu_lock);
 
+                /* update zone's phys. mem. rctl usage */
+                vmu_update_zone_rctls(cache);
                 /* copy cache */
-                ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
+                ret = vmu_copyout_results(cache, buf, nres, flags_orig,
+                    req_zone_id, cpflg);
                 mutex_enter(&vmu_data.vmu_lock);
                 vmu_cache_rele(cache);
                 mutex_exit(&vmu_data.vmu_lock);
 
                 return (ret);

@@ -2028,5 +2106,187 @@
                 }
         }
         vmu_data.vmu_pending_waiters--;
         goto start;
 }
+
+#if defined(__x86)
+/*
+ * Attempt to invalidate all of the pages in the mapping for the given process.
+ */
+static void
+map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
+{
+        page_t          *pp;
+        size_t          psize;
+        u_offset_t      off;
+        caddr_t         eaddr;
+        struct vnode    *vp;
+        struct segvn_data *svd;
+        struct hat      *victim_hat;
+
+        ASSERT((addr + size) <= (seg->s_base + seg->s_size));
+
+        victim_hat = p->p_as->a_hat;
+        svd = (struct segvn_data *)seg->s_data;
+        vp = svd->vp;
+        psize = page_get_pagesize(seg->s_szc);
+
+        off = svd->offset + (uintptr_t)(addr - seg->s_base);
+
+        for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
+                pp = page_lookup_nowait(vp, off, SE_SHARED);
+
+                if (pp != NULL) {
+                        /* following logic based on pvn_getdirty() */
+
+                        if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
+                                page_unlock(pp);
+                                continue;
+                        }
+
+                        page_io_lock(pp);
+                        hat_page_inval(pp, 0, victim_hat);
+                        page_io_unlock(pp);
+
+                        /*
+                         * For B_INVALCURONLY-style handling we let
+                         * page_release call VN_DISPOSE if no one else is using
+                         * the page.
+                         *
+                         * A hat_ismod() check would be useless because:
+                         * (1) we are not holding the SE_EXCL lock
+                         * (2) we've not unloaded _all_ translations
+                         *
+                         * Let page_release() do the heavy-lifting.
+                         */
+                        (void) page_release(pp, 1);
+                }
+        }
+}
+
+/*
+ * vm_map_inval()
+ *
+ * Invalidate as many pages as possible within the given mapping for the given
+ * process. addr is expected to be the base address of the mapping and size is
+ * the length of the mapping. In some cases a mapping will encompass an
+ * entire segment, but at least for anon or stack mappings, these will be
+ * regions within a single large segment. Thus, the invalidation is oriented
+ * around a single mapping and not an entire segment.
+ *
+ * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
+ * this code is only applicable to x86.
+ */
+int
+vm_map_inval(pid_t pid, caddr_t addr, size_t size)
+{
+        int ret;
+        int error = 0;
+        proc_t *p;              /* target proc */
+        struct as *as;          /* target proc's address space */
+        struct seg *seg;        /* working segment */
+
+        if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
+                return (set_errno(EPERM));
+
+        /* If not a valid mapping address, return an error */
+        if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
+                return (set_errno(EINVAL));
+
+again:
+        mutex_enter(&pidlock);
+        p = prfind(pid);
+        if (p == NULL) {
+                mutex_exit(&pidlock);
+                return (set_errno(ESRCH));
+        }
+
+        mutex_enter(&p->p_lock);
+        mutex_exit(&pidlock);
+
+        if (panicstr != NULL) {
+                mutex_exit(&p->p_lock);
+                return (0);
+        }
+
+        as = p->p_as;
+
+        /*
+         * Try to set P_PR_LOCK - prevents process "changing shape"
+         * - blocks fork
+         * - blocks sigkill
+         * - cannot be a system proc
+         * - must be fully created proc
+         */
+        ret = sprtrylock_proc(p);
+        if (ret == -1) {
+                /* Process in invalid state */
+                mutex_exit(&p->p_lock);
+                return (set_errno(ESRCH));
+        }
+
+        if (ret == 1) {
+                /*
+                 * P_PR_LOCK is already set. Wait and try again. This also
+                 * drops p_lock so p may no longer be valid since the proc may
+                 * have exited.
+                 */
+                sprwaitlock_proc(p);
+                goto again;
+        }
+
+        /* P_PR_LOCK is now set */
+        mutex_exit(&p->p_lock);
+
+        AS_LOCK_ENTER(as, RW_READER);
+        if ((seg = as_segat(as, addr)) == NULL) {
+                AS_LOCK_EXIT(as);
+                mutex_enter(&p->p_lock);
+                sprunlock(p);
+                return (set_errno(ENOMEM));
+        }
+
+        /*
+         * The invalidation behavior only makes sense for vnode-backed segments.
+         */
+        if (seg->s_ops != &segvn_ops) {
+                AS_LOCK_EXIT(as);
+                mutex_enter(&p->p_lock);
+                sprunlock(p);
+                return (0);
+        }
+
+        /*
+         * If the mapping is out of bounds of the segment, return an error.
+         */
+        if ((addr + size) > (seg->s_base + seg->s_size)) {
+                AS_LOCK_EXIT(as);
+                mutex_enter(&p->p_lock);
+                sprunlock(p);
+                return (set_errno(EINVAL));
+        }
+
+        /*
+         * Don't use MS_INVALCURPROC flag here since that would eventually
+         * initiate hat invalidation based on curthread. Since we're doing this
+         * on behalf of a different process, that would erroneously invalidate
+         * our own process mappings.
+         */
+        error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
+        if (error == 0) {
+                /*
+                 * Since we didn't invalidate during the sync above, we now
+                 * try to invalidate all of the pages in the mapping.
+                 */
+                map_inval(p, seg, addr, size);
+        }
+        AS_LOCK_EXIT(as);
+
+        mutex_enter(&p->p_lock);
+        sprunlock(p);
+
+        if (error)
+                (void) set_errno(error);
+        return (error);
+}
+#endif
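
For the invalidation side, a hypothetical kernel-context sketch of how vm_map_inval() might be driven for one known mapping follows. This is not part of the patch: the pid, base, and len values are assumed to have been discovered elsewhere (e.g. via /proc), the prototype is assumed visible in the patched tree, and the call is x86-only and restricted to root in the global zone per the checks above.

#include <sys/types.h>
#include <sys/param.h>
#include <sys/debug.h>

extern int vm_map_inval(pid_t, caddr_t, size_t);	/* from the patched vm_usage.c */

/*
 * Sketch only: best-effort invalidation of one mapping in a memcap'd zone
 * process.  Returns 0 on success or an errno value from vm_map_inval().
 */
static int
try_invalidate_mapping(pid_t pid, caddr_t base, size_t len)
{
	/* vm_map_inval() rejects addresses that are not page-aligned */
	ASSERT(((uintptr_t)base & PAGEOFFSET) == 0);

	/*
	 * Possible errors per the checks above: EPERM (not root in the
	 * global zone), ESRCH (process went away), EINVAL (bad alignment or
	 * the range runs past the segment), ENOMEM (no segment at base).
	 * All are non-fatal for a best-effort memcap pass.
	 */
	return (vm_map_inval(pid, base, len));
}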