1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2016, Joyent, Inc.
  29  */
  30 
  31 /*
  32  * vm_usage
  33  *
  34  * This file implements the getvmusage() private system call.
  35  * getvmusage() counts the number of resident memory pages and the swap
  36  * reserved by the specified process collective.  A "process collective" is
  37  * the set of processes owned by a particular zone, project, task, or user.
  38  *
  39  * rss and swap are counted so that for a given process collective, a page is
  40  * only counted once.  For example, this means that if multiple processes in
  41  * the same project map the same page, then the project will only be charged
  42  * once for that page.  On the other hand, if two processes in different
  43  * projects map the same page, then both projects will be charged
  44  * for the page.
  45  *
  46  * The vm_getusage() calculation is implemented so that the first thread
  47  * performs the rss/swap counting. Other callers will wait for that thread to
  48  * finish and copy the results.  This enables multiple rcapds and prstats to
  49  * consume data from the same calculation.  The results are also cached so that
  50  * a caller interested in recent results can just copy them instead of starting
  51  * a new calculation.  The caller passes the maximum age (in seconds) of the
  52  * data.  If the cached data is young enough, the cache is copied; otherwise
  53  * a new calculation is executed and the cache is replaced with the new
  54  * data.
  55  *
  56  * The rss calculation for each process collective is as follows:
  57  *
  58  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
  59  *     and/or users.
  60  *   - For each proc:
  61  *      - Figure out proc's collectives (zone, project, task, and/or user).
  62  *      - For each seg in proc's address space:
  63  *              - If seg is private:
  64  *                      - Lookup anons in the amp.
  65  *                      - For incore pages not previously visited for each
  66  *                        of the proc's collectives, add incore pagesize to
  67  *                        each collective.
  68  *                        Anons with a refcnt of 1 can be assumed to be not
  69  *                        previously visited.
  70  *                      - For address ranges without anons in the amp:
  71  *                              - Lookup pages in underlying vnode.
  72  *                              - For incore pages not previously visited for
  73  *                                each of the proc's collectives, add incore
  74  *                                pagesize to each collective.
  75  *              - If seg is shared:
  76  *                      - Lookup pages in the shared amp or vnode.
  77  *                      - For incore pages not previously visited for each of
  78  *                        the proc's collectives, add incore pagesize to each
  79  *                        collective.
  80  *
  81  * Swap is reserved by private segments, and shared anonymous segments.
  82  * The only shared anon segments which do not reserve swap are ISM segments
  83  * and schedctl segments, both of which can be identified by having
  84  * amp->swresv == 0.
  85  *
  86  * The swap calculation for each collective is as follows:
  87  *
  88  *   - Inspect flags, determine if counting swap for zones, projects, tasks,
  89  *     and/or users.
  90  *   - For each proc:
  91  *      - Figure out proc's collectives (zone, project, task, and/or user).
  92  *      - For each seg in proc's address space:
  93  *              - If seg is private:
  94  *                      - Add svd->swresv pages to swap count for each of the
  95  *                        proc's collectives.
  96  *              - If seg is anon, shared, and amp->swresv != 0
  97  *                      - For address ranges in amp not previously visited for
  98  *                        each of the proc's collectives, add size of address
  99  *                        range to the swap count for each collective.
 100  *
 101  * These two calculations are done simultaneously, with most of the work
 102  * being done in vmu_calculate_seg().  The results of the calculation are
 103  * copied into "vmu_data.vmu_cache_results".
 104  *
 105  * To perform the calculation, various things are tracked and cached:
 106  *
 107  *    - incore/not-incore page ranges for all vnodes.
 108  *      (vmu_data.vmu_all_vnodes_hash)
 109  *      This eliminates looking up the same page more than once.
 110  *
 111  *    - incore/not-incore page ranges for all shared amps.
 112  *      (vmu_data.vmu_all_amps_hash)
 113  *      This eliminates looking up the same page more than once.
 114  *
 115  *    - visited page ranges for each collective.
 116  *         - per vnode (entity->vme_vnode_hash)
 117  *         - per shared amp (entity->vme_amp_hash)
 118  *      For accurate counting of map-shared and COW-shared pages.
 119  *
 120  *    - visited private anons (refcnt > 1) for each collective.
 121  *      (entity->vme_anon)
 122  *      For accurate counting of COW-shared pages.
 123  *
 124  * The common accounting structure is the vmu_entity_t, which represents
 125  * collectives:
 126  *
 127  *    - A zone.
 128  *    - A project, task, or user within a zone.
 129  *    - The entire system (vmu_data.vmu_system).
 130  *    - Each collapsed (col) project and user.  This means a given projid or
 131  *      uid, regardless of which zone the process is in.  For instance,
 132  *      project 0 in the global zone and project 0 in a non-global zone are
 133  *      the same collapsed project.
 134  *
 135  *  Each entity structure tracks which pages have been already visited for
 136  *  that entity (via previously inspected processes) so that these pages are
 137  *  not double counted.
 138  */
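/*
 * Illustrative sketch (not compiled as part of this file): how a userland
 * consumer such as rcapd or prstat might call getvmusage().  The flag
 * combination, the 512-entry buffer, and the 60 second maximum age are
 * assumptions for the example only; see getvmusage(2) for the interface.
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	void
 *	print_usage(void)
 *	{
 *		uint_t flags = VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS;
 *		size_t i, nres = 512;
 *		vmusage_t *buf = calloc(nres, sizeof (vmusage_t));
 *
 *		if (buf != NULL && getvmusage(flags, 60, buf, &nres) == 0) {
 *			for (i = 0; i < nres; i++)
 *				(void) printf("type %u id %d rss %llu "
 *				    "swap %llu\n", buf[i].vmu_type,
 *				    (int)buf[i].vmu_id,
 *				    (u_longlong_t)buf[i].vmu_rss_all,
 *				    (u_longlong_t)buf[i].vmu_swap_all);
 *		}
 *		free(buf);
 *	}
 *
 * On success, nres holds the number of vmusage_t results copied out, one
 * per matching entity.
 */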
 139 
 140 #include <sys/errno.h>
 141 #include <sys/types.h>
 142 #include <sys/zone.h>
 143 #include <sys/proc.h>
 144 #include <sys/project.h>
 145 #include <sys/task.h>
 146 #include <sys/thread.h>
 147 #include <sys/time.h>
 148 #include <sys/mman.h>
 149 #include <sys/modhash.h>
 150 #include <sys/modhash_impl.h>
 151 #include <sys/shm.h>
 152 #include <sys/swap.h>
 153 #include <sys/synch.h>
 154 #include <sys/systm.h>
 155 #include <sys/var.h>
 156 #include <sys/vm_usage.h>
 157 #include <sys/zone.h>
 158 #include <sys/sunddi.h>
 159 #include <sys/sysmacros.h>
 160 #include <sys/avl.h>
 161 #include <vm/anon.h>
 162 #include <vm/as.h>
 163 #include <vm/seg_vn.h>
 164 #include <vm/seg_spt.h>
 165 
 166 #define VMUSAGE_HASH_SIZE               512
 167 
 168 #define VMUSAGE_TYPE_VNODE              1
 169 #define VMUSAGE_TYPE_AMP                2
 170 #define VMUSAGE_TYPE_ANON               3
 171 
 172 #define VMUSAGE_BOUND_UNKNOWN           0
 173 #define VMUSAGE_BOUND_INCORE            1
 174 #define VMUSAGE_BOUND_NOT_INCORE        2
 175 
 176 #define ISWITHIN(node, addr)    ((node)->vmb_start <= (addr) && \
 177                                     (node)->vmb_end >= (addr) ? 1 : 0)
 178 
 179 /*
 180  * Bounds for vnodes and shared amps.
 181  * Each bound is either entirely incore, entirely not incore, or
 182  * entirely unknown.  When in use, bounds are stored in an AVL tree sorted
 183  * by the vmb_start member; otherwise (on the free or temporary lists) they
 184  * are strung together off of vmb_next.
 185  */
 186 typedef struct vmu_bound {
 187         avl_node_t vmb_node;
 188         struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
 189         pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
 190         pgcnt_t vmb_end;    /* page offset in vnode/amp on which bound ends */
 191         char    vmb_type;   /* One of VMUSAGE_BOUND_* */
 192 } vmu_bound_t;
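/*
 * Example of how bounds partition an object (page offsets are illustrative
 * only): a vnode whose first ten pages have been inspected might be
 * described by three bounds,
 *
 *	[0, 3]   VMUSAGE_BOUND_INCORE
 *	[4, 7]   VMUSAGE_BOUND_NOT_INCORE
 *	[8, 9]   VMUSAGE_BOUND_UNKNOWN
 *
 * meaning pages 0-3 are resident, pages 4-7 are not, and pages 8-9 have not
 * yet been classified.  Adjacent bounds of the same type are later merged by
 * vmu_merge_bounds().
 */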
 193 
 194 /*
 195  * Hash of visited objects (vnodes or shared amps).
 196  * The key is the address of the vnode or amp.  The bounds tree records the
 197  * known incore/not-incore ranges for that vnode/amp.
 198  */
 199 typedef struct vmu_object {
 200         struct vmu_object       *vmo_next;      /* free list */
 201         caddr_t         vmo_key;
 202         short           vmo_type;
 203         avl_tree_t      vmo_bounds;
 204 } vmu_object_t;
 205 
 206 /*
 207  * Node for tree of visited COW anons.
 208  */
 209 typedef struct vmu_anon {
 210         avl_node_t vma_node;
 211         uintptr_t vma_addr;
 212 } vmu_anon_t;
 213 
 214 /*
 215  * Entity by which to count results.
 216  *
 217  * The entity structure keeps the current rss/swap counts for each entity
 218  * (zone, project, etc), and hashes of vm structures that have already
 219  * been visited for the entity.
 220  *
 221  * vme_next:    links the list of all entities currently being counted by
 222  *              vmu_calculate().
 223  *
 224  * vme_next_calc: links the list of entities related to the current process
 225  *               being counted by vmu_calculate_proc().
 226  *
 227  * vmu_calculate_proc() walks all processes.  For each process, it makes a
 228  * list of the entities related to that process using vme_next_calc.  This
 229  * list changes each time vmu_calculate_proc() is called.
 230  *
 231  */
 232 typedef struct vmu_entity {
 233         struct vmu_entity *vme_next;
 234         struct vmu_entity *vme_next_calc;
 235         mod_hash_t      *vme_vnode_hash; /* vnodes visited for entity */
 236         mod_hash_t      *vme_amp_hash;   /* shared amps visited for entity */
 237         avl_tree_t      vme_anon;        /* COW anons visited for entity */
 238         vmusage_t       vme_result;      /* identifies entity and results */
 239 } vmu_entity_t;
 240 
 241 /*
 242  * Hash of entities visited within a zone, and an entity for the zone
 243  * itself.
 244  */
 245 typedef struct vmu_zone {
 246         struct vmu_zone *vmz_next;      /* free list */
 247         id_t            vmz_id;
 248         vmu_entity_t    *vmz_zone;
 249         mod_hash_t      *vmz_projects_hash;
 250         mod_hash_t      *vmz_tasks_hash;
 251         mod_hash_t      *vmz_rusers_hash;
 252         mod_hash_t      *vmz_eusers_hash;
 253 } vmu_zone_t;
 254 
 255 /*
 256  * Cache of results from last calculation
 257  */
 258 typedef struct vmu_cache {
 259         vmusage_t       *vmc_results;   /* Results from last call to */
 260                                         /* vm_getusage(). */
 261         uint64_t        vmc_nresults;   /* Count of cached results */
 262         uint64_t        vmc_refcnt;     /* refcnt for free */
 263         uint_t          vmc_flags;      /* Flags for vm_getusage() */
 264         hrtime_t        vmc_timestamp;  /* when cache was created */
 265 } vmu_cache_t;
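/*
 * Sketch of the cache age test described in the file header (assumed,
 * simplified logic; the helper name is hypothetical and not part of this
 * file): the cached results are reusable when they are younger than the
 * caller-supplied age and cover at least the requested flags.
 *
 *	static boolean_t
 *	vmu_cache_is_fresh(vmu_cache_t *cache, uint_t flags, time_t age)
 *	{
 *		return (cache != NULL &&
 *		    (cache->vmc_flags & flags) == flags &&
 *		    cache->vmc_timestamp + (hrtime_t)age * NANOSEC >
 *		    gethrtime());
 *	}
 */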
 266 
 267 /*
 268  * top-level rss/swap info for the system
 269  */
 270 typedef struct vmu_data {
 271         kmutex_t        vmu_lock;               /* Protects vmu_data */
 272         kcondvar_t      vmu_cv;                 /* Used to signal threads */
 273                                                 /* waiting for the calc */
 274                                                 /* thread to finish */
 275         vmu_entity_t    *vmu_system;            /* Entity for tracking */
 276                                                 /* rss/swap for all processes */
 277                                                 /* in all zones */
 278         mod_hash_t      *vmu_zones_hash;        /* Zones visited */
 279         mod_hash_t      *vmu_projects_col_hash; /* These *_col_hash hashes */
 280         mod_hash_t      *vmu_rusers_col_hash;   /* keep track of entities, */
 281         mod_hash_t      *vmu_eusers_col_hash;   /* ignoring zoneid, in order */
 282                                                 /* to implement VMUSAGE_COL_* */
 283                                                 /* flags, which aggregate by */
 284                                                 /* project or user regardless */
 285                                                 /* of zoneid. */
 286         mod_hash_t      *vmu_all_vnodes_hash;   /* System wide visited vnodes */
 287                                                 /* to track incore/not-incore */
 288         mod_hash_t      *vmu_all_amps_hash;     /* System wide visited shared */
 289                                                 /* amps to track incore/not- */
 290                                                 /* incore */
 291         vmu_entity_t    *vmu_entities;          /* Linked list of entities */
 292         size_t          vmu_nentities;          /* Count of entities in list */
 293         vmu_cache_t     *vmu_cache;             /* Cached results */
 294         kthread_t       *vmu_calc_thread;       /* NULL, or thread running */
 295                                                 /* vmu_calculate() */
 296         uint_t          vmu_calc_flags;         /* Flags being used by the */
 297                                                 /* currently running calc */
 298                                                 /* thread */
 299         uint_t          vmu_pending_flags;      /* Flags of vm_getusage() */
 300                                                 /* threads waiting for */
 301                                                 /* calc thread to finish */
 302         uint_t          vmu_pending_waiters;    /* Number of threads waiting */
 303                                                 /* for calc thread */
 304         vmu_bound_t     *vmu_free_bounds;
 305         vmu_object_t    *vmu_free_objects;
 306         vmu_entity_t    *vmu_free_entities;
 307         vmu_zone_t      *vmu_free_zones;
 308 } vmu_data_t;
 309 
 310 extern struct as kas;
 311 extern proc_t *practive;
 312 extern zone_t *global_zone;
 313 extern struct seg_ops segvn_ops;
 314 extern struct seg_ops segspt_shmops;
 315 
 316 static vmu_data_t vmu_data;
 317 static kmem_cache_t *vmu_bound_cache;
 318 static kmem_cache_t *vmu_object_cache;
 319 
 320 /*
 321  * Comparison routine for AVL tree. We base our comparison on vmb_start.
 322  */
 323 static int
 324 bounds_cmp(const void *bnd1, const void *bnd2)
 325 {
 326         const vmu_bound_t *bound1 = bnd1;
 327         const vmu_bound_t *bound2 = bnd2;
 328 
 329         if (bound1->vmb_start == bound2->vmb_start) {
 330                 return (0);
 331         }
 332         if (bound1->vmb_start < bound2->vmb_start) {
 333                 return (-1);
 334         }
 335 
 336         return (1);
 337 }
 338 
 339 /*
 340  * Comparison routine for our AVL tree of anon structures.
 341  */
 342 static int
 343 vmu_anon_cmp(const void *lhs, const void *rhs)
 344 {
 345         const vmu_anon_t *l = lhs, *r = rhs;
 346 
 347         if (l->vma_addr == r->vma_addr)
 348                 return (0);
 349 
 350         if (l->vma_addr < r->vma_addr)
 351                 return (-1);
 352 
 353         return (1);
 354 }
 355 
 356 /*
 357  * Save a bound on the free list.
 358  */
 359 static void
 360 vmu_free_bound(vmu_bound_t *bound)
 361 {
 362         bound->vmb_next = vmu_data.vmu_free_bounds;
 363         bound->vmb_start = 0;
 364         bound->vmb_end = 0;
 365         bound->vmb_type = 0;
 366         vmu_data.vmu_free_bounds = bound;
 367 }
 368 
 369 /*
 370  * Free an object, and all visited bound info.
 371  */
 372 static void
 373 vmu_free_object(mod_hash_val_t val)
 374 {
 375         vmu_object_t *obj = (vmu_object_t *)val;
 376         avl_tree_t *tree = &(obj->vmo_bounds);
 377         vmu_bound_t *bound;
 378         void *cookie = NULL;
 379 
 380         while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
 381                 vmu_free_bound(bound);
 382         avl_destroy(tree);
 383 
 384         obj->vmo_type = 0;
 385         obj->vmo_next = vmu_data.vmu_free_objects;
 386         vmu_data.vmu_free_objects = obj;
 387 }
 388 
 389 /*
 390  * Free an entity, and hashes of visited objects for that entity.
 391  */
 392 static void
 393 vmu_free_entity(mod_hash_val_t val)
 394 {
 395         vmu_entity_t *entity = (vmu_entity_t *)val;
 396         vmu_anon_t *anon;
 397         void *cookie = NULL;
 398 
 399         if (entity->vme_vnode_hash != NULL)
 400                 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
 401         if (entity->vme_amp_hash != NULL)
 402                 i_mod_hash_clear_nosync(entity->vme_amp_hash);
 403 
 404         while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
 405                 kmem_free(anon, sizeof (vmu_anon_t));
 406 
 407         avl_destroy(&entity->vme_anon);
 408 
 409         entity->vme_next = vmu_data.vmu_free_entities;
 410         vmu_data.vmu_free_entities = entity;
 411 }
 412 
 413 /*
 414  * Free zone entity, and all hashes of entities inside that zone,
 415  * which are projects, tasks, and users.
 416  */
 417 static void
 418 vmu_free_zone(mod_hash_val_t val)
 419 {
 420         vmu_zone_t *zone = (vmu_zone_t *)val;
 421 
 422         if (zone->vmz_zone != NULL) {
 423                 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
 424                 zone->vmz_zone = NULL;
 425         }
 426         if (zone->vmz_projects_hash != NULL)
 427                 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
 428         if (zone->vmz_tasks_hash != NULL)
 429                 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
 430         if (zone->vmz_rusers_hash != NULL)
 431                 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
 432         if (zone->vmz_eusers_hash != NULL)
 433                 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
 434         zone->vmz_next = vmu_data.vmu_free_zones;
 435         vmu_data.vmu_free_zones = zone;
 436 }
 437 
 438 /*
 439  * Initialize synchronization primitives and hashes for system-wide tracking
 440  * of visited vnodes and shared amps.  Initialize results cache.
 441  */
 442 void
 443 vm_usage_init()
 444 {
 445         mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
 446         cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
 447 
 448         vmu_data.vmu_system = NULL;
 449         vmu_data.vmu_zones_hash = NULL;
 450         vmu_data.vmu_projects_col_hash = NULL;
 451         vmu_data.vmu_rusers_col_hash = NULL;
 452         vmu_data.vmu_eusers_col_hash = NULL;
 453 
 454         vmu_data.vmu_free_bounds = NULL;
 455         vmu_data.vmu_free_objects = NULL;
 456         vmu_data.vmu_free_entities = NULL;
 457         vmu_data.vmu_free_zones = NULL;
 458 
 459         vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
 460             "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 461             sizeof (vnode_t));
 462         vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
 463             "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 464             sizeof (struct anon_map));
 465         vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
 466             "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
 467             vmu_free_entity);
 468         vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
 469             "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
 470             vmu_free_entity);
 471         vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
 472             "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
 473             vmu_free_entity);
 474         vmu_data.vmu_zones_hash = mod_hash_create_idhash(
 475             "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
 476 
 477         vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
 478             sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 479         vmu_object_cache = kmem_cache_create("vmu_object_cache",
 480             sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 481 
 482         vmu_data.vmu_entities = NULL;
 483         vmu_data.vmu_nentities = 0;
 484 
 485         vmu_data.vmu_cache = NULL;
 486         vmu_data.vmu_calc_thread = NULL;
 487         vmu_data.vmu_calc_flags = 0;
 488         vmu_data.vmu_pending_flags = 0;
 489         vmu_data.vmu_pending_waiters = 0;
 490 }
 491 
 492 /*
 493  * Allocate hashes for tracking vm objects visited for an entity.
 494  * Update list of entities.
 495  */
 496 static vmu_entity_t *
 497 vmu_alloc_entity(id_t id, int type, id_t zoneid)
 498 {
 499         vmu_entity_t *entity;
 500 
 501         if (vmu_data.vmu_free_entities != NULL) {
 502                 entity = vmu_data.vmu_free_entities;
 503                 vmu_data.vmu_free_entities =
 504                     vmu_data.vmu_free_entities->vme_next;
 505                 bzero(&entity->vme_result, sizeof (vmusage_t));
 506         } else {
 507                 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
 508         }
 509         entity->vme_result.vmu_id = id;
 510         entity->vme_result.vmu_zoneid = zoneid;
 511         entity->vme_result.vmu_type = type;
 512 
 513         if (entity->vme_vnode_hash == NULL)
 514                 entity->vme_vnode_hash = mod_hash_create_ptrhash(
 515                     "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 516                     sizeof (vnode_t));
 517 
 518         if (entity->vme_amp_hash == NULL)
 519                 entity->vme_amp_hash = mod_hash_create_ptrhash(
 520                     "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 521                     sizeof (struct anon_map));
 522 
 523         VERIFY(avl_first(&entity->vme_anon) == NULL);
 524 
 525         avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
 526             offsetof(struct vmu_anon, vma_node));
 527 
 528         entity->vme_next = vmu_data.vmu_entities;
 529         vmu_data.vmu_entities = entity;
 530         vmu_data.vmu_nentities++;
 531 
 532         return (entity);
 533 }
 534 
 535 /*
 536  * Allocate a zone entity, and hashes for tracking visited vm objects
 537  * for projects, tasks, and users within that zone.
 538  */
 539 static vmu_zone_t *
 540 vmu_alloc_zone(id_t id)
 541 {
 542         vmu_zone_t *zone;
 543 
 544         if (vmu_data.vmu_free_zones != NULL) {
 545                 zone = vmu_data.vmu_free_zones;
 546                 vmu_data.vmu_free_zones =
 547                     vmu_data.vmu_free_zones->vmz_next;
 548                 zone->vmz_next = NULL;
 549                 zone->vmz_zone = NULL;
 550         } else {
 551                 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
 552         }
 553 
 554         zone->vmz_id = id;
 555 
 556         if ((vmu_data.vmu_calc_flags &
 557             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
 558                 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 559 
 560         if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
 561             VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
 562                 zone->vmz_projects_hash = mod_hash_create_idhash(
 563                     "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 564 
 565         if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
 566             != 0 && zone->vmz_tasks_hash == NULL)
 567                 zone->vmz_tasks_hash = mod_hash_create_idhash(
 568                     "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 569 
 570         if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
 571             != 0 && zone->vmz_rusers_hash == NULL)
 572                 zone->vmz_rusers_hash = mod_hash_create_idhash(
 573                     "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 574 
 575         if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
 576             != 0 && zone->vmz_eusers_hash == NULL)
 577                 zone->vmz_eusers_hash = mod_hash_create_idhash(
 578                     "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 579 
 580         return (zone);
 581 }
 582 
 583 /*
 584  * Allocate a structure for tracking visited bounds for a vm object.
 585  */
 586 static vmu_object_t *
 587 vmu_alloc_object(caddr_t key, int type)
 588 {
 589         vmu_object_t *object;
 590 
 591         if (vmu_data.vmu_free_objects != NULL) {
 592                 object = vmu_data.vmu_free_objects;
 593                 vmu_data.vmu_free_objects =
 594                     vmu_data.vmu_free_objects->vmo_next;
 595         } else {
 596                 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
 597         }
 598 
 599         object->vmo_next = NULL;
 600         object->vmo_key = key;
 601         object->vmo_type = type;
 602         avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
 603 
 604         return (object);
 605 }
 606 
 607 /*
 608  * Allocate and return a bound structure.
 609  */
 610 static vmu_bound_t *
 611 vmu_alloc_bound()
 612 {
 613         vmu_bound_t *bound;
 614 
 615         if (vmu_data.vmu_free_bounds != NULL) {
 616                 bound = vmu_data.vmu_free_bounds;
 617                 vmu_data.vmu_free_bounds =
 618                     vmu_data.vmu_free_bounds->vmb_next;
 619         } else {
 620                 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
 621         }
 622 
 623         bound->vmb_next = NULL;
 624         bound->vmb_start = 0;
 625         bound->vmb_end = 0;
 626         bound->vmb_type = 0;
 627         return (bound);
 628 }
 629 
 630 /*
 631  * The vmu_find_insert_* functions look up an existing entry in a hash, or
 632  * allocate and insert a new one if the lookup fails.
 633  */
 634 static vmu_object_t *
 635 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
 636 {
 637         int ret;
 638         vmu_object_t *object;
 639 
 640         ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
 641             (mod_hash_val_t *)&object);
 642         if (ret != 0) {
 643                 object = vmu_alloc_object(key, type);
 644                 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
 645                     (mod_hash_val_t)object, (mod_hash_hndl_t)0);
 646                 ASSERT(ret == 0);
 647         }
 648         return (object);
 649 }
 650 
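/*
 * Record a COW anon as visited for the given entity.  Returns 0 if the anon
 * was already present, 1 if it was newly inserted.
 */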
 651 static int
 652 vmu_find_insert_anon(vmu_entity_t *entity, void *key)
 653 {
 654         vmu_anon_t anon, *ap;
 655 
 656         anon.vma_addr = (uintptr_t)key;
 657 
 658         if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
 659                 return (0);
 660 
 661         ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
 662         ap->vma_addr = (uintptr_t)key;
 663 
 664         avl_add(&entity->vme_anon, ap);
 665 
 666         return (1);
 667 }
 668 
 669 static vmu_entity_t *
 670 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
 671 {
 672         int ret;
 673         vmu_entity_t *entity;
 674 
 675         ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
 676             (mod_hash_val_t *)&entity);
 677         if (ret != 0) {
 678                 entity = vmu_alloc_entity(id, type, zoneid);
 679                 ret = i_mod_hash_insert_nosync(hash,
 680                     (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
 681                     (mod_hash_hndl_t)0);
 682                 ASSERT(ret == 0);
 683         }
 684         return (entity);
 685 }
 686 
 687 
 688 
 689 
 690 /*
 691  * Returns the list of object bounds between start and end.  New bounds
 692  * inserted by this call are given the specified type.
 693  *
 694  * Returns the number of pages covered by newly created bounds.  Returns 0
 695  * if the region between start and end was already fully covered.
 696  */
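/*
 * Worked example (offsets are illustrative): suppose the tree already holds
 * a bound [5, 10] of type INCORE and the caller asks for [0, 20] with type
 * UNKNOWN.  Two new bounds are created, [0, 4] and [11, 20], both UNKNOWN;
 * *first points at [0, 4], *last at [11, 20], and 15 (the number of newly
 * covered pages) is returned.  A second call for [0, 20] creates nothing
 * and returns 0.
 */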
 697 static pgcnt_t
 698 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
 699     end, char type, vmu_bound_t **first, vmu_bound_t **last)
 700 {
 701         avl_tree_t      *tree = &(ro->vmo_bounds);
 702         avl_index_t     where;
 703         vmu_bound_t     *walker, *tmp;
 704         pgcnt_t         ret = 0;
 705 
 706         ASSERT(start <= end);
 707 
 708         *first = *last = NULL;
 709 
 710         tmp = vmu_alloc_bound();
 711         tmp->vmb_start = start;
 712         tmp->vmb_type = type;
 713 
 714         /* Hopelessly optimistic case. */
 715         if ((walker = avl_find(tree, tmp, &where)) != NULL) {
 716                 /* We got lucky. */
 717                 vmu_free_bound(tmp);
 718                 *first = walker;
 719         }
 720 
 721         if (walker == NULL) {
 722                 /* Is start in the previous node? */
 723                 walker = avl_nearest(tree, where, AVL_BEFORE);
 724                 if (walker != NULL) {
 725                         if (ISWITHIN(walker, start)) {
 726                                 /* We found start. */
 727                                 vmu_free_bound(tmp);
 728                                 *first = walker;
 729                         }
 730                 }
 731         }
 732 
 733         /*
 734          * At this point, if *first is still NULL, then we
 735          * didn't get a direct hit and start isn't covered
 736          * by the previous node. We know that the next node
 737          * must have a greater start value than we require
 738          * because avl_find tells us where the AVL routines would
 739          * insert our new node. We have some gap between the
 740          * start we want and the next node.
 741          */
 742         if (*first == NULL) {
 743                 walker = avl_nearest(tree, where, AVL_AFTER);
 744                 if (walker != NULL && walker->vmb_start <= end) {
 745                         /* Fill the gap. */
 746                         tmp->vmb_end = walker->vmb_start - 1;
 747                         *first = tmp;
 748                 } else {
 749                         /* We have a gap over [start, end]. */
 750                         tmp->vmb_end = end;
 751                         *first = *last = tmp;
 752                 }
 753                 ret += tmp->vmb_end - tmp->vmb_start + 1;
 754                 avl_insert(tree, tmp, where);
 755         }
 756 
 757         ASSERT(*first != NULL);
 758 
 759         if (*last != NULL) {
 760                 /* We're done. */
 761                 return (ret);
 762         }
 763 
 764         /*
 765          * If we are here we still need to set *last and
 766          * that may involve filling in some gaps.
 767          */
 768         *last = *first;
 769         for (;;) {
 770                 if (ISWITHIN(*last, end)) {
 771                         /* We're done. */
 772                         break;
 773                 }
 774                 walker = AVL_NEXT(tree, *last);
 775                 if (walker == NULL || walker->vmb_start > end) {
 776                         /* Bottom or mid tree with gap. */
 777                         tmp = vmu_alloc_bound();
 778                         tmp->vmb_start = (*last)->vmb_end + 1;
 779                         tmp->vmb_end = end;
 780                         tmp->vmb_type = type;
 781                         ret += tmp->vmb_end - tmp->vmb_start + 1;
 782                         avl_insert_here(tree, tmp, *last, AVL_AFTER);
 783                         *last = tmp;
 784                         break;
 785                 } else {
 786                         if ((*last)->vmb_end + 1 != walker->vmb_start) {
 787                                 /* Non-contiguous. */
 788                                 tmp = vmu_alloc_bound();
 789                                 tmp->vmb_start = (*last)->vmb_end + 1;
 790                                 tmp->vmb_end = walker->vmb_start - 1;
 791                                 tmp->vmb_type = type;
 792                                 ret += tmp->vmb_end - tmp->vmb_start + 1;
 793                                 avl_insert_here(tree, tmp, *last, AVL_AFTER);
 794                                 *last = tmp;
 795                         } else {
 796                                 *last = walker;
 797                         }
 798                 }
 799         }
 800 
 801         return (ret);
 802 }
 803 
 804 /*
 805  * vmu_update_bounds()
 806  *
 807  * tree: avl_tree in which first and last hang.
 808  *
 809  * first, last: list of contiguous bounds, of which zero or more are of
 810  *              type VMUSAGE_BOUND_UNKNOWN.
 811  *
 812  * new_tree: avl_tree in which new_first and new_last hang.
 813  *
 814  * new_first, new_last: list of contiguous bounds, of which none are of
 815  *                      type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 816  *                      update the types of bounds in (first,last) with
 817  *                      type VMUSAGE_BOUND_UNKNOWN.
 818  *
 819  * For the list of bounds (first,last), this function updates any bounds
 820  * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
 821  * the list (new_first, new_last).
 822  *
 823  * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 824  * (new_first, new_last), it will be split into multiple bounds.
 825  *
 826  * Return value:
 827  *      The number of pages in the list of bounds (first,last) that were of
 828  *      type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 829  *      VMUSAGE_BOUND_INCORE.
 830  *
 831  */
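/*
 * Worked example (offsets are illustrative): if the entity's list is the
 * single bound [0, 9] UNKNOWN, and the object's global list holds [0, 3]
 * INCORE followed by [4, 9] NOT_INCORE, then the entity bound is split into
 * [0, 3] INCORE and [4, 9] NOT_INCORE, *last is updated to point at the new
 * [4, 9] bound, and 4 is returned (only pages that became INCORE count).
 */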
 832 static pgcnt_t
 833 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
 834     avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
 835 {
 836         vmu_bound_t *next, *new_next, *tmp;
 837         pgcnt_t rss = 0;
 838 
 839         next = *first;
 840         new_next = new_first;
 841 
 842         /*
 843          * Verify first and last bound are covered by new bounds if they
 844          * have unknown type.
 845          */
 846         ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
 847             (*first)->vmb_start >= new_first->vmb_start);
 848         ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
 849             (*last)->vmb_end <= new_last->vmb_end);
 850         for (;;) {
 851                 /* If bound already has type, proceed to next bound. */
 852                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 853                         if (next == *last)
 854                                 break;
 855                         next = AVL_NEXT(tree, next);
 856                         continue;
 857                 }
 858                 while (new_next->vmb_end < next->vmb_start)
 859                         new_next = AVL_NEXT(new_tree, new_next);
 860                 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
 861                 next->vmb_type = new_next->vmb_type;
 862                 if (new_next->vmb_end < next->vmb_end) {
 863                         /* need to split bound */
 864                         tmp = vmu_alloc_bound();
 865                         tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
 866                         tmp->vmb_start = new_next->vmb_end + 1;
 867                         tmp->vmb_end = next->vmb_end;
 868                         avl_insert_here(tree, tmp, next, AVL_AFTER);
 869                         next->vmb_end = new_next->vmb_end;
 870                         if (*last == next)
 871                                 *last = tmp;
 872                         if (next->vmb_type == VMUSAGE_BOUND_INCORE)
 873                                 rss += next->vmb_end - next->vmb_start + 1;
 874                         next = tmp;
 875                 } else {
 876                         if (next->vmb_type == VMUSAGE_BOUND_INCORE)
 877                                 rss += next->vmb_end - next->vmb_start + 1;
 878                         if (next == *last)
 879                                 break;
 880                         next = AVL_NEXT(tree, next);
 881                 }
 882         }
 883         return (rss);
 884 }
 885 
 886 /*
 887  * Merges adjacent bounds with same type between first and last bound.
 888  * After merge, last pointer may point to a different bound, as (incoming)
 889  * last bound may have been merged away.
 890  */
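/*
 * For example (offsets are illustrative), the bounds [0, 3] INCORE,
 * [4, 9] INCORE, [10, 12] NOT_INCORE collapse to [0, 9] INCORE,
 * [10, 12] NOT_INCORE; if *last pointed at the absorbed [4, 9] bound, it is
 * updated to point at the merged [0, 9] bound.
 */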
 891 static void
 892 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
 893 {
 894         vmu_bound_t *current;
 895         vmu_bound_t *next;
 896 
 897         ASSERT(tree != NULL);
 898         ASSERT(*first != NULL);
 899         ASSERT(*last != NULL);
 900 
 901         current = *first;
 902         while (current != *last) {
 903                 next = AVL_NEXT(tree, current);
 904                 if ((current->vmb_end + 1) == next->vmb_start &&
 905                     current->vmb_type == next->vmb_type) {
 906                         current->vmb_end = next->vmb_end;
 907                         avl_remove(tree, next);
 908                         vmu_free_bound(next);
 909                         if (next == *last) {
 910                                 *last = current;
 911                         }
 912                 } else {
 913                         current = AVL_NEXT(tree, current);
 914                 }
 915         }
 916 }
 917 
 918 /*
 919  * Given an amp and a list of bounds, updates each bound's type with
 920  * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 921  *
 922  * If a bound is partially incore, it will be split into two bounds.
 923  * first and last may be modified, as bounds may be split into multiple
 924  * bounds if they are partially incore/not-incore.
 925  *
 926  * Set incore to B_TRUE if the bounds are already known to be incore.
 927  *
 928  */
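/*
 * Worked example of the large-page skip below (assuming 4K base pages and a
 * 2M large page, i.e. pgcnt = 512 and pgmsk = 511): if index = 1000 lands in
 * a large page covering anon indices 512-1023, the remaining slots of that
 * large page need not be examined individually, so index advances to
 * (1000 & ~511) + 512 = 1024.
 */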
 929 static void
 930 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
 931     vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
 932 {
 933         vmu_bound_t *next;
 934         vmu_bound_t *tmp;
 935         pgcnt_t index;
 936         short bound_type;
 937         short page_type;
 938         vnode_t *vn;
 939         anoff_t off;
 940         struct anon *ap;
 941 
 942         next = *first;
 943         /* Shared anon slots don't change once set. */
 944         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 945         for (;;) {
 946                 if (incore == B_TRUE)
 947                         next->vmb_type = VMUSAGE_BOUND_INCORE;
 948 
 949                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 950                         if (next == *last)
 951                                 break;
 952                         next = AVL_NEXT(tree, next);
 953                         continue;
 954                 }
 955 
 956                 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
 957                 bound_type = next->vmb_type;
 958                 index = next->vmb_start;
 959                 while (index <= next->vmb_end) {
 960 
 961                         /*
 962                          * These are used to determine how much to increment
 963                          * index when a large page is found.
 964                          */
 965                         page_t *page;
 966                         pgcnt_t pgcnt = 1;
 967                         uint_t pgshft;
 968                         pgcnt_t pgmsk;
 969 
 970                         ap = anon_get_ptr(amp->ahp, index);
 971                         if (ap != NULL)
 972                                 swap_xlate(ap, &vn, &off);
 973 
 974                         if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
 975                             (page = page_exists(vn, off)) != NULL) {
 976                                 if (PP_ISFREE(page))
 977                                         page_type = VMUSAGE_BOUND_NOT_INCORE;
 978                                 else
 979                                         page_type = VMUSAGE_BOUND_INCORE;
 980                                 if (page->p_szc > 0) {
 981                                         pgcnt = page_get_pagecnt(page->p_szc);
 982                                         pgshft = page_get_shift(page->p_szc);
 983                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
 984                                             - 1;
 985                                 }
 986                         } else {
 987                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
 988                         }
 989 
 990                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 991                                 next->vmb_type = page_type;
 992                                 bound_type = page_type;
 993                         } else if (next->vmb_type != page_type) {
 994                                 /*
 995                                  * If current bound type does not match page
 996                                  * type, need to split off new bound.
 997                                  */
 998                                 tmp = vmu_alloc_bound();
 999                                 tmp->vmb_type = page_type;
1000                                 tmp->vmb_start = index;
1001                                 tmp->vmb_end = next->vmb_end;
1002                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
1003                                 next->vmb_end = index - 1;
1004                                 if (*last == next)
1005                                         *last = tmp;
1006                                 next = tmp;
1007                         }
1008                         if (pgcnt > 1) {
1009                                 /*
1010                                  * If inside large page, jump to next large
1011                                  * page
1012                                  */
1013                                 index = (index & ~pgmsk) + pgcnt;
1014                         } else {
1015                                 index++;
1016                         }
1017                 }
1018                 if (next == *last) {
1019                         ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1020                         break;
1021                 } else
1022                         next = AVL_NEXT(tree, next);
1023         }
1024         ANON_LOCK_EXIT(&amp->a_rwlock);
1025 }
1026 
1027 /*
1028  * Same as vmu_amp_update_incore_bounds(), except for tracking
1029  * incore-/not-incore for vnodes.
1030  */
1031 static void
1032 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
1033     vmu_bound_t **first, vmu_bound_t **last)
1034 {
1035         vmu_bound_t *next;
1036         vmu_bound_t *tmp;
1037         pgcnt_t index;
1038         short bound_type;
1039         short page_type;
1040 
1041         next = *first;
1042         for (;;) {
1043                 if (vnode->v_pages == NULL)
1044                         next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1045 
1046                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1047                         if (next == *last)
1048                                 break;
1049                         next = AVL_NEXT(tree, next);
1050                         continue;
1051                 }
1052 
1053                 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1054                 bound_type = next->vmb_type;
1055                 index = next->vmb_start;
1056                 while (index <= next->vmb_end) {
1057 
1058                         /*
1059                          * These are used to determine how much to increment
1060                          * index when a large page is found.
1061                          */
1062                         page_t *page;
1063                         pgcnt_t pgcnt = 1;
1064                         uint_t pgshft;
1065                         pgcnt_t pgmsk;
1066 
1067                         if (vnode->v_pages != NULL &&
1068                             (page = page_exists(vnode, ptob(index))) != NULL) {
1069                                 if (PP_ISFREE(page))
1070                                         page_type = VMUSAGE_BOUND_NOT_INCORE;
1071                                 else
1072                                         page_type = VMUSAGE_BOUND_INCORE;
1073                                 if (page->p_szc > 0) {
1074                                         pgcnt = page_get_pagecnt(page->p_szc);
1075                                         pgshft = page_get_shift(page->p_szc);
1076                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
1077                                             - 1;
1078                                 }
1079                         } else {
1080                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
1081                         }
1082 
1083                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1084                                 next->vmb_type = page_type;
1085                                 bound_type = page_type;
1086                         } else if (next->vmb_type != page_type) {
1087                                 /*
1088                                  * If current bound type does not match page
1089                                  * type, need to split off new bound.
1090                                  */
1091                                 tmp = vmu_alloc_bound();
1092                                 tmp->vmb_type = page_type;
1093                                 tmp->vmb_start = index;
1094                                 tmp->vmb_end = next->vmb_end;
1095                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
1096                                 next->vmb_end = index - 1;
1097                                 if (*last == next)
1098                                         *last = tmp;
1099                                 next = tmp;
1100                         }
1101                         if (pgcnt > 1) {
1102                                 /*
1103                                  * If inside large page, jump to next large
1104                                  * page
1105                                  */
1106                                 index = (index & ~pgmsk) + pgcnt;
1107                         } else {
1108                                 index++;
1109                         }
1110                 }
1111                 if (next == *last) {
1112                         ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1113                         break;
1114                 } else
1115                         next = AVL_NEXT(tree, next);
1116         }
1117 }
1118 
1119 /*
1120  * Calculate the rss and swap consumed by a segment.  vmu_entities is the
1121  * list of entities to visit.  For shared segments, the vnode or amp
1122  * is looked up in each entity to see if it has been already counted.  Private
1123  * anon pages are checked per entity to ensure that COW pages are not
1124  * double counted.
1125  *
1126  * For private mapped files, first the amp is checked for private pages.
1127  * Bounds not backed by the amp are looked up in the vnode for each entity
1128  * to avoid double counting of private COW vnode pages.
1129  */
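/*
 * For example, when two processes in the same project map the same shared
 * vnode pages, the first process visited inserts that vnode's bounds into
 * the project entity's vme_vnode_hash; when the second process is visited,
 * the lookup finds those bounds already present, no new pages are reported,
 * and the project's rss/swap are not charged a second time.
 */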
1130 static void
1131 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1132 {
1133         struct segvn_data *svd;
1134         struct shm_data *shmd;
1135         struct spt_data *sptd;
1136         vmu_object_t *shared_object = NULL;
1137         vmu_object_t *entity_object = NULL;
1138         vmu_entity_t *entity;
1139         vmusage_t *result;
1140         vmu_bound_t *first = NULL;
1141         vmu_bound_t *last = NULL;
1142         vmu_bound_t *cur = NULL;
1143         vmu_bound_t *e_first = NULL;
1144         vmu_bound_t *e_last = NULL;
1145         vmu_bound_t *tmp;
1146         pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1147         struct anon_map *private_amp = NULL;
1148         boolean_t incore = B_FALSE;
1149         boolean_t shared = B_FALSE;
1150         int file = 0;
1151         pgcnt_t swresv = 0;
1152         pgcnt_t panon = 0;
1153 
1154         /* Can zero-length segments exist?  Not sure, so paranoia. */
1155         if (seg->s_size <= 0)
1156                 return;
1157 
1158         /*
1159          * Figure out if there is a shared object (such as a named vnode or
1160          * a shared amp), then figure out if there is a private amp, which
1161          * identifies private pages.
1162          */
1163         if (seg->s_ops == &segvn_ops) {
1164                 svd = (struct segvn_data *)seg->s_data;
1165                 if (svd->type == MAP_SHARED) {
1166                         shared = B_TRUE;
1167                 } else {
1168                         swresv = svd->swresv;
1169 
1170                         if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1171                             RW_READER) != 0) {
1172                                 /*
1173                                  * Text replication anon maps can be shared
1174                                  * across all zones. Space used for text
1175                                  * replication is typically capped as a small %
1176                                  * of memory.  To keep it simple for now we
1177                                  * don't account for swap and memory space used
1178                                  * for text replication.
1179                                  */
1180                                 if (svd->tr_state == SEGVN_TR_OFF &&
1181                                     svd->amp != NULL) {
1182                                         private_amp = svd->amp;
1183                                         p_start = svd->anon_index;
1184                                         p_end = svd->anon_index +
1185                                             btop(seg->s_size) - 1;
1186                                 }
1187                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1188                         }
1189                 }
1190                 if (svd->vp != NULL) {
1191                         file = 1;
1192                         shared_object = vmu_find_insert_object(
1193                             vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1194                             VMUSAGE_TYPE_VNODE);
1195                         s_start = btop(svd->offset);
1196                         s_end = btop(svd->offset + seg->s_size) - 1;
1197                 }
1198                 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1199                         ASSERT(shared_object == NULL);
1200                         shared_object = vmu_find_insert_object(
1201                             vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1202                             VMUSAGE_TYPE_AMP);
1203                         s_start = svd->anon_index;
1204                         s_end = svd->anon_index + btop(seg->s_size) - 1;
1205                         /* schedctl mappings are always in core */
1206                         if (svd->amp->swresv == 0)
1207                                 incore = B_TRUE;
1208                 }
1209         } else if (seg->s_ops == &segspt_shmops) {
1210                 shared = B_TRUE;
1211                 shmd = (struct shm_data *)seg->s_data;
1212                 shared_object = vmu_find_insert_object(
1213                     vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1214                     VMUSAGE_TYPE_AMP);
1215                 s_start = 0;
1216                 s_end = btop(seg->s_size) - 1;
1217                 sptd = shmd->shm_sptseg->s_data;
1218 
1219                 /* ism segments are always incore and do not reserve swap */
1220                 if (sptd->spt_flags & SHM_SHARE_MMU)
1221                         incore = B_TRUE;
1222 
1223         } else {
1224                 return;
1225         }
1226 
1227         /*
1228          * If there is a private amp, count anon pages that exist.  If an
1229          * anon has a refcnt > 1 (COW sharing), then save the anon in a
1230          * hash so that it is not double counted.
1231          *
1232          * If there is also a shared object, then figure out the bounds
1233          * which are not mapped by the private amp.
1234          */
1235         if (private_amp != NULL) {
1236 
1237                 /* Enter as writer to prevent COW anons from being freed */
1238                 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1239 
1240                 p_index = p_start;
1241                 s_index = s_start;
1242 
1243                 while (p_index <= p_end) {
1244 
1245                         pgcnt_t p_index_next;
1246                         pgcnt_t p_bound_size;
1247                         int cnt;
1248                         anoff_t off;
1249                         struct vnode *vn;
1250                         struct anon *ap;
1251                         page_t *page;           /* For handling of large */
1252                         pgcnt_t pgcnt = 1;      /* pages */
1253                         pgcnt_t pgstart;
1254                         pgcnt_t pgend;
1255                         uint_t pgshft;
1256                         pgcnt_t pgmsk;
1257 
1258                         p_index_next = p_index;
1259                         ap = anon_get_next_ptr(private_amp->ahp,
1260                             &p_index_next);
1261 
1262                         /*
1263                          * If next anon is past end of mapping, simulate
1264                          * end of anon so loop terminates.
1265                          */
1266                         if (p_index_next > p_end) {
1267                                 p_index_next = p_end + 1;
1268                                 ap = NULL;
1269                         }
1270                         /*
1271                          * For COW segments, keep track of bounds not
1272                          * backed by private amp so they can be looked
1273                          * up in the backing vnode
1274                          */
1275                         if (p_index_next != p_index) {
1276 
1277                                 /*
1278                                  * Compute index difference between anon and
1279                                  * previous anon.
1280                                  */
1281                                 p_bound_size = p_index_next - p_index - 1;
1282 
1283                                 if (shared_object != NULL) {
1284                                         cur = vmu_alloc_bound();
1285                                         cur->vmb_start = s_index;
1286                                         cur->vmb_end = s_index + p_bound_size;
1287                                         cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1288                                         if (first == NULL) {
1289                                                 first = cur;
1290                                                 last = cur;
1291                                         } else {
1292                                                 last->vmb_next = cur;
1293                                                 last = cur;
1294                                         }
1295                                 }
1296                                 p_index = p_index + p_bound_size + 1;
1297                                 s_index = s_index + p_bound_size + 1;
1298                         }
1299 
1300                         /* Detect end of anons in amp */
1301                         if (ap == NULL)
1302                                 break;
1303 
1304                         cnt = ap->an_refcnt;
1305                         swap_xlate(ap, &vn, &off);
1306 
1307                         if (vn == NULL || vn->v_pages == NULL ||
1308                             (page = page_exists(vn, off)) == NULL) {
1309                                 p_index++;
1310                                 s_index++;
1311                                 continue;
1312                         }
1313 
1314                         /*
1315                          * If large page is found, compute portion of large
1316                          * page in mapping, and increment indices to the next
1317                          * large page.
1318                          */
1319                         if (page->p_szc > 0) {
1320 
1321                                 pgcnt = page_get_pagecnt(page->p_szc);
1322                                 pgshft = page_get_shift(page->p_szc);
1323                                 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1324 
1325                                 /* First page in large page */
1326                                 pgstart = p_index & ~pgmsk;
1327                                 /* Last page in large page */
1328                                 pgend = pgstart + pgcnt - 1;
1329                                 /*
1330                                  * Artificially end page if page extends past
1331                                  * end of mapping.
1332                                  */
1333                                 if (pgend > p_end)
1334                                         pgend = p_end;
1335 
1336                                 /*
1337                                  * Compute number of pages from large page
1338                                  * which are mapped.
1339                                  */
1340                                 pgcnt = pgend - p_index + 1;
1341 
1342                                 /*
1343                                  * Point indices at page after large page,
1344                                  * or at page after end of mapping.
1345                                  */
1346                                 p_index += pgcnt;
1347                                 s_index += pgcnt;
1348                         } else {
1349                                 p_index++;
1350                                 s_index++;
1351                         }
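                             /*
                              * Worked example for the arithmetic above, assuming a
                              * 4K base page and a 2M large page: pgcnt == 512,
                              * pgshft == 21, pgmsk == 0x1ff.  With p_index == 515
                              * and p_end == 2000, pgstart == 512, pgend == 1023,
                              * and pgcnt becomes 1023 - 515 + 1 == 509; p_index
                              * then advances to 1024, the start of the next
                              * large page.
                              */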
1352 
1353                         /*
1354                          * Pages on the free list aren't counted for the rss.
1355                          */
1356                         if (PP_ISFREE(page))
1357                                 continue;
1358 
1359                         /*
1360                          * Assume anon structs with a refcnt
1361                          * of 1 are not COW shared, so there
1362                          * is no reason to track them per entity.
1363                          */
1364                         if (cnt == 1) {
1365                                 panon += pgcnt;
1366                                 continue;
1367                         }
1368                         for (entity = vmu_entities; entity != NULL;
1369                             entity = entity->vme_next_calc) {
1370 
1371                                 result = &entity->vme_result;
1372                                 /*
1373                                  * Track COW anons per entity so
1374                                  * they are not double counted.
1375                                  */
1376                                 if (vmu_find_insert_anon(entity, ap) == 0)
1377                                         continue;
1378 
1379                                 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1380                                 result->vmu_rss_private +=
1381                                     (pgcnt << PAGESHIFT);
1382                         }
1383                 }
1384                 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1385         }
1386 
1387         /* Add up resident anon and swap reserved for private mappings */
1388         if (swresv > 0 || panon > 0) {
1389                 for (entity = vmu_entities; entity != NULL;
1390                     entity = entity->vme_next_calc) {
1391                         result = &entity->vme_result;
1392                         result->vmu_swap_all += swresv;
1393                         result->vmu_swap_private += swresv;
1394                         result->vmu_rss_all += (panon << PAGESHIFT);
1395                         result->vmu_rss_private += (panon << PAGESHIFT);
1396                 }
1397         }
1398 
1399         /* Compute resident pages backing shared amp or named vnode */
1400         if (shared_object != NULL) {
1401                 avl_tree_t *tree = &(shared_object->vmo_bounds);
1402 
1403                 if (first == NULL) {
1404                         /*
1405                          * No private amp, or private amp has no anon
1406                          * structs.  This means the entire segment is backed by
1407                          * the shared object.
1408                          */
1409                         first = vmu_alloc_bound();
1410                         first->vmb_start = s_start;
1411                         first->vmb_end = s_end;
1412                         first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1413                 }
1414                 /*
1415                  * Iterate bounds not backed by private amp, and compute
1416                  * resident pages.
1417                  */
1418                 cur = first;
1419                 while (cur != NULL) {
1420 
1421                         if (vmu_insert_lookup_object_bounds(shared_object,
1422                             cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1423                             &first, &last) > 0) {
1424                                 /* new bounds, find incore/not-incore */
1425                                 if (shared_object->vmo_type ==
1426                                     VMUSAGE_TYPE_VNODE) {
1427                                         vmu_vnode_update_incore_bounds(
1428                                             tree,
1429                                             (vnode_t *)
1430                                             shared_object->vmo_key, &first,
1431                                             &last);
1432                                 } else {
1433                                         vmu_amp_update_incore_bounds(
1434                                             tree,
1435                                             (struct anon_map *)
1436                                             shared_object->vmo_key, &first,
1437                                             &last, incore);
1438                                 }
1439                                 vmu_merge_bounds(tree, &first, &last);
1440                         }
1441                         for (entity = vmu_entities; entity != NULL;
1442                             entity = entity->vme_next_calc) {
1443                                 avl_tree_t *e_tree;
1444 
1445                                 result = &entity->vme_result;
1446 
1447                                 entity_object = vmu_find_insert_object(
1448                                     shared_object->vmo_type ==
1449                                     VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1450                                     entity->vme_amp_hash,
1451                                     shared_object->vmo_key,
1452                                     shared_object->vmo_type);
1453 
1454                                 virt = vmu_insert_lookup_object_bounds(
1455                                     entity_object, cur->vmb_start, cur->vmb_end,
1456                                     VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1457 
1458                                 if (virt == 0)
1459                                         continue;
1460                                 /*
1461                                  * Range visited for this entity
1462                                  */
1463                                 e_tree = &(entity_object->vmo_bounds);
1464                                 rss = vmu_update_bounds(e_tree, &e_first,
1465                                     &e_last, tree, first, last);
1466                                 result->vmu_rss_all += (rss << PAGESHIFT);
1467                                 if (shared == B_TRUE && file == B_FALSE) {
1468                                         /* shared anon mapping */
1469                                         result->vmu_swap_all +=
1470                                             (virt << PAGESHIFT);
1471                                         result->vmu_swap_shared +=
1472                                             (virt << PAGESHIFT);
1473                                         result->vmu_rss_shared +=
1474                                             (rss << PAGESHIFT);
1475                                 } else if (shared == B_TRUE && file == B_TRUE) {
1476                                         /* shared file mapping */
1477                                         result->vmu_rss_shared +=
1478                                             (rss << PAGESHIFT);
1479                                 } else if (shared == B_FALSE &&
1480                                     file == B_TRUE) {
1481                                         /* private file mapping */
1482                                         result->vmu_rss_private +=
1483                                             (rss << PAGESHIFT);
1484                                 }
1485                                 vmu_merge_bounds(e_tree, &e_first, &e_last);
1486                         }
1487                         tmp = cur;
1488                         cur = cur->vmb_next;
1489                         vmu_free_bound(tmp);
1490                 }
1491         }
1492 }
1493 
1494 /*
1495  * Based on the current calculation flags, find the entities that are
1496  * relevant to the process.  Then calculate each segment in the
1497  * process's address space for each relevant entity.
1498  */
1499 static void
1500 vmu_calculate_proc(proc_t *p)
1501 {
1502         vmu_entity_t *entities = NULL;
1503         vmu_zone_t *zone;
1504         vmu_entity_t *tmp;
1505         struct as *as;
1506         struct seg *seg;
1507         int ret;
1508 
1509         /* Figure out which entities are being computed */
1510         if (vmu_data.vmu_system != NULL) {
1511                 tmp = vmu_data.vmu_system;
1512                 tmp->vme_next_calc = entities;
1513                 entities = tmp;
1514         }
1515         if (vmu_data.vmu_calc_flags &
1516             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1517             VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1518             VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1519             VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1520             VMUSAGE_ALL_EUSERS)) {
1521                 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1522                     (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1523                     (mod_hash_val_t *)&zone);
1524                 if (ret != 0) {
1525                         zone = vmu_alloc_zone(p->p_zone->zone_id);
1526                         ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1527                             (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1528                             (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1529                         ASSERT(ret == 0);
1530                 }
1531                 if (zone->vmz_zone != NULL) {
1532                         tmp = zone->vmz_zone;
1533                         tmp->vme_next_calc = entities;
1534                         entities = tmp;
1535                 }
1536                 if (vmu_data.vmu_calc_flags &
1537                     (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1538                         tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1539                             p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1540                             zone->vmz_id);
1541                         tmp->vme_next_calc = entities;
1542                         entities = tmp;
1543                 }
1544                 if (vmu_data.vmu_calc_flags &
1545                     (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1546                         tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1547                             p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1548                         tmp->vme_next_calc = entities;
1549                         entities = tmp;
1550                 }
1551                 if (vmu_data.vmu_calc_flags &
1552                     (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1553                         tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1554                             crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1555                         tmp->vme_next_calc = entities;
1556                         entities = tmp;
1557                 }
1558                 if (vmu_data.vmu_calc_flags &
1559                     (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1560                         tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1561                             crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1562                         tmp->vme_next_calc = entities;
1563                         entities = tmp;
1564                 }
1565         }
1566         /* Entities which collapse projects and users for all zones */
1567         if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1568                 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1569                     p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1570                 tmp->vme_next_calc = entities;
1571                 entities = tmp;
1572         }
1573         if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1574                 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1575                     crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1576                 tmp->vme_next_calc = entities;
1577                 entities = tmp;
1578         }
1579         if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1580                 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1581                     crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1582                 tmp->vme_next_calc = entities;
1583                 entities = tmp;
1584         }
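             /*
              * At this point "entities" links, via vme_next_calc, one node
              * per entity that this process will be counted against.  For
              * example, assuming VMUSAGE_ALL_ZONES | VMUSAGE_ALL_PROJECTS
              * was requested, a process in zone 3, project 10 contributes
              * to exactly two entities: the zone 3 entity and the project
              * 10 entity within zone 3.
              */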
1585 
1586         ASSERT(entities != NULL);
1587         /* process all segs in process's address space */
1588         as = p->p_as;
1589         AS_LOCK_ENTER(as, RW_READER);
1590         for (seg = AS_SEGFIRST(as); seg != NULL;
1591             seg = AS_SEGNEXT(as, seg)) {
1592                 vmu_calculate_seg(entities, seg);
1593         }
1594         AS_LOCK_EXIT(as);
1595 }
1596 
1597 /*
1598  * Free data created by previous call to vmu_calculate().
1599  */
1600 static void
1601 vmu_clear_calc()
1602 {
1603         if (vmu_data.vmu_system != NULL)
1604                 vmu_free_entity(vmu_data.vmu_system);
1605         vmu_data.vmu_system = NULL;
1606         if (vmu_data.vmu_zones_hash != NULL)
1607                 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1608         if (vmu_data.vmu_projects_col_hash != NULL)
1609                 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1610         if (vmu_data.vmu_rusers_col_hash != NULL)
1611                 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1612         if (vmu_data.vmu_eusers_col_hash != NULL)
1613                 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1614 
1615         i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1616         i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1617 }
1618 
1619 /*
1620  * Free unused data structures.  These can result if the system workload
1621  * decreases between calculations.
1622  */
1623 static void
1624 vmu_free_extra()
1625 {
1626         vmu_bound_t *tb;
1627         vmu_object_t *to;
1628         vmu_entity_t *te;
1629         vmu_zone_t *tz;
1630 
1631         while (vmu_data.vmu_free_bounds != NULL) {
1632                 tb = vmu_data.vmu_free_bounds;
1633                 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1634                 kmem_cache_free(vmu_bound_cache, tb);
1635         }
1636         while (vmu_data.vmu_free_objects != NULL) {
1637                 to = vmu_data.vmu_free_objects;
1638                 vmu_data.vmu_free_objects =
1639                     vmu_data.vmu_free_objects->vmo_next;
1640                 kmem_cache_free(vmu_object_cache, to);
1641         }
1642         while (vmu_data.vmu_free_entities != NULL) {
1643                 te = vmu_data.vmu_free_entities;
1644                 vmu_data.vmu_free_entities =
1645                     vmu_data.vmu_free_entities->vme_next;
1646                 if (te->vme_vnode_hash != NULL)
1647                         mod_hash_destroy_hash(te->vme_vnode_hash);
1648                 if (te->vme_amp_hash != NULL)
1649                         mod_hash_destroy_hash(te->vme_amp_hash);
1650                 VERIFY(avl_first(&te->vme_anon) == NULL);
1651                 kmem_free(te, sizeof (vmu_entity_t));
1652         }
1653         while (vmu_data.vmu_free_zones != NULL) {
1654                 tz = vmu_data.vmu_free_zones;
1655                 vmu_data.vmu_free_zones =
1656                     vmu_data.vmu_free_zones->vmz_next;
1657                 if (tz->vmz_projects_hash != NULL)
1658                         mod_hash_destroy_hash(tz->vmz_projects_hash);
1659                 if (tz->vmz_tasks_hash != NULL)
1660                         mod_hash_destroy_hash(tz->vmz_tasks_hash);
1661                 if (tz->vmz_rusers_hash != NULL)
1662                         mod_hash_destroy_hash(tz->vmz_rusers_hash);
1663                 if (tz->vmz_eusers_hash != NULL)
1664                         mod_hash_destroy_hash(tz->vmz_eusers_hash);
1665                 kmem_free(tz, sizeof (vmu_zone_t));
1666         }
1667 }
1668 
1669 extern kcondvar_t *pr_pid_cv;
1670 
1671 /*
1672  * Determine which entity types are relevant and allocate the hashes to
1673  * track them.  Then walk the process table and count rss and swap
1674  * for each process's address space.  Address space objects such as
1675  * vnodes, amps and anons are tracked per entity, so that they are
1676  * not double counted in the results.
1677  *
1678  */
1679 static void
1680 vmu_calculate()
1681 {
1682         int i = 0;
1683         int ret;
1684         proc_t *p;
1685 
1686         vmu_clear_calc();
1687 
1688         if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1689                 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1690                     ALL_ZONES);
1691 
1692         /*
1693          * Walk process table and calculate rss of each proc.
1694          *
1695          * Pidlock and p_lock cannot be held while doing the rss calculation.
1696          * This is because:
1697          *      1.  The calculation allocates using KM_SLEEP.
1698          *      2.  The calculation grabs a_lock, which cannot be grabbed
1699          *          after p_lock.
1700          *
1701          * Since pidlock must be dropped, we cannot simply walk the
1702          * practive list.  Instead, we walk the process table, and sprlock
1703          * each process to ensure that it does not exit during the
1704          * calculation.
1705          */
1706 
1707         mutex_enter(&pidlock);
1708         for (i = 0; i < v.v_proc; i++) {
1709 again:
1710                 p = pid_entry(i);
1711                 if (p == NULL)
1712                         continue;
1713 
1714                 mutex_enter(&p->p_lock);
1715                 mutex_exit(&pidlock);
1716 
1717                 if (panicstr) {
1718                         mutex_exit(&p->p_lock);
1719                         return;
1720                 }
1721 
1722                 /* Try to set P_PR_LOCK */
1723                 ret = sprtrylock_proc(p);
1724                 if (ret == -1) {
1725                         /* Process in invalid state */
1726                         mutex_exit(&p->p_lock);
1727                         mutex_enter(&pidlock);
1728                         continue;
1729                 } else if (ret == 1) {
1730                         /*
1731                          * P_PR_LOCK is already set.  Wait and try again.
1732                          * This also drops p_lock.
1733                          */
1734                         sprwaitlock_proc(p);
1735                         mutex_enter(&pidlock);
1736                         goto again;
1737                 }
1738                 mutex_exit(&p->p_lock);
1739 
1740                 vmu_calculate_proc(p);
1741 
1742                 mutex_enter(&p->p_lock);
1743                 sprunlock(p);
1744                 mutex_enter(&pidlock);
1745         }
1746         mutex_exit(&pidlock);
1747 
1748         vmu_free_extra();
1749 }
1750 
1751 /*
1752  * Allocate a new cache for nres results satisfying the given flags.
1753  */
1754 vmu_cache_t *
1755 vmu_cache_alloc(size_t nres, uint_t flags)
1756 {
1757         vmu_cache_t *cache;
1758 
1759         cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1760         cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1761         cache->vmc_nresults = nres;
1762         cache->vmc_flags = flags;
1763         cache->vmc_refcnt = 1;
1764         return (cache);
1765 }
1766 
1767 /*
1768  * Make sure cached results are not freed
1769  */
1770 static void
1771 vmu_cache_hold(vmu_cache_t *cache)
1772 {
1773         ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1774         cache->vmc_refcnt++;
1775 }
1776 
1777 /*
1778  * Release a reference to the cache, freeing it when no references remain.
1779  */
1780 static void
1781 vmu_cache_rele(vmu_cache_t *cache)
1782 {
1783         ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1784         ASSERT(cache->vmc_refcnt > 0);
1785         cache->vmc_refcnt--;
1786         if (cache->vmc_refcnt == 0) {
1787                 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1788                     cache->vmc_nresults);
1789                 kmem_free(cache, sizeof (vmu_cache_t));
1790         }
1791 }
1792 
1793 /*
1794  * When new data is calculated, update the phys_mem rctl usage value in the
1795  * zones.
1796  */
1797 static void
1798 vmu_update_zone_rctls(vmu_cache_t *cache)
1799 {
1800         vmusage_t       *rp;
1801         size_t          i = 0;
1802         zone_t          *zp;
1803 
1804         for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1805                 if (rp->vmu_type == VMUSAGE_ZONE &&
1806                     rp->vmu_zoneid != ALL_ZONES) {
1807                         if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1808                                 zp->zone_phys_mem = rp->vmu_rss_all;
1809                                 zone_rele(zp);
1810                         }
1811                 }
1812         }
1813 }
1814 
1815 /*
1816  * Copy out the cached results to the caller.  Inspect the caller's flags
1817  * and zone to determine which cached results should be copied.
1818  */
1819 static int
1820 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1821     uint_t flags, id_t req_zone_id, int cpflg)
1822 {
1823         vmusage_t *result, *out_result;
1824         vmusage_t dummy;
1825         size_t i, count = 0;
1826         size_t bufsize;
1827         int ret = 0;
1828         uint_t types = 0;
1829 
1830         if (nres != NULL) {
1831                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1832                         return (set_errno(EFAULT));
1833         } else {
1834                 bufsize = 0;
1835         }
1836 
1837         /* figure out what results the caller is interested in. */
1838         if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1839                 types |= VMUSAGE_SYSTEM;
1840         if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1841                 types |= VMUSAGE_ZONE;
1842         if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1843             VMUSAGE_COL_PROJECTS))
1844                 types |= VMUSAGE_PROJECTS;
1845         if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1846                 types |= VMUSAGE_TASKS;
1847         if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1848                 types |= VMUSAGE_RUSERS;
1849         if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1850                 types |= VMUSAGE_EUSERS;
1851 
1852         /* count results for current zone */
1853         out_result = buf;
1854         for (result = cache->vmc_results, i = 0;
1855             i < cache->vmc_nresults; result++, i++) {
1856 
1857                 /* Do not return "other-zone" results to non-global zones */
1858                 if (curproc->p_zone != global_zone &&
1859                     curproc->p_zone->zone_id != result->vmu_zoneid)
1860                         continue;
1861 
1862                 /*
1863                  * If non-global zone requests VMUSAGE_SYSTEM, fake
1864                  * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1865                  */
1866                 if (curproc->p_zone != global_zone &&
1867                     (flags & VMUSAGE_SYSTEM) != 0 &&
1868                     result->vmu_type == VMUSAGE_ZONE) {
1869                         count++;
1870                         if (out_result != NULL) {
1871                                 if (bufsize < count) {
1872                                         ret = set_errno(EOVERFLOW);
1873                                 } else {
1874                                         dummy = *result;
1875                                         dummy.vmu_zoneid = ALL_ZONES;
1876                                         dummy.vmu_id = 0;
1877                                         dummy.vmu_type = VMUSAGE_SYSTEM;
1878                                         if (ddi_copyout(&dummy, out_result,
1879                                             sizeof (vmusage_t), cpflg))
1880                                                 return (set_errno(EFAULT));
1881                                         out_result++;
1882                                 }
1883                         }
1884                 }
1885 
1886                 /* Skip results that do not match requested type */
1887                 if ((result->vmu_type & types) == 0)
1888                         continue;
1889 
1890                 /* Skip collated results if not requested */
1891                 if (result->vmu_zoneid == ALL_ZONES) {
1892                         if (result->vmu_type == VMUSAGE_PROJECTS &&
1893                             (flags & VMUSAGE_COL_PROJECTS) == 0)
1894                                 continue;
1895                         if (result->vmu_type == VMUSAGE_EUSERS &&
1896                             (flags & VMUSAGE_COL_EUSERS) == 0)
1897                                 continue;
1898                         if (result->vmu_type == VMUSAGE_RUSERS &&
1899                             (flags & VMUSAGE_COL_RUSERS) == 0)
1900                                 continue;
1901                 }
1902 
1903                 if (result->vmu_type == VMUSAGE_ZONE &&
1904                     flags & VMUSAGE_A_ZONE) {
1905                         /* Skip non-requested zone results */
1906                         if (result->vmu_zoneid != req_zone_id)
1907                                 continue;
1908                 } else {
1909                         /* Skip "other zone" results if not requested */
1910                         if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1911                                 if (result->vmu_type == VMUSAGE_ZONE &&
1912                                     (flags & VMUSAGE_ALL_ZONES) == 0)
1913                                         continue;
1914                                 if (result->vmu_type == VMUSAGE_PROJECTS &&
1915                                     (flags & (VMUSAGE_ALL_PROJECTS |
1916                                     VMUSAGE_COL_PROJECTS)) == 0)
1917                                         continue;
1918                                 if (result->vmu_type == VMUSAGE_TASKS &&
1919                                     (flags & VMUSAGE_ALL_TASKS) == 0)
1920                                         continue;
1921                                 if (result->vmu_type == VMUSAGE_RUSERS &&
1922                                     (flags & (VMUSAGE_ALL_RUSERS |
1923                                     VMUSAGE_COL_RUSERS)) == 0)
1924                                         continue;
1925                                 if (result->vmu_type == VMUSAGE_EUSERS &&
1926                                     (flags & (VMUSAGE_ALL_EUSERS |
1927                                     VMUSAGE_COL_EUSERS)) == 0)
1928                                         continue;
1929                         }
1930                 }
1931                 count++;
1932                 if (out_result != NULL) {
1933                         if (bufsize < count) {
1934                                 ret = set_errno(EOVERFLOW);
1935                         } else {
1936                                 if (ddi_copyout(result, out_result,
1937                                     sizeof (vmusage_t), cpflg))
1938                                         return (set_errno(EFAULT));
1939                                 out_result++;
1940                         }
1941                 }
1942         }
1943         if (nres != NULL)
1944                 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1945                         return (set_errno(EFAULT));
1946 
1947         return (ret);
1948 }
1949 
1950 /*
1951  * vm_getusage()
1952  *
1953  * Counts rss and swap by zone, project, task, and/or user.  The flags argument
1954  * determines the type of results structures returned.  Flags requesting
1955  * results from more than one zone are "flattened" to the local zone if the
1956  * caller is not the global zone.
1957  *
1958  * args:
1959  *      flags:  bitmap consisting of one or more of VMUSAGE_*.
1960  *      age:    maximum allowable age (time since counting was done) in
1961  *              seconds of the results.  Results from previous callers are
1962  *              cached in kernel.
1963  *      buf:    pointer to buffer array of vmusage_t.  If NULL, then only nres
1964  *              is set on success.
1965  *      nres:   Set to number of vmusage_t structures pointed to by buf
1966  *              before calling vm_getusage().
1967  *              On return of 0 (success) or EOVERFLOW, nres is set to the
1968  *              number of result structures returned or attempted to return.
1969  *
1970  * returns 0 on success, -1 on failure:
1971  *      EINTR (interrupted)
1972  *      EOVERFLOW (nres too small; set to the value needed for success)
1973  *      EINVAL (flags invalid)
1974  *      EFAULT (bad address for buf or nres)
1975  */
1976 int
1977 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1978 {
1979         vmu_entity_t *entity;
1980         vmusage_t *result;
1981         int ret = 0;
1982         int cacherecent = 0;
1983         hrtime_t now;
1984         uint_t flags_orig;
1985         id_t req_zone_id;
1986 
1987         /*
1988          * Non-global zones cannot request system wide and/or collated
1989          * results, or the system result, or usage of another zone, so munge
1990          * the flags accordingly.
1991          */
1992         flags_orig = flags;
1993         if (curproc->p_zone != global_zone) {
1994                 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1995                         flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1996                         flags |= VMUSAGE_PROJECTS;
1997                 }
1998                 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1999                         flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
2000                         flags |= VMUSAGE_RUSERS;
2001                 }
2002                 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
2003                         flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
2004                         flags |= VMUSAGE_EUSERS;
2005                 }
2006                 if (flags & VMUSAGE_SYSTEM) {
2007                         flags &= ~VMUSAGE_SYSTEM;
2008                         flags |= VMUSAGE_ZONE;
2009                 }
2010                 if (flags & VMUSAGE_A_ZONE) {
2011                         flags &= ~VMUSAGE_A_ZONE;
2012                         flags |= VMUSAGE_ZONE;
2013                 }
2014         }
2015 
2016         /* Check for unknown flags */
2017         if ((flags & (~VMUSAGE_MASK)) != 0)
2018                 return (set_errno(EINVAL));
2019 
2020         /* Check for no flags */
2021         if ((flags & VMUSAGE_MASK) == 0)
2022                 return (set_errno(EINVAL));
2023 
2024         /* If requesting results for a specific zone, get the zone ID */
2025         if (flags & VMUSAGE_A_ZONE) {
2026                 size_t bufsize;
2027                 vmusage_t zreq;
2028 
2029                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2030                         return (set_errno(EFAULT));
2031                 /* Requested zone ID is passed in buf, so 0 len not allowed */
2032                 if (bufsize == 0)
2033                         return (set_errno(EINVAL));
2034                 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2035                         return (set_errno(EFAULT));
2036                 req_zone_id = zreq.vmu_id;
2037         }
2038 
2039         mutex_enter(&vmu_data.vmu_lock);
2040         now = gethrtime();
2041 
2042 start:
2043         if (vmu_data.vmu_cache != NULL) {
2044 
2045                 vmu_cache_t *cache;
2046 
2047                 if ((vmu_data.vmu_cache->vmc_timestamp +
2048                     ((hrtime_t)age * NANOSEC)) > now)
2049                         cacherecent = 1;
2050 
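                     /*
                      * For example, a caller passing age == 5 reuses results
                      * computed within the last five seconds, provided the
                      * cached flags cover the requested flags (checked just
                      * below), instead of forcing a new calculation.
                      */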
2051                 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2052                     cacherecent == 1) {
2053                         cache = vmu_data.vmu_cache;
2054                         vmu_cache_hold(cache);
2055                         mutex_exit(&vmu_data.vmu_lock);
2056 
2057                         ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2058                             req_zone_id, cpflg);
2059                         mutex_enter(&vmu_data.vmu_lock);
2060                         vmu_cache_rele(cache);
2061                         if (vmu_data.vmu_pending_waiters > 0)
2062                                 cv_broadcast(&vmu_data.vmu_cv);
2063                         mutex_exit(&vmu_data.vmu_lock);
2064                         return (ret);
2065                 }
2066                 /*
2067                  * If the cache is recent, it is likely that there are other
2068                  * consumers of vm_getusage running, so add their flags to the
2069                  * desired flags for the calculation.
2070                  */
2071                 if (cacherecent == 1)
2072                         flags = vmu_data.vmu_cache->vmc_flags | flags;
2073         }
2074         if (vmu_data.vmu_calc_thread == NULL) {
2075 
2076                 vmu_cache_t *cache;
2077 
2078                 vmu_data.vmu_calc_thread = curthread;
2079                 vmu_data.vmu_calc_flags = flags;
2080                 vmu_data.vmu_entities = NULL;
2081                 vmu_data.vmu_nentities = 0;
2082                 if (vmu_data.vmu_pending_waiters > 0)
2083                         vmu_data.vmu_calc_flags |=
2084                             vmu_data.vmu_pending_flags;
2085 
2086                 vmu_data.vmu_pending_flags = 0;
2087                 mutex_exit(&vmu_data.vmu_lock);
2088                 vmu_calculate();
2089                 mutex_enter(&vmu_data.vmu_lock);
2090                 /* copy results to cache */
2091                 if (vmu_data.vmu_cache != NULL)
2092                         vmu_cache_rele(vmu_data.vmu_cache);
2093                 cache = vmu_data.vmu_cache =
2094                     vmu_cache_alloc(vmu_data.vmu_nentities,
2095                     vmu_data.vmu_calc_flags);
2096 
2097                 result = cache->vmc_results;
2098                 for (entity = vmu_data.vmu_entities; entity != NULL;
2099                     entity = entity->vme_next) {
2100                         *result = entity->vme_result;
2101                         result++;
2102                 }
2103                 cache->vmc_timestamp = gethrtime();
2104                 vmu_cache_hold(cache);
2105 
2106                 vmu_data.vmu_calc_flags = 0;
2107                 vmu_data.vmu_calc_thread = NULL;
2108 
2109                 if (vmu_data.vmu_pending_waiters > 0)
2110                         cv_broadcast(&vmu_data.vmu_cv);
2111 
2112                 mutex_exit(&vmu_data.vmu_lock);
2113 
2114                 /* update zone's phys. mem. rctl usage */
2115                 vmu_update_zone_rctls(cache);
2116                 /* copy cache */
2117                 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2118                     req_zone_id, cpflg);
2119                 mutex_enter(&vmu_data.vmu_lock);
2120                 vmu_cache_rele(cache);
2121                 mutex_exit(&vmu_data.vmu_lock);
2122 
2123                 return (ret);
2124         }
2125         vmu_data.vmu_pending_flags |= flags;
2126         vmu_data.vmu_pending_waiters++;
2127         while (vmu_data.vmu_calc_thread != NULL) {
2128                 if (cv_wait_sig(&vmu_data.vmu_cv,
2129                     &vmu_data.vmu_lock) == 0) {
2130                         vmu_data.vmu_pending_waiters--;
2131                         mutex_exit(&vmu_data.vmu_lock);
2132                         return (set_errno(EINTR));
2133                 }
2134         }
2135         vmu_data.vmu_pending_waiters--;
2136         goto start;
2137 }
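     
     /*
      * Illustrative userland sketch (not part of this file): vm_getusage() is
      * normally reached through the getvmusage(2) libc wrapper.  Assuming
      * <sys/vm_usage.h> is available, a minimal two-pass caller might look
      * roughly like this; the flag choice, the 5 second age, and the error
      * handling are only examples:
      *
      *      #include <sys/vm_usage.h>
      *      #include <stdio.h>
      *      #include <stdlib.h>
      *
      *      int
      *      print_zone_usage(void)
      *      {
      *              size_t nres = 0;
      *              vmusage_t *buf;
      *              size_t i;
      *
      *              if (getvmusage(VMUSAGE_ALL_ZONES, 5, NULL, &nres) != 0)
      *                      return (-1);
      *              if (nres == 0)
      *                      return (0);
      *              if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
      *                      return (-1);
      *              if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0) {
      *                      free(buf);
      *                      return (-1);
      *              }
      *              for (i = 0; i < nres; i++)
      *                      (void) printf("zone %d: rss %llu swap %llu\n",
      *                          (int)buf[i].vmu_zoneid,
      *                          (unsigned long long)buf[i].vmu_rss_all,
      *                          (unsigned long long)buf[i].vmu_swap_all);
      *              free(buf);
      *              return (0);
      *      }
      */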
2138 
2139 #if defined(__x86)
2140 /*
2141  * Attempt to invalidate all of the pages in the mapping for the given process.
2142  */
2143 static void
2144 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2145 {
2146         page_t          *pp;
2147         size_t          psize;
2148         u_offset_t      off;
2149         caddr_t         eaddr;
2150         struct vnode    *vp;
2151         struct segvn_data *svd;
2152         struct hat      *victim_hat;
2153 
2154         ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2155 
2156         victim_hat = p->p_as->a_hat;
2157         svd = (struct segvn_data *)seg->s_data;
2158         vp = svd->vp;
2159         psize = page_get_pagesize(seg->s_szc);
2160 
2161         off = svd->offset + (uintptr_t)(addr - seg->s_base);
2162 
2163         for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2164                 pp = page_lookup_nowait(vp, off, SE_SHARED);
2165 
2166                 if (pp != NULL) {
2167                         /* following logic based on pvn_getdirty() */
2168 
2169                         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2170                                 page_unlock(pp);
2171                                 continue;
2172                         }
2173 
2174                         page_io_lock(pp);
2175                         hat_page_inval(pp, 0, victim_hat);
2176                         page_io_unlock(pp);
2177 
2178                         /*
2179                          * For B_INVALCURONLY-style handling we let
2180                          * page_release call VN_DISPOSE if no one else is using
2181                          * the page.
2182                          *
2183                          * A hat_ismod() check would be useless because:
2184                          * (1) we are not holding the SE_EXCL lock
2185                          * (2) we've not unloaded _all_ translations
2186                          *
2187                          * Let page_release() do the heavy-lifting.
2188                          */
2189                         (void) page_release(pp, 1);
2190                 }
2191         }
2192 }
2193 
2194 /*
2195  * vm_map_inval()
2196  *
2197  * Invalidate as many pages as possible within the given mapping for the given
2198  * process. addr is expected to be the base address of the mapping and size is
2199  * the length of the mapping. In some cases a mapping will encompass an
2200  * entire segment, but at least for anon or stack mappings, these will be
2201  * regions within a single large segment. Thus, the invalidation is oriented
2202  * around a single mapping and not an entire segment.
2203  *
2204  * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
2205  * this code is only applicable to x86.
2206  */
2207 int
2208 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2209 {
2210         int ret;
2211         int error = 0;
2212         proc_t *p;              /* target proc */
2213         struct as *as;          /* target proc's address space */
2214         struct seg *seg;        /* working segment */
2215 
2216         if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2217                 return (set_errno(EPERM));
2218 
2219         /* If not a valid mapping address, return an error */
2220         if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2221                 return (set_errno(EINVAL));
2222 
2223 again:
2224         mutex_enter(&pidlock);
2225         p = prfind(pid);
2226         if (p == NULL) {
2227                 mutex_exit(&pidlock);
2228                 return (set_errno(ESRCH));
2229         }
2230 
2231         mutex_enter(&p->p_lock);
2232         mutex_exit(&pidlock);
2233 
2234         if (panicstr != NULL) {
2235                 mutex_exit(&p->p_lock);
2236                 return (0);
2237         }
2238 
2239         as = p->p_as;
2240 
2241         /*
2242          * Try to set P_PR_LOCK - prevents process "changing shape"
2243          * - blocks fork
2244          * - blocks sigkill
2245          * - cannot be a system proc
2246          * - must be fully created proc
2247          */
2248         ret = sprtrylock_proc(p);
2249         if (ret == -1) {
2250                 /* Process in invalid state */
2251                 mutex_exit(&p->p_lock);
2252                 return (set_errno(ESRCH));
2253         }
2254 
2255         if (ret == 1) {
2256                 /*
2257                  * P_PR_LOCK is already set. Wait and try again. This also
2258                  * drops p_lock so p may no longer be valid since the proc may
2259                  * have exited.
2260                  */
2261                 sprwaitlock_proc(p);
2262                 goto again;
2263         }
2264 
2265         /* P_PR_LOCK is now set */
2266         mutex_exit(&p->p_lock);
2267 
2268         AS_LOCK_ENTER(as, RW_READER);
2269         if ((seg = as_segat(as, addr)) == NULL) {
2270                 AS_LOCK_EXIT(as);
2271                 mutex_enter(&p->p_lock);
2272                 sprunlock(p);
2273                 return (set_errno(ENOMEM));
2274         }
2275 
2276         /*
2277          * The invalidation behavior only makes sense for vnode-backed segments.
2278          */
2279         if (seg->s_ops != &segvn_ops) {
2280                 AS_LOCK_EXIT(as);
2281                 mutex_enter(&p->p_lock);
2282                 sprunlock(p);
2283                 return (0);
2284         }
2285 
2286         /*
2287          * If the mapping is out of bounds of the segment, return an error.
2288          */
2289         if ((addr + size) > (seg->s_base + seg->s_size)) {
2290                 AS_LOCK_EXIT(as);
2291                 mutex_enter(&p->p_lock);
2292                 sprunlock(p);
2293                 return (set_errno(EINVAL));
2294         }
2295 
2296         /*
2297          * Don't use MS_INVALCURPROC flag here since that would eventually
2298          * initiate hat invalidation based on curthread. Since we're doing this
2299          * on behalf of a different process, that would erroneously invalidate
2300          * our own process mappings.
2301          */
2302         error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2303         if (error == 0) {
2304                 /*
2305                  * Since we didn't invalidate during the sync above, we now
2306                  * try to invalidate all of the pages in the mapping.
2307                  */
2308                 map_inval(p, seg, addr, size);
2309         }
2310         AS_LOCK_EXIT(as);
2311 
2312         mutex_enter(&p->p_lock);
2313         sprunlock(p);
2314 
2315         if (error)
2316                 (void) set_errno(error);
2317         return (error);
2318 }
2319 #endif