1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2016, Joyent, Inc.
  29  */
  30 
  31 /*
  32  * vm_usage
  33  *
  34  * This file implements the getvmusage() private system call.
  35  * getvmusage() counts the number of resident memory pages and the swap
  36  * reserved by the specified process collective. A "process collective" is
  37  * the set of processes owned by a particular zone, project, task, or user.
  38  *
  39  * rss and swap are counted so that for a given process collective, a page is
  40  * only counted once.  For example, this means that if multiple processes in
  41  * the same project map the same page, then the project will only be charged
  42  * once for that page.  On the other hand, if two processes in different
  43  * projects map the same page, then both projects will be charged
  44  * for the page.
  45  *
  46  * The vm_getusage() calculation is implemented so that the first thread
  47  * performs the rss/swap counting.  Other callers wait for that thread to
  48  * finish and then copy its results.  This enables multiple rcapds and prstats to
  49  * consume data from the same calculation.  The results are also cached so that
  50  * a caller interested in recent results can just copy them instead of starting
  51  * a new calculation. The caller passes the maximum age (in seconds) of the
  52  * data.  If the cached data is young enough, the cache is copied, otherwise,
  53  * a new calculation is executed and the cache is replaced with the new
  54  * data.
  55  *
  56  * The rss calculation for each process collective is as follows:
  57  *
  58  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
  59  *     and/or users.
  60  *   - For each proc:
  61  *      - Figure out proc's collectives (zone, project, task, and/or user).
  62  *      - For each seg in proc's address space:
  63  *              - If seg is private:
  64  *                      - Lookup anons in the amp.
  65  *                      - For incore pages not previously visited for each
  66  *                        of the proc's collectives, add the incore pagesize
  67  *                        to each collective.
  68  *                        Anons with a refcnt of 1 can be assumed to be not
  69  *                        previously visited.
  70  *                      - For address ranges without anons in the amp:
  71  *                              - Lookup pages in underlying vnode.
  72  *                      - For incore pages not previously visited for
  73  *                                each of the proc's collectives, add incore
  74  *                                pagesize to each collective.
  75  *              - If seg is shared:
  76  *                      - Lookup pages in the shared amp or vnode.
  77  *                      - For incore pages not previously visited for each of
  78  *                        the proc's collectives, add incore pagesize to each
  79  *                        collective.
  80  *
  81  * Swap is reserved by private segments and shared anonymous segments.
  82  * The only shared anon segments which do not reserve swap are ISM segments
  83  * and schedctl segments, both of which can be identified by having
  84  * amp->swresv == 0.
  85  *
  86  * The swap calculation for each collective is as follows:
  87  *
  88  *   - Inspect flags, determine if counting swap for zones, projects, tasks,
  89  *     and/or users.
  90  *   - For each proc:
  91  *      - Figure out proc's collectives (zone, project, task, and/or user).
  92  *      - For each seg in proc's address space:
  93  *              - If seg is private:
  94  *                      - Add svd->swresv pages to swap count for each of the
  95  *                        proc's collectives.
  96  *              - If seg is anon, shared, and amp->swresv != 0:
  97  *                      - For address ranges in amp not previously visited for
  98  *                        each of the proc's collectives, add size of address
  99  *                        range to the swap count for each collective.
 100  *
 101  * These two calculations are done simultaneously, with most of the work
 102  * being done in vmu_calculate_seg().  The results of the calculation are
 103  * copied into "vmu_data.vmu_cache_results".
 104  *
 105  * To perform the calculation, various things are tracked and cached:
 106  *
 107  *    - incore/not-incore page ranges for all vnodes.
 108  *      (vmu_data.vmu_all_vnodes_hash)
 109  *      This eliminates looking up the same page more than once.
 110  *
 111  *    - incore/not-incore page ranges for all shared amps.
 112  *      (vmu_data.vmu_all_amps_hash)
 113  *      This eliminates looking up the same page more than once.
 114  *
 115  *    - visited page ranges for each collective.
 116  *         - per vnode (entity->vme_vnode_hash)
 117  *         - per shared amp (entity->vme_amp_hash)
 118  *      For accurate counting of map-shared and COW-shared pages.
 119  *
 120  *    - visited private anons (refcnt > 1) for each collective.
 121  *      (entity->vme_anon_hash)
 122  *      For accurate counting of COW-shared pages.
 123  *
 124  * The common accounting structure is the vmu_entity_t, which represents
 125  * collectives:
 126  *
 127  *    - A zone.
 128  *    - A project, task, or user within a zone.
 129  *    - The entire system (vmu_data.vmu_system).
 130  *    - Each collapsed (col) project and user.  This means a given projid or
 131  *      uid, regardless of which zone the process is in.  For instance,
 132  *      project 0 in the global zone and project 0 in a non-global zone are
 133  *      the same collapsed project.
 134  *
 135  *  Each entity structure tracks which pages have been already visited for
 136  *  that entity (via previously inspected processes) so that these pages are
 137  *  not double counted.
 138  */
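
/*
 * Illustrative sketch (an assumption for documentation purposes, not part of
 * this file): how a userland consumer such as prstat or rcapd might invoke
 * getvmusage().  The prototype and flag definitions live in <sys/vm_usage.h>;
 * see getvmusage(2) for the authoritative interface.  The age argument of 5
 * accepts cached results up to 5 seconds old, and the fixed-size buffer is a
 * simplification; a real caller would resize and retry based on *nres.
 *
 *	vmusage_t buf[32];
 *	size_t nres = sizeof (buf) / sizeof (buf[0]);
 *
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) == 0) {
 *		for (size_t i = 0; i < nres; i++)
 *			(void) printf("zone %d rss %llu\n",
 *			    (int)buf[i].vmu_zoneid,
 *			    (u_longlong_t)buf[i].vmu_rss_all);
 *	}
 */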
 139 
 140 #include <sys/errno.h>
 141 #include <sys/types.h>
 142 #include <sys/zone.h>
 143 #include <sys/proc.h>
 144 #include <sys/project.h>
 145 #include <sys/task.h>
 146 #include <sys/thread.h>
 147 #include <sys/time.h>
 148 #include <sys/mman.h>
 149 #include <sys/modhash.h>
 150 #include <sys/modhash_impl.h>
 151 #include <sys/shm.h>
 152 #include <sys/swap.h>
 153 #include <sys/synch.h>
 154 #include <sys/systm.h>
 155 #include <sys/var.h>
 156 #include <sys/vm_usage.h>
 157 #include <sys/zone.h>
 158 #include <sys/sunddi.h>
 159 #include <sys/avl.h>
 160 #include <vm/anon.h>
 161 #include <vm/as.h>
 162 #include <vm/seg_vn.h>
 163 #include <vm/seg_spt.h>
 164 
 165 #define VMUSAGE_HASH_SIZE               512
 166 
 167 #define VMUSAGE_TYPE_VNODE              1
 168 #define VMUSAGE_TYPE_AMP                2
 169 #define VMUSAGE_TYPE_ANON               3
 170 
 171 #define VMUSAGE_BOUND_UNKNOWN           0
 172 #define VMUSAGE_BOUND_INCORE            1
 173 #define VMUSAGE_BOUND_NOT_INCORE        2
 174 
 175 #define ISWITHIN(node, addr)    ((node)->vmb_start <= (addr) && \
 176                                     (node)->vmb_end >= (addr) ? 1 : 0)
 177 
 178 /*
 179  * Bounds for vnodes and shared amps.
 180  * Each bound is either entirely incore, entirely not incore, or
 181  * entirely unknown.  Bounds are stored in an AVL tree sorted by the
 182  * vmb_start member while in use; otherwise (on the free or temporary
 183  * lists) they are strung together off of vmb_next.
 184  */
 185 typedef struct vmu_bound {
 186         avl_node_t vmb_node;
 187         struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
 188         pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
 189         pgcnt_t vmb_end;    /* page offset in vnode/amp on which bound ends */
 190         char    vmb_type;   /* One of VMUSAGE_BOUND_* */
 191 } vmu_bound_t;
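
/*
 * For example (illustrative only), a ten-page object in which pages 0-3 are
 * known to be resident, pages 4-7 are known not to be resident, and pages
 * 8-9 have not been inspected yet would be described by three bounds in the
 * object's AVL tree:
 *
 *	[0, 3]	VMUSAGE_BOUND_INCORE
 *	[4, 7]	VMUSAGE_BOUND_NOT_INCORE
 *	[8, 9]	VMUSAGE_BOUND_UNKNOWN
 */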
 192 
 193 /*
 194  * Hash of visited objects (vnodes or shared amps).
 195  * The key is the address of the vnode or amp.  The bounds tree records the
 196  * known incore/not-incore ranges for the vnode/amp.
 197  */
 198 typedef struct vmu_object {
 199         struct vmu_object       *vmo_next;      /* free list */
 200         caddr_t         vmo_key;
 201         short           vmo_type;
 202         avl_tree_t      vmo_bounds;
 203 } vmu_object_t;
 204 
 205 /*
 206  * Entity by which to count results.
 207  *
 208  * The entity structure keeps the current rss/swap counts for each entity
 209  * (zone, project, etc), and hashes of vm structures that have already
 210  * been visited for the entity.
 211  *
 212  * vme_next:    links the list of all entities currently being counted by
 213  *              vmu_calculate().
 214  *
 215  * vme_next_calc: links the list of entities related to the current process
 216  *               being counted by vmu_calculate_proc().
 217  *
 218  * vmu_calculate_proc() walks all processes.  For each process, it makes a
 219  * list of the entities related to that process using vme_next_calc.  This
 220  * list changes each time vmu_calculate_proc() is called.
 221  *
 222  */
 223 typedef struct vmu_entity {
 224         struct vmu_entity *vme_next;
 225         struct vmu_entity *vme_next_calc;
 226         mod_hash_t      *vme_vnode_hash; /* vnodes visited for entity */
 227         mod_hash_t      *vme_amp_hash;   /* shared amps visited for entity */
 228         mod_hash_t      *vme_anon_hash;  /* COW anons visited for entity */
 229         vmusage_t       vme_result;      /* identifies entity and results */
 230 } vmu_entity_t;
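
/*
 * As an illustration (the exact set depends on vmu_calc_flags), while
 * vmu_calculate_proc() is examining a process in zone 5, project 10, its
 * vme_next_calc list might link together the zone 5 entity, the project 10
 * entity, the task and user entities, the corresponding collapsed entities,
 * and vmu_data.vmu_system.
 */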
 231 
 232 /*
 233  * Hash of entities visited within a zone, and an entity for the zone
 234  * itself.
 235  */
 236 typedef struct vmu_zone {
 237         struct vmu_zone *vmz_next;      /* free list */
 238         id_t            vmz_id;
 239         vmu_entity_t    *vmz_zone;
 240         mod_hash_t      *vmz_projects_hash;
 241         mod_hash_t      *vmz_tasks_hash;
 242         mod_hash_t      *vmz_rusers_hash;
 243         mod_hash_t      *vmz_eusers_hash;
 244 } vmu_zone_t;
 245 
 246 /*
 247  * Cache of results from last calculation
 248  */
 249 typedef struct vmu_cache {
 250         vmusage_t       *vmc_results;   /* Results from last call to */
 251                                         /* vm_getusage(). */
 252         uint64_t        vmc_nresults;   /* Count of cached results */
 253         uint64_t        vmc_refcnt;     /* refcnt for free */
 254         uint_t          vmc_flags;      /* Flags for vm_getusage() */
 255         hrtime_t        vmc_timestamp;  /* when cache was created */
 256 } vmu_cache_t;
 257 
 258 /*
 259  * Top-level rss/swap accounting info for the system
 260  */
 261 typedef struct vmu_data {
 262         kmutex_t        vmu_lock;               /* Protects vmu_data */
 263         kcondvar_t      vmu_cv;                 /* Used to signal threads */
 264                                                 /* waiting for the calc */
 265                                                 /* thread to finish */
 266         vmu_entity_t    *vmu_system;            /* Entity for tracking */
 267                                                 /* rss/swap for all processes */
 268                                                 /* in all zones */
 269         mod_hash_t      *vmu_zones_hash;        /* Zones visited */
 270         mod_hash_t      *vmu_projects_col_hash; /* These *_col_hash hashes */
 271         mod_hash_t      *vmu_rusers_col_hash;   /* keep track of entities, */
 272         mod_hash_t      *vmu_eusers_col_hash;   /* ignoring zoneid, in order */
 273                                                 /* to implement VMUSAGE_COL_* */
 274                                                 /* flags, which aggregate by */
 275                                                 /* project or user regardless */
 276                                                 /* of zoneid. */
 277         mod_hash_t      *vmu_all_vnodes_hash;   /* System wide visited vnodes */
 278                                                 /* to track incore/not-incore */
 279         mod_hash_t      *vmu_all_amps_hash;     /* System wide visited shared */
 280                                                 /* amps to track incore/not- */
 281                                                 /* incore */
 282         vmu_entity_t    *vmu_entities;          /* Linked list of entities */
 283         size_t          vmu_nentities;          /* Count of entities in list */
 284         vmu_cache_t     *vmu_cache;             /* Cached results */
 285         kthread_t       *vmu_calc_thread;       /* NULL, or thread running */
 286                                                 /* vmu_calculate() */
 287         uint_t          vmu_calc_flags;         /* Flags being used by */
 288                                                 /* currently running calc */
 289                                                 /* thread */
 290         uint_t          vmu_pending_flags;      /* Flags of vm_getusage() */
 291                                                 /* threads waiting for */
 292                                                 /* calc thread to finish */
 293         uint_t          vmu_pending_waiters;    /* Number of threads waiting */
 294                                                 /* for calc thread */
 295         vmu_bound_t     *vmu_free_bounds;
 296         vmu_object_t    *vmu_free_objects;
 297         vmu_entity_t    *vmu_free_entities;
 298         vmu_zone_t      *vmu_free_zones;
 299 } vmu_data_t;
 300 
 301 extern struct as kas;
 302 extern proc_t *practive;
 303 extern zone_t *global_zone;
 304 extern struct seg_ops segvn_ops;
 305 extern struct seg_ops segspt_shmops;
 306 
 307 static vmu_data_t vmu_data;
 308 static kmem_cache_t *vmu_bound_cache;
 309 static kmem_cache_t *vmu_object_cache;
 310 
 311 /*
 312  * Comparison routine for AVL tree. We base our comparison on vmb_start.
 313  */
 314 static int
 315 bounds_cmp(const void *bnd1, const void *bnd2)
 316 {
 317         const vmu_bound_t *bound1 = bnd1;
 318         const vmu_bound_t *bound2 = bnd2;
 319 
 320         if (bound1->vmb_start == bound2->vmb_start) {
 321                 return (0);
 322         }
 323         if (bound1->vmb_start < bound2->vmb_start) {
 324                 return (-1);
 325         }
 326 
 327         return (1);
 328 }
 329 
 330 /*
 331  * Save a bound on the free list.
 332  */
 333 static void
 334 vmu_free_bound(vmu_bound_t *bound)
 335 {
 336         bound->vmb_next = vmu_data.vmu_free_bounds;
 337         bound->vmb_start = 0;
 338         bound->vmb_end = 0;
 339         bound->vmb_type = 0;
 340         vmu_data.vmu_free_bounds = bound;
 341 }
 342 
 343 /*
 344  * Free an object, and all visited bound info.
 345  */
 346 static void
 347 vmu_free_object(mod_hash_val_t val)
 348 {
 349         vmu_object_t *obj = (vmu_object_t *)val;
 350         avl_tree_t *tree = &(obj->vmo_bounds);
 351         vmu_bound_t *bound;
 352         void *cookie = NULL;
 353 
 354         while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
 355                 vmu_free_bound(bound);
 356         avl_destroy(tree);
 357 
 358         obj->vmo_type = 0;
 359         obj->vmo_next = vmu_data.vmu_free_objects;
 360         vmu_data.vmu_free_objects = obj;
 361 }
 362 
 363 /*
 364  * Free an entity, and hashes of visited objects for that entity.
 365  */
 366 static void
 367 vmu_free_entity(mod_hash_val_t val)
 368 {
 369         vmu_entity_t *entity = (vmu_entity_t *)val;
 370 
 371         if (entity->vme_vnode_hash != NULL)
 372                 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
 373         if (entity->vme_amp_hash != NULL)
 374                 i_mod_hash_clear_nosync(entity->vme_amp_hash);
 375         if (entity->vme_anon_hash != NULL)
 376                 i_mod_hash_clear_nosync(entity->vme_anon_hash);
 377 
 378         entity->vme_next = vmu_data.vmu_free_entities;
 379         vmu_data.vmu_free_entities = entity;
 380 }
 381 
 382 /*
 383  * Free zone entity, and all hashes of entities inside that zone,
 384  * which are projects, tasks, and users.
 385  */
 386 static void
 387 vmu_free_zone(mod_hash_val_t val)
 388 {
 389         vmu_zone_t *zone = (vmu_zone_t *)val;
 390 
 391         if (zone->vmz_zone != NULL) {
 392                 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
 393                 zone->vmz_zone = NULL;
 394         }
 395         if (zone->vmz_projects_hash != NULL)
 396                 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
 397         if (zone->vmz_tasks_hash != NULL)
 398                 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
 399         if (zone->vmz_rusers_hash != NULL)
 400                 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
 401         if (zone->vmz_eusers_hash != NULL)
 402                 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
 403         zone->vmz_next = vmu_data.vmu_free_zones;
 404         vmu_data.vmu_free_zones = zone;
 405 }
 406 
 407 /*
 408  * Initialize synchronization primitives and hashes for system-wide tracking
 409  * of visited vnodes and shared amps.  Initialize results cache.
 410  */
 411 void
 412 vm_usage_init(void)
 413 {
 414         mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
 415         cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
 416 
 417         vmu_data.vmu_system = NULL;
 418         vmu_data.vmu_zones_hash = NULL;
 419         vmu_data.vmu_projects_col_hash = NULL;
 420         vmu_data.vmu_rusers_col_hash = NULL;
 421         vmu_data.vmu_eusers_col_hash = NULL;
 422 
 423         vmu_data.vmu_free_bounds = NULL;
 424         vmu_data.vmu_free_objects = NULL;
 425         vmu_data.vmu_free_entities = NULL;
 426         vmu_data.vmu_free_zones = NULL;
 427 
 428         vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
 429             "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 430             sizeof (vnode_t));
 431         vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
 432             "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 433             sizeof (struct anon_map));
 434         vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
 435             "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
 436             vmu_free_entity);
 437         vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
 438             "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
 439             vmu_free_entity);
 440         vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
 441             "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
 442             vmu_free_entity);
 443         vmu_data.vmu_zones_hash = mod_hash_create_idhash(
 444             "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
 445 
 446         vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
 447             sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 448         vmu_object_cache = kmem_cache_create("vmu_object_cache",
 449             sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 450 
 451         vmu_data.vmu_entities = NULL;
 452         vmu_data.vmu_nentities = 0;
 453 
 454         vmu_data.vmu_cache = NULL;
 455         vmu_data.vmu_calc_thread = NULL;
 456         vmu_data.vmu_calc_flags = 0;
 457         vmu_data.vmu_pending_flags = 0;
 458         vmu_data.vmu_pending_waiters = 0;
 459 }
 460 
 461 /*
 462  * Allocate hashes for tracking vm objects visited for an entity.
 463  * Update list of entities.
 464  */
 465 static vmu_entity_t *
 466 vmu_alloc_entity(id_t id, int type, id_t zoneid)
 467 {
 468         vmu_entity_t *entity;
 469 
 470         if (vmu_data.vmu_free_entities != NULL) {
 471                 entity = vmu_data.vmu_free_entities;
 472                 vmu_data.vmu_free_entities =
 473                     vmu_data.vmu_free_entities->vme_next;
 474                 bzero(&entity->vme_result, sizeof (vmusage_t));
 475         } else {
 476                 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
 477         }
 478         entity->vme_result.vmu_id = id;
 479         entity->vme_result.vmu_zoneid = zoneid;
 480         entity->vme_result.vmu_type = type;
 481 
 482         if (entity->vme_vnode_hash == NULL)
 483                 entity->vme_vnode_hash = mod_hash_create_ptrhash(
 484                     "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 485                     sizeof (vnode_t));
 486 
 487         if (entity->vme_amp_hash == NULL)
 488                 entity->vme_amp_hash = mod_hash_create_ptrhash(
 489                     "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
 490                     sizeof (struct anon_map));
 491 
 492         if (entity->vme_anon_hash == NULL)
 493                 entity->vme_anon_hash = mod_hash_create_ptrhash(
 494                     "vmusage anon hash", VMUSAGE_HASH_SIZE,
 495                     mod_hash_null_valdtor, sizeof (struct anon));
 496 
 497         entity->vme_next = vmu_data.vmu_entities;
 498         vmu_data.vmu_entities = entity;
 499         vmu_data.vmu_nentities++;
 500 
 501         return (entity);
 502 }
 503 
 504 /*
 505  * Allocate a zone entity, and hashes for tracking visited vm objects
 506  * for projects, tasks, and users within that zone.
 507  */
 508 static vmu_zone_t *
 509 vmu_alloc_zone(id_t id)
 510 {
 511         vmu_zone_t *zone;
 512 
 513         if (vmu_data.vmu_free_zones != NULL) {
 514                 zone = vmu_data.vmu_free_zones;
 515                 vmu_data.vmu_free_zones =
 516                     vmu_data.vmu_free_zones->vmz_next;
 517                 zone->vmz_next = NULL;
 518                 zone->vmz_zone = NULL;
 519         } else {
 520                 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
 521         }
 522 
 523         zone->vmz_id = id;
 524 
 525         if ((vmu_data.vmu_calc_flags &
 526             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
 527                 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
 528 
 529         if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
 530             VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
 531                 zone->vmz_projects_hash = mod_hash_create_idhash(
 532                     "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 533 
 534         if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
 535             != 0 && zone->vmz_tasks_hash == NULL)
 536                 zone->vmz_tasks_hash = mod_hash_create_idhash(
 537                     "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 538 
 539         if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
 540             != 0 && zone->vmz_rusers_hash == NULL)
 541                 zone->vmz_rusers_hash = mod_hash_create_idhash(
 542                     "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 543 
 544         if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
 545             != 0 && zone->vmz_eusers_hash == NULL)
 546                 zone->vmz_eusers_hash = mod_hash_create_idhash(
 547                     "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
 548 
 549         return (zone);
 550 }
 551 
 552 /*
 553  * Allocate a structure for tracking visited bounds for a vm object.
 554  */
 555 static vmu_object_t *
 556 vmu_alloc_object(caddr_t key, int type)
 557 {
 558         vmu_object_t *object;
 559 
 560         if (vmu_data.vmu_free_objects != NULL) {
 561                 object = vmu_data.vmu_free_objects;
 562                 vmu_data.vmu_free_objects =
 563                     vmu_data.vmu_free_objects->vmo_next;
 564         } else {
 565                 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
 566         }
 567 
 568         object->vmo_next = NULL;
 569         object->vmo_key = key;
 570         object->vmo_type = type;
 571         avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
 572 
 573         return (object);
 574 }
 575 
 576 /*
 577  * Allocate and return a bound structure.
 578  */
 579 static vmu_bound_t *
 580 vmu_alloc_bound(void)
 581 {
 582         vmu_bound_t *bound;
 583 
 584         if (vmu_data.vmu_free_bounds != NULL) {
 585                 bound = vmu_data.vmu_free_bounds;
 586                 vmu_data.vmu_free_bounds =
 587                     vmu_data.vmu_free_bounds->vmb_next;
 588         } else {
 589                 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
 590         }
 591 
 592         bound->vmb_next = NULL;
 593         bound->vmb_start = 0;
 594         bound->vmb_end = 0;
 595         bound->vmb_type = 0;
 596         return (bound);
 597 }
 598 
 599 /*
 600  * The vmu_find_insert_* functions implement a hash lookup that allocates
 601  * and inserts a new element when the lookup fails.
 602  */
 603 static vmu_object_t *
 604 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
 605 {
 606         int ret;
 607         vmu_object_t *object;
 608 
 609         ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
 610             (mod_hash_val_t *)&object);
 611         if (ret != 0) {
 612                 object = vmu_alloc_object(key, type);
 613                 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
 614                     (mod_hash_val_t)object, (mod_hash_hndl_t)0);
 615                 ASSERT(ret == 0);
 616         }
 617         return (object);
 618 }
 619 
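/*
 * Returns 0 if the anon was already present in the hash (i.e. it has already
 * been counted for this entity); otherwise the anon is inserted and 1 is
 * returned.
 */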
 620 static int
 621 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
 622 {
 623         int ret;
 624         caddr_t val;
 625 
 626         ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
 627             (mod_hash_val_t *)&val);
 628 
 629         if (ret == 0)
 630                 return (0);
 631 
 632         ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
 633             (mod_hash_val_t)key, (mod_hash_hndl_t)0);
 634 
 635         ASSERT(ret == 0);
 636 
 637         return (1);
 638 }
 639 
 640 static vmu_entity_t *
 641 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
 642 {
 643         int ret;
 644         vmu_entity_t *entity;
 645 
 646         ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
 647             (mod_hash_val_t *)&entity);
 648         if (ret != 0) {
 649                 entity = vmu_alloc_entity(id, type, zoneid);
 650                 ret = i_mod_hash_insert_nosync(hash,
 651                     (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
 652                     (mod_hash_hndl_t)0);
 653                 ASSERT(ret == 0);
 654         }
 655         return (entity);
 656 }
 657 
 658 
 659 
 660 
 661 /*
 662  * Returns, via *first and *last, the list of object bounds between start
 663  * and end.  New bounds inserted by this call are given the specified type.
 664  *
 665  * Returns the number of pages newly covered if new bounds are created, or 0
 666  * if the region between start and end is already covered by existing bounds.
 667  */
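/*
 * For example (illustrative only): if the tree already holds the bound
 * [0, 4] and the caller asks for [0, 9], a new bound [5, 9] of the given
 * type is inserted, *first points at [0, 4], *last points at [5, 9], and 5
 * is returned for the newly covered pages.
 */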
 668 static pgcnt_t
 669 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
 670     end, char type, vmu_bound_t **first, vmu_bound_t **last)
 671 {
 672         avl_tree_t      *tree = &(ro->vmo_bounds);
 673         avl_index_t     where;
 674         vmu_bound_t     *walker, *tmp;
 675         pgcnt_t         ret = 0;
 676 
 677         ASSERT(start <= end);
 678 
 679         *first = *last = NULL;
 680 
 681         tmp = vmu_alloc_bound();
 682         tmp->vmb_start = start;
 683         tmp->vmb_type = type;
 684 
 685         /* Hopelessly optimistic case. */
 686         if ((walker = avl_find(tree, tmp, &where)) != NULL) {
 687                 /* We got lucky. */
 688                 vmu_free_bound(tmp);
 689                 *first = walker;
 690         }
 691 
 692         if (walker == NULL) {
 693                 /* Is start in the previous node? */
 694                 walker = avl_nearest(tree, where, AVL_BEFORE);
 695                 if (walker != NULL) {
 696                         if (ISWITHIN(walker, start)) {
 697                                 /* We found start. */
 698                                 vmu_free_bound(tmp);
 699                                 *first = walker;
 700                         }
 701                 }
 702         }
 703 
 704         /*
 705          * At this point, if *first is still NULL, then we
 706          * didn't get a direct hit and start isn't covered
 707          * by the previous node. We know that the next node
 708          * must have a greater start value than we require
 709          * because avl_find tells us where the AVL routines would
 710          * insert our new node. We have some gap between the
 711          * start we want and the next node.
 712          */
 713         if (*first == NULL) {
 714                 walker = avl_nearest(tree, where, AVL_AFTER);
 715                 if (walker != NULL && walker->vmb_start <= end) {
 716                         /* Fill the gap. */
 717                         tmp->vmb_end = walker->vmb_start - 1;
 718                         *first = tmp;
 719                 } else {
 720                         /* We have a gap over [start, end]. */
 721                         tmp->vmb_end = end;
 722                         *first = *last = tmp;
 723                 }
 724                 ret += tmp->vmb_end - tmp->vmb_start + 1;
 725                 avl_insert(tree, tmp, where);
 726         }
 727 
 728         ASSERT(*first != NULL);
 729 
 730         if (*last != NULL) {
 731                 /* We're done. */
 732                 return (ret);
 733         }
 734 
 735         /*
 736          * If we are here we still need to set *last and
 737          * that may involve filling in some gaps.
 738          */
 739         *last = *first;
 740         for (;;) {
 741                 if (ISWITHIN(*last, end)) {
 742                         /* We're done. */
 743                         break;
 744                 }
 745                 walker = AVL_NEXT(tree, *last);
 746                 if (walker == NULL || walker->vmb_start > end) {
 747                         /* Bottom or mid tree with gap. */
 748                         tmp = vmu_alloc_bound();
 749                         tmp->vmb_start = (*last)->vmb_end + 1;
 750                         tmp->vmb_end = end;
 751                         tmp->vmb_type = type;
 752                         ret += tmp->vmb_end - tmp->vmb_start + 1;
 753                         avl_insert_here(tree, tmp, *last, AVL_AFTER);
 754                         *last = tmp;
 755                         break;
 756                 } else {
 757                         if ((*last)->vmb_end + 1 != walker->vmb_start) {
 758                                 /* Non-contiguous. */
 759                                 tmp = vmu_alloc_bound();
 760                                 tmp->vmb_start = (*last)->vmb_end + 1;
 761                                 tmp->vmb_end = walker->vmb_start - 1;
 762                                 tmp->vmb_type = type;
 763                                 ret += tmp->vmb_end - tmp->vmb_start + 1;
 764                                 avl_insert_here(tree, tmp, *last, AVL_AFTER);
 765                                 *last = tmp;
 766                         } else {
 767                                 *last = walker;
 768                         }
 769                 }
 770         }
 771 
 772         return (ret);
 773 }
 774 
 775 /*
 776  * vmu_update_bounds()
 777  *
 778  * tree: avl_tree in which first and last hang.
 779  *
 780  * first, last: list of contiguous bounds, of which zero or more are of
 781  *              type VMUSAGE_BOUND_UNKNOWN.
 782  *
 783  * new_tree: avl_tree in which new_first and new_last hang.
 784  *
 785  * new_first, new_last: list of contiguous bounds, of which none are of
 786  *                      type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 787  *                      update the types of bounds in (first,last) with
 788  *                      type VMUSAGE_BOUND_UNKNOWN.
 789  *
 790  * For the list of bounds (first,last), this function updates any bounds
 791  * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
 792  * the list (new_first, new_last).
 793  *
 794  * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 795  * (new_first, new_last), it will be split into multiple bounds.
 796  *
 797  * Return value:
 798  *      The number of pages in the list of bounds (first,last) that were of
 799  *      type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 800  *      VMUSAGE_BOUND_INCORE.
 801  *
 802  */
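/*
 * Worked example (illustrative only): if (first, last) is a single UNKNOWN
 * bound [10, 19] and (new_first, new_last) holds [10, 14] INCORE followed by
 * [15, 19] NOT_INCORE, the UNKNOWN bound is split into [10, 14] INCORE and
 * [15, 19] NOT_INCORE, and 5 is returned for the pages found incore.
 */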
 803 static pgcnt_t
 804 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
 805     avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
 806 {
 807         vmu_bound_t *next, *new_next, *tmp;
 808         pgcnt_t rss = 0;
 809 
 810         next = *first;
 811         new_next = new_first;
 812 
 813         /*
 814          * Verify first and last bound are covered by new bounds if they
 815          * have unknown type.
 816          */
 817         ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
 818             (*first)->vmb_start >= new_first->vmb_start);
 819         ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
 820             (*last)->vmb_end <= new_last->vmb_end);
 821         for (;;) {
 822                 /* If bound already has type, proceed to next bound. */
 823                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 824                         if (next == *last)
 825                                 break;
 826                         next = AVL_NEXT(tree, next);
 827                         continue;
 828                 }
 829                 while (new_next->vmb_end < next->vmb_start)
 830                         new_next = AVL_NEXT(new_tree, new_next);
 831                 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
 832                 next->vmb_type = new_next->vmb_type;
 833                 if (new_next->vmb_end < next->vmb_end) {
 834                         /* need to split bound */
 835                         tmp = vmu_alloc_bound();
 836                         tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
 837                         tmp->vmb_start = new_next->vmb_end + 1;
 838                         tmp->vmb_end = next->vmb_end;
 839                         avl_insert_here(tree, tmp, next, AVL_AFTER);
 840                         next->vmb_end = new_next->vmb_end;
 841                         if (*last == next)
 842                                 *last = tmp;
 843                         if (next->vmb_type == VMUSAGE_BOUND_INCORE)
 844                                 rss += next->vmb_end - next->vmb_start + 1;
 845                         next = tmp;
 846                 } else {
 847                         if (next->vmb_type == VMUSAGE_BOUND_INCORE)
 848                                 rss += next->vmb_end - next->vmb_start + 1;
 849                         if (next == *last)
 850                                 break;
 851                         next = AVL_NEXT(tree, next);
 852                 }
 853         }
 854         return (rss);
 855 }
 856 
 857 /*
 858  * Merges adjacent bounds with the same type between the first and last
 859  * bounds.  After the merge, the last pointer may point to a different
 860  * bound, as the (incoming) last bound may have been merged away.
 861  */
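/*
 * For example (illustrative only), adjacent bounds [0, 4] INCORE and
 * [5, 9] INCORE are merged into a single bound [0, 9] INCORE, while
 * [0, 4] INCORE followed by [6, 9] INCORE is left alone because the two
 * ranges are not adjacent.
 */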
 862 static void
 863 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
 864 {
 865         vmu_bound_t *current;
 866         vmu_bound_t *next;
 867 
 868         ASSERT(tree != NULL);
 869         ASSERT(*first != NULL);
 870         ASSERT(*last != NULL);
 871 
 872         current = *first;
 873         while (current != *last) {
 874                 next = AVL_NEXT(tree, current);
 875                 if ((current->vmb_end + 1) == next->vmb_start &&
 876                     current->vmb_type == next->vmb_type) {
 877                         current->vmb_end = next->vmb_end;
 878                         avl_remove(tree, next);
 879                         vmu_free_bound(next);
 880                         if (next == *last) {
 881                                 *last = current;
 882                         }
 883                 } else {
 884                         current = AVL_NEXT(tree, current);
 885                 }
 886         }
 887 }
 888 
 889 /*
 890  * Given an amp and a list of bounds, updates each bound's type with
 891  * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 892  *
 893  * If a bound is partially incore, it will be split into two bounds.
 894  * first and last may be modified, as bounds may be split into multiple
 895  * bounds if they are partially incore/not-incore.
 896  *
 897  * Pass incore as B_TRUE if the bounds are already known to be incore.
 898  *
 899  */
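/*
 * Worked example of the large-page index arithmetic below (illustrative
 * only): for an 8-page large page, pgcnt is 8 and pgmsk is 7, so an index
 * of 13 advances to (13 & ~7) + 8 = 16, i.e. the first page of the next
 * large page.
 */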
 900 static void
 901 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
 902     vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
 903 {
 904         vmu_bound_t *next;
 905         vmu_bound_t *tmp;
 906         pgcnt_t index;
 907         short bound_type;
 908         short page_type;
 909         vnode_t *vn;
 910         anoff_t off;
 911         struct anon *ap;
 912 
 913         next = *first;
 914         /* Shared anon slots don't change once set. */
 915         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 916         for (;;) {
 917                 if (incore == B_TRUE)
 918                         next->vmb_type = VMUSAGE_BOUND_INCORE;
 919 
 920                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
 921                         if (next == *last)
 922                                 break;
 923                         next = AVL_NEXT(tree, next);
 924                         continue;
 925                 }
 926 
 927                 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
 928                 bound_type = next->vmb_type;
 929                 index = next->vmb_start;
 930                 while (index <= next->vmb_end) {
 931 
 932                         /*
 933                          * These are used to determine how much to increment
 934                          * index when a large page is found.
 935                          */
 936                         page_t *page;
 937                         pgcnt_t pgcnt = 1;
 938                         uint_t pgshft;
 939                         pgcnt_t pgmsk;
 940 
 941                         ap = anon_get_ptr(amp->ahp, index);
 942                         if (ap != NULL)
 943                                 swap_xlate(ap, &vn, &off);
 944 
 945                         if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
 946                             (page = page_exists(vn, off)) != NULL) {
 947                                 if (PP_ISFREE(page))
 948                                         page_type = VMUSAGE_BOUND_NOT_INCORE;
 949                                 else
 950                                         page_type = VMUSAGE_BOUND_INCORE;
 951                                 if (page->p_szc > 0) {
 952                                         pgcnt = page_get_pagecnt(page->p_szc);
 953                                         pgshft = page_get_shift(page->p_szc);
 954                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
 955                                             - 1;
 956                                 }
 957                         } else {
 958                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
 959                         }
 960 
 961                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
 962                                 next->vmb_type = page_type;
 963                                 bound_type = page_type;
 964                         } else if (next->vmb_type != page_type) {
 965                                 /*
 966                                  * If current bound type does not match page
 967                                  * type, need to split off new bound.
 968                                  */
 969                                 tmp = vmu_alloc_bound();
 970                                 tmp->vmb_type = page_type;
 971                                 tmp->vmb_start = index;
 972                                 tmp->vmb_end = next->vmb_end;
 973                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
 974                                 next->vmb_end = index - 1;
 975                                 if (*last == next)
 976                                         *last = tmp;
 977                                 next = tmp;
 978                         }
 979                         if (pgcnt > 1) {
 980                                 /*
 981                                  * If inside large page, jump to next large
 982                                  * page
 983                                  */
 984                                 index = (index & ~pgmsk) + pgcnt;
 985                         } else {
 986                                 index++;
 987                         }
 988                 }
 989                 if (next == *last) {
 990                         ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
 991                         break;
 992                 } else
 993                         next = AVL_NEXT(tree, next);
 994         }
 995         ANON_LOCK_EXIT(&amp->a_rwlock);
 996 }
 997 
 998 /*
 999  * Same as vmu_amp_update_incore_bounds(), except for tracking
1000  * incore-/not-incore for vnodes.
1001  */
1002 static void
1003 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
1004     vmu_bound_t **first, vmu_bound_t **last)
1005 {
1006         vmu_bound_t *next;
1007         vmu_bound_t *tmp;
1008         pgcnt_t index;
1009         short bound_type;
1010         short page_type;
1011 
1012         next = *first;
1013         for (;;) {
1014                 if (vnode->v_pages == NULL)
1015                         next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1016 
1017                 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1018                         if (next == *last)
1019                                 break;
1020                         next = AVL_NEXT(tree, next);
1021                         continue;
1022                 }
1023 
1024                 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1025                 bound_type = next->vmb_type;
1026                 index = next->vmb_start;
1027                 while (index <= next->vmb_end) {
1028 
1029                         /*
1030                          * These are used to determine how much to increment
1031                          * index when a large page is found.
1032                          */
1033                         page_t *page;
1034                         pgcnt_t pgcnt = 1;
1035                         uint_t pgshft;
1036                         pgcnt_t pgmsk;
1037 
1038                         if (vnode->v_pages != NULL &&
1039                             (page = page_exists(vnode, ptob(index))) != NULL) {
1040                                 if (PP_ISFREE(page))
1041                                         page_type = VMUSAGE_BOUND_NOT_INCORE;
1042                                 else
1043                                         page_type = VMUSAGE_BOUND_INCORE;
1044                                 if (page->p_szc > 0) {
1045                                         pgcnt = page_get_pagecnt(page->p_szc);
1046                                         pgshft = page_get_shift(page->p_szc);
1047                                         pgmsk = (0x1 << (pgshft - PAGESHIFT))
1048                                             - 1;
1049                                 }
1050                         } else {
1051                                 page_type = VMUSAGE_BOUND_NOT_INCORE;
1052                         }
1053 
1054                         if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1055                                 next->vmb_type = page_type;
1056                                 bound_type = page_type;
1057                         } else if (next->vmb_type != page_type) {
1058                                 /*
1059                                  * If current bound type does not match page
1060                                  * type, need to split off new bound.
1061                                  */
1062                                 tmp = vmu_alloc_bound();
1063                                 tmp->vmb_type = page_type;
1064                                 tmp->vmb_start = index;
1065                                 tmp->vmb_end = next->vmb_end;
1066                                 avl_insert_here(tree, tmp, next, AVL_AFTER);
1067                                 next->vmb_end = index - 1;
1068                                 if (*last == next)
1069                                         *last = tmp;
1070                                 next = tmp;
1071                         }
1072                         if (pgcnt > 1) {
1073                                 /*
1074                                  * If inside large page, jump to next large
1075                                  * page
1076                                  */
1077                                 index = (index & ~pgmsk) + pgcnt;
1078                         } else {
1079                                 index++;
1080                         }
1081                 }
1082                 if (next == *last) {
1083                         ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1084                         break;
1085                 } else
1086                         next = AVL_NEXT(tree, next);
1087         }
1088 }
1089 
1090 /*
1091  * Calculate the rss and swap consumed by a segment.  vmu_entities is the
1092  * list of entities to visit.  For shared segments, the vnode or amp
1093  * is looked up in each entity to see if it has already been counted.  Private
1094  * anon pages are checked per entity to ensure that COW pages are not
1095  * double counted.
1096  *
1097  * For private mapped files, first the amp is checked for private pages.
1098  * Bounds not backed by the amp are looked up in the vnode for each entity
1099  * to avoid double counting of private COW vnode pages.
1100  */
1101 static void
1102 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1103 {
1104         struct segvn_data *svd;
1105         struct shm_data *shmd;
1106         struct spt_data *sptd;
1107         vmu_object_t *shared_object = NULL;
1108         vmu_object_t *entity_object = NULL;
1109         vmu_entity_t *entity;
1110         vmusage_t *result;
1111         vmu_bound_t *first = NULL;
1112         vmu_bound_t *last = NULL;
1113         vmu_bound_t *cur = NULL;
1114         vmu_bound_t *e_first = NULL;
1115         vmu_bound_t *e_last = NULL;
1116         vmu_bound_t *tmp;
1117         pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1118         struct anon_map *private_amp = NULL;
1119         boolean_t incore = B_FALSE;
1120         boolean_t shared = B_FALSE;
1121         int file = 0;
1122         pgcnt_t swresv = 0;
1123         pgcnt_t panon = 0;
1124 
1125         /* Can zero-length segments exist?  Not sure, so paranoia. */
1126         if (seg->s_size <= 0)
1127                 return;
1128 
1129         /*
1130          * Figure out if there is a shared object (such as a named vnode or
1131          * a shared amp); then figure out if there is a private amp, which
1132          * identifies private pages.
1133          */
1134         if (seg->s_ops == &segvn_ops) {
1135                 svd = (struct segvn_data *)seg->s_data;
1136                 if (svd->type == MAP_SHARED) {
1137                         shared = B_TRUE;
1138                 } else {
1139                         swresv = svd->swresv;
1140 
1141                         if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1142                             RW_READER) != 0) {
1143                                 /*
1144                                  * Text replication anon maps can be shared
1145                                  * across all zones. Space used for text
1146                                  * replication is typically capped as a small %
1147                                  * of memory.  To keep it simple for now we
1148                                  * don't account for swap and memory space used
1149                                  * for text replication.
1150                                  */
1151                                 if (svd->tr_state == SEGVN_TR_OFF &&
1152                                     svd->amp != NULL) {
1153                                         private_amp = svd->amp;
1154                                         p_start = svd->anon_index;
1155                                         p_end = svd->anon_index +
1156                                             btop(seg->s_size) - 1;
1157                                 }
1158                                 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1159                         }
1160                 }
1161                 if (svd->vp != NULL) {
1162                         file = 1;
1163                         shared_object = vmu_find_insert_object(
1164                             vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1165                             VMUSAGE_TYPE_VNODE);
1166                         s_start = btop(svd->offset);
1167                         s_end = btop(svd->offset + seg->s_size) - 1;
1168                 }
1169                 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1170                         ASSERT(shared_object == NULL);
1171                         shared_object = vmu_find_insert_object(
1172                             vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1173                             VMUSAGE_TYPE_AMP);
1174                         s_start = svd->anon_index;
1175                         s_end = svd->anon_index + btop(seg->s_size) - 1;
1176                         /* schedctl mappings are always in core */
1177                         if (svd->amp->swresv == 0)
1178                                 incore = B_TRUE;
1179                 }
1180         } else if (seg->s_ops == &segspt_shmops) {
1181                 shared = B_TRUE;
1182                 shmd = (struct shm_data *)seg->s_data;
1183                 shared_object = vmu_find_insert_object(
1184                     vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1185                     VMUSAGE_TYPE_AMP);
1186                 s_start = 0;
1187                 s_end = btop(seg->s_size) - 1;
1188                 sptd = shmd->shm_sptseg->s_data;
1189 
1190                 /* ISM segments are always incore and do not reserve swap */
1191                 if (sptd->spt_flags & SHM_SHARE_MMU)
1192                         incore = B_TRUE;
1193 
1194         } else {
1195                 return;
1196         }
1197 
1198         /*
1199          * If there is a private amp, count anon pages that exist.  If an
1200          * anon has a refcnt > 1 (COW sharing), then save the anon in a
1201          * hash so that it is not double counted.
1202          *
1203          * If there is also a shared object, then figure out the bounds
1204          * which are not mapped by the private amp.
1205          */
1206         if (private_amp != NULL) {
1207 
1208                 /* Enter as writer to prevent COW anons from being freed */
1209                 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1210 
1211                 p_index = p_start;
1212                 s_index = s_start;
1213 
1214                 while (p_index <= p_end) {
1215 
1216                         pgcnt_t p_index_next;
1217                         pgcnt_t p_bound_size;
1218                         int cnt;
1219                         anoff_t off;
1220                         struct vnode *vn;
1221                         struct anon *ap;
1222                         page_t *page;           /* For handling of large */
1223                         pgcnt_t pgcnt = 1;      /* pages */
1224                         pgcnt_t pgstart;
1225                         pgcnt_t pgend;
1226                         uint_t pgshft;
1227                         pgcnt_t pgmsk;
1228 
1229                         p_index_next = p_index;
1230                         ap = anon_get_next_ptr(private_amp->ahp,
1231                             &p_index_next);
1232 
1233                         /*
1234                          * If next anon is past end of mapping, simulate
1235                          * end of anon so loop terminates.
1236                          */
1237                         if (p_index_next > p_end) {
1238                                 p_index_next = p_end + 1;
1239                                 ap = NULL;
1240                         }
1241                         /*
1242                          * For COW segments, keep track of bounds not
1243                          * backed by private amp so they can be looked
1244                          * up in the backing vnode
1245                          */
1246                         if (p_index_next != p_index) {
1247 
1248                                 /*
1249                                  * Compute index difference between anon and
1250                                  * previous anon.
1251                                  */
1252                                 p_bound_size = p_index_next - p_index - 1;
1253 
1254                                 if (shared_object != NULL) {
1255                                         cur = vmu_alloc_bound();
1256                                         cur->vmb_start = s_index;
1257                                         cur->vmb_end = s_index + p_bound_size;
1258                                         cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1259                                         if (first == NULL) {
1260                                                 first = cur;
1261                                                 last = cur;
1262                                         } else {
1263                                                 last->vmb_next = cur;
1264                                                 last = cur;
1265                                         }
1266                                 }
1267                                 p_index = p_index + p_bound_size + 1;
1268                                 s_index = s_index + p_bound_size + 1;
1269                         }
1270 
1271                         /* Detect end of anons in amp */
1272                         if (ap == NULL)
1273                                 break;
1274 
1275                         cnt = ap->an_refcnt;
1276                         swap_xlate(ap, &vn, &off);
1277 
1278                         if (vn == NULL || vn->v_pages == NULL ||
1279                             (page = page_exists(vn, off)) == NULL) {
1280                                 p_index++;
1281                                 s_index++;
1282                                 continue;
1283                         }
1284 
1285                         /*
1286                          * If a large page is found, compute the portion of
1287                          * the large page in the mapping, and increment the
1288                          * indices to the next large page.
1289                          */
1290                         if (page->p_szc > 0) {
1291 
1292                                 pgcnt = page_get_pagecnt(page->p_szc);
1293                                 pgshft = page_get_shift(page->p_szc);
1294                                 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
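                                     /*
                                      * As an illustration: with 4K base pages and
                                      * a 2MB large page, pgshft is 21, PAGESHIFT
                                      * is 12, pgmsk is 0x1ff, and pgstart below is
                                      * p_index rounded down to a 512-page boundary.
                                      */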
1295 
1296                                 /* First page in large page */
1297                                 pgstart = p_index & ~pgmsk;
1298                                 /* Last page in large page */
1299                                 pgend = pgstart + pgcnt - 1;
1300                                 /*
1301                                  * Artificially end the large page if it
1302                                  * extends past the end of the mapping.
1303                                  */
1304                                 if (pgend > p_end)
1305                                         pgend = p_end;
1306 
1307                                 /*
1308                                  * Compute number of pages from large page
1309                                  * which are mapped.
1310                                  */
1311                                 pgcnt = pgend - p_index + 1;
1312 
1313                                 /*
1314                                  * Point indices at the page after the large
1315                                  * page, or just past the end of the mapping.
1316                                  */
1317                                 p_index += pgcnt;
1318                                 s_index += pgcnt;
1319                         } else {
1320                                 p_index++;
1321                                 s_index++;
1322                         }
1323 
1324                         /*
1325                          * Pages on the free list aren't counted for the rss.
1326                          */
1327                         if (PP_ISFREE(page))
1328                                 continue;
1329 
1330                         /*
1331                          * Assume anon structs with a refcnt
1332                          * of 1 are not COW shared, so there
1333                          * is no reason to track them per entity.
1334                          */
1335                         if (cnt == 1) {
1336                                 panon += pgcnt;
1337                                 continue;
1338                         }
1339                         for (entity = vmu_entities; entity != NULL;
1340                             entity = entity->vme_next_calc) {
1341 
1342                                 result = &entity->vme_result;
1343                                 /*
1344                                  * Track COW anons per entity so
1345                                  * they are not double counted.
1346                                  */
1347                                 if (vmu_find_insert_anon(entity->vme_anon_hash,
1348                                     (caddr_t)ap) == 0)
1349                                         continue;
1350 
1351                                 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1352                                 result->vmu_rss_private +=
1353                                     (pgcnt << PAGESHIFT);
1354                         }
1355                 }
1356                 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1357         }
1358 
1359         /* Add up resident anon and swap reserved for private mappings */
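             /*
              * panon counts private anon pages (refcnt 1) in units of pages;
              * the PAGESHIFT shifts below convert it to bytes.  Since such
              * pages are not shared, each entity covering this process is
              * charged the full amount.
              */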
1360         if (swresv > 0 || panon > 0) {
1361                 for (entity = vmu_entities; entity != NULL;
1362                     entity = entity->vme_next_calc) {
1363                         result = &entity->vme_result;
1364                         result->vmu_swap_all += swresv;
1365                         result->vmu_swap_private += swresv;
1366                         result->vmu_rss_all += (panon << PAGESHIFT);
1367                         result->vmu_rss_private += (panon << PAGESHIFT);
1368                 }
1369         }
1370 
1371         /* Compute resident pages backing shared amp or named vnode */
1372         if (shared_object != NULL) {
1373                 avl_tree_t *tree = &(shared_object->vmo_bounds);
1374 
1375                 if (first == NULL) {
1376                         /*
1377                          * No private amp, or private amp has no anon
1378                          * structs.  This means that the entire segment is
1379                          * backed by the shared object.
1380                          */
1381                         first = vmu_alloc_bound();
1382                         first->vmb_start = s_start;
1383                         first->vmb_end = s_end;
1384                         first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1385                 }
1386                 /*
1387                  * Iterate over the bounds not backed by the private amp,
1388                  * and compute resident pages.
1389                  */
1390                 cur = first;
1391                 while (cur != NULL) {
1392 
1393                         if (vmu_insert_lookup_object_bounds(shared_object,
1394                             cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1395                             &first, &last) > 0) {
1396                                 /* new bounds, find incore/not-incore */
1397                                 if (shared_object->vmo_type ==
1398                                     VMUSAGE_TYPE_VNODE) {
1399                                         vmu_vnode_update_incore_bounds(
1400                                             tree,
1401                                             (vnode_t *)
1402                                             shared_object->vmo_key, &first,
1403                                             &last);
1404                                 } else {
1405                                         vmu_amp_update_incore_bounds(
1406                                             tree,
1407                                             (struct anon_map *)
1408                                             shared_object->vmo_key, &first,
1409                                             &last, incore);
1410                                 }
1411                                 vmu_merge_bounds(tree, &first, &last);
1412                         }
1413                         for (entity = vmu_entities; entity != NULL;
1414                             entity = entity->vme_next_calc) {
1415                                 avl_tree_t *e_tree;
1416 
1417                                 result = &entity->vme_result;
1418 
1419                                 entity_object = vmu_find_insert_object(
1420                                     shared_object->vmo_type ==
1421                                     VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1422                                     entity->vme_amp_hash,
1423                                     shared_object->vmo_key,
1424                                     shared_object->vmo_type);
1425 
1426                                 virt = vmu_insert_lookup_object_bounds(
1427                                     entity_object, cur->vmb_start, cur->vmb_end,
1428                                     VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1429 
1430                                 if (virt == 0)
1431                                         continue;
1432                                 /*
1433                                  * Range visited for this entity
1434                                  */
1435                                 e_tree = &(entity_object->vmo_bounds);
1436                                 rss = vmu_update_bounds(e_tree, &e_first,
1437                                     &e_last, tree, first, last);
1438                                 result->vmu_rss_all += (rss << PAGESHIFT);
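                                     /*
                                      * Below, swap for a shared anon mapping is
                                      * charged by virtual size (virt), while rss
                                      * is charged by the resident pages found
                                      * above.
                                      */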
1439                                 if (shared == B_TRUE && file == B_FALSE) {
1440                                         /* shared anon mapping */
1441                                         result->vmu_swap_all +=
1442                                             (virt << PAGESHIFT);
1443                                         result->vmu_swap_shared +=
1444                                             (virt << PAGESHIFT);
1445                                         result->vmu_rss_shared +=
1446                                             (rss << PAGESHIFT);
1447                                 } else if (shared == B_TRUE && file == B_TRUE) {
1448                                         /* shared file mapping */
1449                                         result->vmu_rss_shared +=
1450                                             (rss << PAGESHIFT);
1451                                 } else if (shared == B_FALSE &&
1452                                     file == B_TRUE) {
1453                                         /* private file mapping */
1454                                         result->vmu_rss_private +=
1455                                             (rss << PAGESHIFT);
1456                                 }
1457                                 vmu_merge_bounds(e_tree, &e_first, &e_last);
1458                         }
1459                         tmp = cur;
1460                         cur = cur->vmb_next;
1461                         vmu_free_bound(tmp);
1462                 }
1463         }
1464 }
1465 
1466 /*
1467  * Based on the current calculation flags, find the entities which are
1468  * relevant to the process.  Then calculate each segment in the
1469  * process's address space for each relevant entity.
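      *
      * For example (illustrative only), when VMUSAGE_ALL_ZONES and
      * VMUSAGE_PROJECTS are requested, a process in zone 5, project 10 is
      * charged to two entities: the zone 5 entity and the project 10 entity
      * within zone 5.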
1470  */
1471 static void
1472 vmu_calculate_proc(proc_t *p)
1473 {
1474         vmu_entity_t *entities = NULL;
1475         vmu_zone_t *zone;
1476         vmu_entity_t *tmp;
1477         struct as *as;
1478         struct seg *seg;
1479         int ret;
1480 
1481         /* Figure out which entities are being computed */
1482         if ((vmu_data.vmu_system) != NULL) {
1483                 tmp = vmu_data.vmu_system;
1484                 tmp->vme_next_calc = entities;
1485                 entities = tmp;
1486         }
1487         if (vmu_data.vmu_calc_flags &
1488             (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1489             VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1490             VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1491             VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1492             VMUSAGE_ALL_EUSERS)) {
1493                 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1494                     (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1495                     (mod_hash_val_t *)&zone);
1496                 if (ret != 0) {
1497                         zone = vmu_alloc_zone(p->p_zone->zone_id);
1498                         ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1499                             (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1500                             (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1501                         ASSERT(ret == 0);
1502                 }
1503                 if (zone->vmz_zone != NULL) {
1504                         tmp = zone->vmz_zone;
1505                         tmp->vme_next_calc = entities;
1506                         entities = tmp;
1507                 }
1508                 if (vmu_data.vmu_calc_flags &
1509                     (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1510                         tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1511                             p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1512                             zone->vmz_id);
1513                         tmp->vme_next_calc = entities;
1514                         entities = tmp;
1515                 }
1516                 if (vmu_data.vmu_calc_flags &
1517                     (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1518                         tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1519                             p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1520                         tmp->vme_next_calc = entities;
1521                         entities = tmp;
1522                 }
1523                 if (vmu_data.vmu_calc_flags &
1524                     (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1525                         tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1526                             crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1527                         tmp->vme_next_calc = entities;
1528                         entities = tmp;
1529                 }
1530                 if (vmu_data.vmu_calc_flags &
1531                     (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1532                         tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1533                             crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1534                         tmp->vme_next_calc = entities;
1535                         entities = tmp;
1536                 }
1537         }
1538         /* Entities which collapse projects and users for all zones */
1539         if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1540                 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1541                     p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1542                 tmp->vme_next_calc = entities;
1543                 entities = tmp;
1544         }
1545         if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1546                 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1547                     crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1548                 tmp->vme_next_calc = entities;
1549                 entities = tmp;
1550         }
1551         if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1552                 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1553                     crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1554                 tmp->vme_next_calc = entities;
1555                 entities = tmp;
1556         }
1557 
1558         ASSERT(entities != NULL);
1559         /* process all segs in process's address space */
1560         as = p->p_as;
1561         AS_LOCK_ENTER(as, RW_READER);
1562         for (seg = AS_SEGFIRST(as); seg != NULL;
1563             seg = AS_SEGNEXT(as, seg)) {
1564                 vmu_calculate_seg(entities, seg);
1565         }
1566         AS_LOCK_EXIT(as);
1567 }
1568 
1569 /*
1570  * Free data created by previous call to vmu_calculate().
1571  */
1572 static void
1573 vmu_clear_calc()
1574 {
1575         if (vmu_data.vmu_system != NULL)
1576                 vmu_free_entity(vmu_data.vmu_system);
1577         vmu_data.vmu_system = NULL;
1578         if (vmu_data.vmu_zones_hash != NULL)
1579                 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1580         if (vmu_data.vmu_projects_col_hash != NULL)
1581                 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1582         if (vmu_data.vmu_rusers_col_hash != NULL)
1583                 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1584         if (vmu_data.vmu_eusers_col_hash != NULL)
1585                 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1586 
1587         i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1588         i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1589 }
1590 
1591 /*
1592  * Free unused data structures.  These can result if the system workload
1593  * decreases between calculations.
1594  */
1595 static void
1596 vmu_free_extra()
1597 {
1598         vmu_bound_t *tb;
1599         vmu_object_t *to;
1600         vmu_entity_t *te;
1601         vmu_zone_t *tz;
1602 
1603         while (vmu_data.vmu_free_bounds != NULL) {
1604                 tb = vmu_data.vmu_free_bounds;
1605                 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1606                 kmem_cache_free(vmu_bound_cache, tb);
1607         }
1608         while (vmu_data.vmu_free_objects != NULL) {
1609                 to = vmu_data.vmu_free_objects;
1610                 vmu_data.vmu_free_objects =
1611                     vmu_data.vmu_free_objects->vmo_next;
1612                 kmem_cache_free(vmu_object_cache, to);
1613         }
1614         while (vmu_data.vmu_free_entities != NULL) {
1615                 te = vmu_data.vmu_free_entities;
1616                 vmu_data.vmu_free_entities =
1617                     vmu_data.vmu_free_entities->vme_next;
1618                 if (te->vme_vnode_hash != NULL)
1619                         mod_hash_destroy_hash(te->vme_vnode_hash);
1620                 if (te->vme_amp_hash != NULL)
1621                         mod_hash_destroy_hash(te->vme_amp_hash);
1622                 if (te->vme_anon_hash != NULL)
1623                         mod_hash_destroy_hash(te->vme_anon_hash);
1624                 kmem_free(te, sizeof (vmu_entity_t));
1625         }
1626         while (vmu_data.vmu_free_zones != NULL) {
1627                 tz = vmu_data.vmu_free_zones;
1628                 vmu_data.vmu_free_zones =
1629                     vmu_data.vmu_free_zones->vmz_next;
1630                 if (tz->vmz_projects_hash != NULL)
1631                         mod_hash_destroy_hash(tz->vmz_projects_hash);
1632                 if (tz->vmz_tasks_hash != NULL)
1633                         mod_hash_destroy_hash(tz->vmz_tasks_hash);
1634                 if (tz->vmz_rusers_hash != NULL)
1635                         mod_hash_destroy_hash(tz->vmz_rusers_hash);
1636                 if (tz->vmz_eusers_hash != NULL)
1637                         mod_hash_destroy_hash(tz->vmz_eusers_hash);
1638                 kmem_free(tz, sizeof (vmu_zone_t));
1639         }
1640 }
1641 
1642 extern kcondvar_t *pr_pid_cv;
1643 
1644 /*
1645  * Determine which entity types are relevant and allocate the hashes to
1646  * track them.  Then walk the process table and count rss and swap
1647  * for each process's address space.  Address space objects such as
1648  * vnodes, amps and anons are tracked per entity, so that they are
1649  * not double counted in the results.
1650  *
1651  */
1652 static void
1653 vmu_calculate()
1654 {
1655         int i = 0;
1656         int ret;
1657         proc_t *p;
1658 
1659         vmu_clear_calc();
1660 
1661         if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1662                 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1663                     ALL_ZONES);
1664 
1665         /*
1666          * Walk process table and calculate rss of each proc.
1667          *
1668          * Pidlock and p_lock cannot be held while doing the rss calculation.
1669          * This is because:
1670          *      1.  The calculation allocates using KM_SLEEP.
1671          *      2.  The calculation grabs a_lock, which cannot be grabbed
1672          *          after p_lock.
1673          *
1674          * Since pidlock must be dropped, we cannot simply walk the
1675          * practive list.  Instead, we walk the process table and sprlock
1676          * each process to ensure that it does not exit during the
1677          * calculation.
1678          */
1679 
1680         mutex_enter(&pidlock);
1681         for (i = 0; i < v.v_proc; i++) {
1682 again:
1683                 p = pid_entry(i);
1684                 if (p == NULL)
1685                         continue;
1686 
1687                 mutex_enter(&p->p_lock);
1688                 mutex_exit(&pidlock);
1689 
1690                 if (panicstr) {
1691                         mutex_exit(&p->p_lock);
1692                         return;
1693                 }
1694 
1695                 /* Try to set P_PR_LOCK */
1696                 ret = sprtrylock_proc(p);
1697                 if (ret == -1) {
1698                         /* Process in invalid state */
1699                         mutex_exit(&p->p_lock);
1700                         mutex_enter(&pidlock);
1701                         continue;
1702                 } else if (ret == 1) {
1703                         /*
1704                          * P_PR_LOCK is already set.  Wait and try again.
1705                          * This also drops p_lock.
1706                          */
1707                         sprwaitlock_proc(p);
1708                         mutex_enter(&pidlock);
1709                         goto again;
1710                 }
1711                 mutex_exit(&p->p_lock);
1712 
1713                 vmu_calculate_proc(p);
1714 
1715                 mutex_enter(&p->p_lock);
1716                 sprunlock(p);
1717                 mutex_enter(&pidlock);
1718         }
1719         mutex_exit(&pidlock);
1720 
1721         vmu_free_extra();
1722 }
1723 
1724 /*
1725  * Allocate a new cache that can hold nres results satisfying flags.
1726  */
1727 vmu_cache_t *
1728 vmu_cache_alloc(size_t nres, uint_t flags)
1729 {
1730         vmu_cache_t *cache;
1731 
1732         cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1733         cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1734         cache->vmc_nresults = nres;
1735         cache->vmc_flags = flags;
1736         cache->vmc_refcnt = 1;
1737         return (cache);
1738 }
1739 
1740 /*
1741  * Take a hold on the cache so that the cached results are not freed.
1742  */
1743 static void
1744 vmu_cache_hold(vmu_cache_t *cache)
1745 {
1746         ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1747         cache->vmc_refcnt++;
1748 }
1749 
1750 /*
1751  * Release a hold on the cache, freeing it when the last hold is dropped.
1752  */
1753 static void
1754 vmu_cache_rele(vmu_cache_t *cache)
1755 {
1756         ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1757         ASSERT(cache->vmc_refcnt > 0);
1758         cache->vmc_refcnt--;
1759         if (cache->vmc_refcnt == 0) {
1760                 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1761                     cache->vmc_nresults);
1762                 kmem_free(cache, sizeof (vmu_cache_t));
1763         }
1764 }
1765 
1766 /*
1767  * When new data is calculated, update the phys_mem rctl usage value in the
1768  * zones.
1769  */
1770 static void
1771 vmu_update_zone_rctls(vmu_cache_t *cache)
1772 {
1773         vmusage_t       *rp;
1774         size_t          i = 0;
1775         zone_t          *zp;
1776 
1777         for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1778                 if (rp->vmu_type == VMUSAGE_ZONE &&
1779                     rp->vmu_zoneid != ALL_ZONES) {
1780                         if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1781                                 zp->zone_phys_mem = rp->vmu_rss_all;
1782                                 zone_rele(zp);
1783                         }
1784                 }
1785         }
1786 }
1787 
1788 /*
1789  * Copy out the cached results to a caller.  Inspect the caller's flags
1790  * and zone to determine which cached results should be copied.
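      *
      * Note that results for zones other than the caller's own zone are never
      * copied out to a non-global zone caller, regardless of the flags passed.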
1791  */
1792 static int
1793 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1794     uint_t flags, id_t req_zone_id, int cpflg)
1795 {
1796         vmusage_t *result, *out_result;
1797         vmusage_t dummy;
1798         size_t i, count = 0;
1799         size_t bufsize;
1800         int ret = 0;
1801         uint_t types = 0;
1802 
1803         if (nres != NULL) {
1804                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1805                         return (set_errno(EFAULT));
1806         } else {
1807                 bufsize = 0;
1808         }
1809 
1810         /* figure out what results the caller is interested in. */
1811         if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1812                 types |= VMUSAGE_SYSTEM;
1813         if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1814                 types |= VMUSAGE_ZONE;
1815         if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1816             VMUSAGE_COL_PROJECTS))
1817                 types |= VMUSAGE_PROJECTS;
1818         if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1819                 types |= VMUSAGE_TASKS;
1820         if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1821                 types |= VMUSAGE_RUSERS;
1822         if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1823                 types |= VMUSAGE_EUSERS;
1824 
1825         /* count results for current zone */
1826         out_result = buf;
1827         for (result = cache->vmc_results, i = 0;
1828             i < cache->vmc_nresults; result++, i++) {
1829 
1830                 /* Do not return "other-zone" results to non-global zones */
1831                 if (curproc->p_zone != global_zone &&
1832                     curproc->p_zone->zone_id != result->vmu_zoneid)
1833                         continue;
1834 
1835                 /*
1836                  * If a non-global zone requests VMUSAGE_SYSTEM, fake
1837                  * up its VMUSAGE_ZONE result as the VMUSAGE_SYSTEM result.
1838                  */
1839                 if (curproc->p_zone != global_zone &&
1840                     (flags & VMUSAGE_SYSTEM) != 0 &&
1841                     result->vmu_type == VMUSAGE_ZONE) {
1842                         count++;
1843                         if (out_result != NULL) {
1844                                 if (bufsize < count) {
1845                                         ret = set_errno(EOVERFLOW);
1846                                 } else {
1847                                         dummy = *result;
1848                                         dummy.vmu_zoneid = ALL_ZONES;
1849                                         dummy.vmu_id = 0;
1850                                         dummy.vmu_type = VMUSAGE_SYSTEM;
1851                                         if (ddi_copyout(&dummy, out_result,
1852                                             sizeof (vmusage_t), cpflg))
1853                                                 return (set_errno(EFAULT));
1854                                         out_result++;
1855                                 }
1856                         }
1857                 }
1858 
1859                 /* Skip results that do not match requested type */
1860                 if ((result->vmu_type & types) == 0)
1861                         continue;
1862 
1863                 /* Skip collated results if not requested */
1864                 if (result->vmu_zoneid == ALL_ZONES) {
1865                         if (result->vmu_type == VMUSAGE_PROJECTS &&
1866                             (flags & VMUSAGE_COL_PROJECTS) == 0)
1867                                 continue;
1868                         if (result->vmu_type == VMUSAGE_EUSERS &&
1869                             (flags & VMUSAGE_COL_EUSERS) == 0)
1870                                 continue;
1871                         if (result->vmu_type == VMUSAGE_RUSERS &&
1872                             (flags & VMUSAGE_COL_RUSERS) == 0)
1873                                 continue;
1874                 }
1875 
1876                 if (result->vmu_type == VMUSAGE_ZONE &&
1877                     flags & VMUSAGE_A_ZONE) {
1878                         /* Skip non-requested zone results */
1879                         if (result->vmu_zoneid != req_zone_id)
1880                                 continue;
1881                 } else {
1882                         /* Skip "other zone" results if not requested */
1883                         if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1884                                 if (result->vmu_type == VMUSAGE_ZONE &&
1885                                     (flags & VMUSAGE_ALL_ZONES) == 0)
1886                                         continue;
1887                                 if (result->vmu_type == VMUSAGE_PROJECTS &&
1888                                     (flags & (VMUSAGE_ALL_PROJECTS |
1889                                     VMUSAGE_COL_PROJECTS)) == 0)
1890                                         continue;
1891                                 if (result->vmu_type == VMUSAGE_TASKS &&
1892                                     (flags & VMUSAGE_ALL_TASKS) == 0)
1893                                         continue;
1894                                 if (result->vmu_type == VMUSAGE_RUSERS &&
1895                                     (flags & (VMUSAGE_ALL_RUSERS |
1896                                     VMUSAGE_COL_RUSERS)) == 0)
1897                                         continue;
1898                                 if (result->vmu_type == VMUSAGE_EUSERS &&
1899                                     (flags & (VMUSAGE_ALL_EUSERS |
1900                                     VMUSAGE_COL_EUSERS)) == 0)
1901                                         continue;
1902                         }
1903                 }
1904                 count++;
1905                 if (out_result != NULL) {
1906                         if (bufsize < count) {
1907                                 ret = set_errno(EOVERFLOW);
1908                         } else {
1909                                 if (ddi_copyout(result, out_result,
1910                                     sizeof (vmusage_t), cpflg))
1911                                         return (set_errno(EFAULT));
1912                                 out_result++;
1913                         }
1914                 }
1915         }
1916         if (nres != NULL)
1917                 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1918                         return (set_errno(EFAULT));
1919 
1920         return (ret);
1921 }
1922 
1923 /*
1924  * vm_getusage()
1925  *
1926  * Counts rss and swap by zone, project, task, and/or user.  The flags argument
1927  * determines the type of results structures returned.  Flags requesting
1928  * results from more than one zone are "flattened" to the local zone if the
1929  * caller is not the global zone.
1930  *
1931  * args:
1932  *      flags:  bitmap consisting of one or more of VMUSAGE_*.
1933  *      age:    maximum allowable age (time since counting was done) in
1934  *              seconds of the results.  Results from previous callers are
1935  *              cached in kernel.
1936  *      buf:    pointer to a buffer array of vmusage_t.  If NULL, then only
1937  *              nres is set on success.
1938  *      nres:   Set to the number of vmusage_t structures pointed to by buf
1939  *              before calling vm_getusage().
1940  *              On a return of 0 (success) or ENOSPC, it is set to the number
1941  *              of result structures returned or that would have been returned.
1942  *
1943  * returns 0 on success, -1 on failure:
1944  *      EINTR (interrupted)
1945  *      ENOSPC (nres too small for results; nres set to needed value for success)
1946  *      EINVAL (flags invalid)
1947  *      EFAULT (bad address for buf or nres)
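      *
      * When VMUSAGE_A_ZONE is requested, the caller passes the desired zone ID
      * in via the vmu_id field of the first vmusage_t in buf.
      *
      * Illustrative userland usage via the getvmusage(2) wrapper (a sketch
      * only; headers and error handling omitted):
      *
      *      size_t nres = 0;
      *      (void) getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres);
      *      vmusage_t *buf = malloc(nres * sizeof (vmusage_t));
      *      if (getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) != 0)
      *              ... on ENOSPC, nres holds the required count; retry ...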
1948  */
1949 int
1950 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1951 {
1952         vmu_entity_t *entity;
1953         vmusage_t *result;
1954         int ret = 0;
1955         int cacherecent = 0;
1956         hrtime_t now;
1957         uint_t flags_orig;
1958         id_t req_zone_id;
1959 
1960         /*
1961          * Non-global zones cannot request system-wide and/or collated
1962          * results, or the system result, or usage of another zone, so munge
1963          * the flags accordingly.
1964          */
1965         flags_orig = flags;
1966         if (curproc->p_zone != global_zone) {
1967                 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1968                         flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1969                         flags |= VMUSAGE_PROJECTS;
1970                 }
1971                 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1972                         flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1973                         flags |= VMUSAGE_RUSERS;
1974                 }
1975                 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1976                         flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1977                         flags |= VMUSAGE_EUSERS;
1978                 }
1979                 if (flags & VMUSAGE_SYSTEM) {
1980                         flags &= ~VMUSAGE_SYSTEM;
1981                         flags |= VMUSAGE_ZONE;
1982                 }
1983                 if (flags & VMUSAGE_A_ZONE) {
1984                         flags &= ~VMUSAGE_A_ZONE;
1985                         flags |= VMUSAGE_ZONE;
1986                 }
1987         }
1988 
1989         /* Check for unknown flags */
1990         if ((flags & (~VMUSAGE_MASK)) != 0)
1991                 return (set_errno(EINVAL));
1992 
1993         /* Check for no flags */
1994         if ((flags & VMUSAGE_MASK) == 0)
1995                 return (set_errno(EINVAL));
1996 
1997         /* If requesting results for a specific zone, get the zone ID */
1998         if (flags & VMUSAGE_A_ZONE) {
1999                 size_t bufsize;
2000                 vmusage_t zreq;
2001 
2002                 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2003                         return (set_errno(EFAULT));
2004                 /* Requested zone ID is passed in buf, so 0 len not allowed */
2005                 if (bufsize == 0)
2006                         return (set_errno(EINVAL));
2007                 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2008                         return (set_errno(EFAULT));
2009                 req_zone_id = zreq.vmu_id;
2010         }
2011 
2012         mutex_enter(&vmu_data.vmu_lock);
2013         now = gethrtime();
2014 
2015 start:
2016         if (vmu_data.vmu_cache != NULL) {
2017 
2018                 vmu_cache_t *cache;
2019 
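                     /*
                      * The cached results are usable if they are younger than
                      * the maximum age (in seconds) supplied by the caller.
                      */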
2020                 if ((vmu_data.vmu_cache->vmc_timestamp +
2021                     ((hrtime_t)age * NANOSEC)) > now)
2022                         cacherecent = 1;
2023 
2024                 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2025                     cacherecent == 1) {
2026                         cache = vmu_data.vmu_cache;
2027                         vmu_cache_hold(cache);
2028                         mutex_exit(&vmu_data.vmu_lock);
2029 
2030                         ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2031                             req_zone_id, cpflg);
2032                         mutex_enter(&vmu_data.vmu_lock);
2033                         vmu_cache_rele(cache);
2034                         if (vmu_data.vmu_pending_waiters > 0)
2035                                 cv_broadcast(&vmu_data.vmu_cv);
2036                         mutex_exit(&vmu_data.vmu_lock);
2037                         return (ret);
2038                 }
2039                 /*
2040                  * If the cache is recent, it is likely that there are other
2041                  * consumers of vm_getusage running, so add their flags to the
2042                  * desired flags for the calculation.
2043                  */
2044                 if (cacherecent == 1)
2045                         flags = vmu_data.vmu_cache->vmc_flags | flags;
2046         }
2047         if (vmu_data.vmu_calc_thread == NULL) {
2048 
2049                 vmu_cache_t *cache;
2050 
2051                 vmu_data.vmu_calc_thread = curthread;
2052                 vmu_data.vmu_calc_flags = flags;
2053                 vmu_data.vmu_entities = NULL;
2054                 vmu_data.vmu_nentities = 0;
2055                 if (vmu_data.vmu_pending_waiters > 0)
2056                         vmu_data.vmu_calc_flags |=
2057                             vmu_data.vmu_pending_flags;
2058 
2059                 vmu_data.vmu_pending_flags = 0;
2060                 mutex_exit(&vmu_data.vmu_lock);
2061                 vmu_calculate();
2062                 mutex_enter(&vmu_data.vmu_lock);
2063                 /* copy results to cache */
2064                 if (vmu_data.vmu_cache != NULL)
2065                         vmu_cache_rele(vmu_data.vmu_cache);
2066                 cache = vmu_data.vmu_cache =
2067                     vmu_cache_alloc(vmu_data.vmu_nentities,
2068                     vmu_data.vmu_calc_flags);
2069 
2070                 result = cache->vmc_results;
2071                 for (entity = vmu_data.vmu_entities; entity != NULL;
2072                     entity = entity->vme_next) {
2073                         *result = entity->vme_result;
2074                         result++;
2075                 }
2076                 cache->vmc_timestamp = gethrtime();
2077                 vmu_cache_hold(cache);
2078 
2079                 vmu_data.vmu_calc_flags = 0;
2080                 vmu_data.vmu_calc_thread = NULL;
2081 
2082                 if (vmu_data.vmu_pending_waiters > 0)
2083                         cv_broadcast(&vmu_data.vmu_cv);
2084 
2085                 mutex_exit(&vmu_data.vmu_lock);
2086 
2087                 /* update zone's phys. mem. rctl usage */
2088                 vmu_update_zone_rctls(cache);
2089                 /* copy cache */
2090                 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2091                     req_zone_id, cpflg);
2092                 mutex_enter(&vmu_data.vmu_lock);
2093                 vmu_cache_rele(cache);
2094                 mutex_exit(&vmu_data.vmu_lock);
2095 
2096                 return (ret);
2097         }
2098         vmu_data.vmu_pending_flags |= flags;
2099         vmu_data.vmu_pending_waiters++;
2100         while (vmu_data.vmu_calc_thread != NULL) {
2101                 if (cv_wait_sig(&vmu_data.vmu_cv,
2102                     &vmu_data.vmu_lock) == 0) {
2103                         vmu_data.vmu_pending_waiters--;
2104                         mutex_exit(&vmu_data.vmu_lock);
2105                         return (set_errno(EINTR));
2106                 }
2107         }
2108         vmu_data.vmu_pending_waiters--;
2109         goto start;
2110 }
2111 
2112 #if defined(__x86)
2113 /*
2114  * Attempt to invalidate all of the pages in the mapping for the given process.
2115  */
2116 static void
2117 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2118 {
2119         page_t          *pp;
2120         size_t          psize;
2121         u_offset_t      off;
2122         caddr_t         eaddr;
2123         struct vnode    *vp;
2124         struct segvn_data *svd;
2125         struct hat      *victim_hat;
2126 
2127         ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2128 
2129         victim_hat = p->p_as->a_hat;
2130         svd = (struct segvn_data *)seg->s_data;
2131         vp = svd->vp;
2132         psize = page_get_pagesize(seg->s_szc);
2133 
2134         off = svd->offset + (uintptr_t)(addr - seg->s_base);
2135 
2136         for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2137                 pp = page_lookup_nowait(vp, off, SE_SHARED);
2138 
2139                 if (pp != NULL) {
2140                         /* following logic based on pvn_getdirty() */
2141 
2142                         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2143                                 page_unlock(pp);
2144                                 continue;
2145                         }
2146 
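                             /*
                              * Unload the victim process's translations for
                              * this page.
                              */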
2147                         page_io_lock(pp);
2148                         hat_page_inval(pp, 0, victim_hat);
2149                         page_io_unlock(pp);
2150 
2151                         /*
2152                          * For B_INVALCURONLY-style handling we let
2153                          * page_release call VN_DISPOSE if no one else is using
2154                          * the page.
2155                          *
2156                          * A hat_ismod() check would be useless because:
2157                          * (1) we are not holding the SE_EXCL lock
2158                          * (2) we've not unloaded _all_ translations
2159                          *
2160                          * Let page_release() do the heavy-lifting.
2161                          */
2162                         (void) page_release(pp, 1);
2163                 }
2164         }
2165 }
2166 
2167 /*
2168  * vm_map_inval()
2169  *
2170  * Invalidate as many pages as possible within the given mapping for the given
2171  * process. addr is expected to be the base address of the mapping and size is
2172  * the length of the mapping. In some cases a mapping will encompass an
2173  * entire segment, but at least for anon or stack mappings, these will be
2174  * regions within a single large segment. Thus, the invalidation is oriented
2175  * around a single mapping and not an entire segment.
2176  *
2177  * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
2178  * this code is only applicable to x86.
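      *
      * Returns 0 on success (or when there is nothing to invalidate).  It fails
      * with EPERM unless the caller is global-zone root, EINVAL for an unaligned
      * address or a range extending past the segment, ESRCH if the process
      * cannot be found or locked, ENOMEM if no segment maps the address, or
      * with the error returned by the underlying SEGOP_SYNC().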
2179  */
2180 int
2181 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2182 {
2183         int ret;
2184         int error = 0;
2185         proc_t *p;              /* target proc */
2186         struct as *as;          /* target proc's address space */
2187         struct seg *seg;        /* working segment */
2188 
2189         if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2190                 return (set_errno(EPERM));
2191 
2192         /* The address must be page-aligned; otherwise return an error */
2193         if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2194                 return (set_errno(EINVAL));
2195 
2196 again:
2197         mutex_enter(&pidlock);
2198         p = prfind(pid);
2199         if (p == NULL) {
2200                 mutex_exit(&pidlock);
2201                 return (set_errno(ESRCH));
2202         }
2203 
2204         mutex_enter(&p->p_lock);
2205         mutex_exit(&pidlock);
2206 
2207         if (panicstr != NULL) {
2208                 mutex_exit(&p->p_lock);
2209                 return (0);
2210         }
2211 
2212         as = p->p_as;
2213 
2214         /*
2215          * Try to set P_PR_LOCK - prevents process "changing shape"
2216          * - blocks fork
2217          * - blocks sigkill
2218          * - cannot be a system proc
2219          * - must be fully created proc
2220          */
2221         ret = sprtrylock_proc(p);
2222         if (ret == -1) {
2223                 /* Process in invalid state */
2224                 mutex_exit(&p->p_lock);
2225                 return (set_errno(ESRCH));
2226         }
2227 
2228         if (ret == 1) {
2229                 /*
2230                  * P_PR_LOCK is already set. Wait and try again. This also
2231                  * drops p_lock so p may no longer be valid since the proc may
2232                  * have exited.
2233                  */
2234                 sprwaitlock_proc(p);
2235                 goto again;
2236         }
2237 
2238         /* P_PR_LOCK is now set */
2239         mutex_exit(&p->p_lock);
2240 
2241         AS_LOCK_ENTER(as, RW_READER);
2242         if ((seg = as_segat(as, addr)) == NULL) {
2243                 AS_LOCK_EXIT(as);
2244                 mutex_enter(&p->p_lock);
2245                 sprunlock(p);
2246                 return (set_errno(ENOMEM));
2247         }
2248 
2249         /*
2250          * The invalidation behavior only makes sense for vnode-backed segments.
2251          */
2252         if (seg->s_ops != &segvn_ops) {
2253                 AS_LOCK_EXIT(as);
2254                 mutex_enter(&p->p_lock);
2255                 sprunlock(p);
2256                 return (0);
2257         }
2258 
2259         /*
2260          * If the mapping is out of bounds of the segment, return an error.
2261          */
2262         if ((addr + size) > (seg->s_base + seg->s_size)) {
2263                 AS_LOCK_EXIT(as);
2264                 mutex_enter(&p->p_lock);
2265                 sprunlock(p);
2266                 return (set_errno(EINVAL));
2267         }
2268 
2269         /*
2270          * Don't use the MS_INVALCURPROC flag here since that would eventually
2271          * initiate hat invalidation based on curthread. Since we're doing this
2272          * on behalf of a different process, that would erroneously invalidate
2273          * our own process's mappings.
2274          */
2275         error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2276         if (error == 0) {
2277                 /*
2278                  * Since we didn't invalidate during the sync above, we now
2279                  * try to invalidate all of the pages in the mapping.
2280                  */
2281                 map_inval(p, seg, addr, size);
2282         }
2283         AS_LOCK_EXIT(as);
2284 
2285         mutex_enter(&p->p_lock);
2286         sprunlock(p);
2287 
2288         if (error)
2289                 (void) set_errno(error);
2290         return (error);
2291 }
2292 #endif