1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright 2016, Joyent, Inc.
29 */
30
31 /*
32 * vm_usage
33 *
34 * This file implements the getvmusage() private system call.
35 * getvmusage() counts the amount of resident memory pages and swap
36 * reserved by the specified process collective. A "process collective" is
 * the set of processes owned by a particular zone, project, task, or user.
38 *
39 * rss and swap are counted so that for a given process collective, a page is
40 * only counted once. For example, this means that if multiple processes in
41 * the same project map the same page, then the project will only be charged
42 * once for that page. On the other hand, if two processes in different
43 * projects map the same page, then both projects will be charged
44 * for the page.
45 *
46 * The vm_getusage() calculation is implemented so that the first thread
47 * performs the rss/swap counting. Other callers will wait for that thread to
48 * finish, copying the results. This enables multiple rcapds and prstats to
49 * consume data from the same calculation. The results are also cached so that
50 * a caller interested in recent results can just copy them instead of starting
 * a new calculation.  The caller passes the maximum age (in seconds) of the
52 * data. If the cached data is young enough, the cache is copied, otherwise,
53 * a new calculation is executed and the cache is replaced with the new
54 * data.
55 *
56 * The rss calculation for each process collective is as follows:
57 *
58 * - Inspect flags, determine if counting rss for zones, projects, tasks,
59 * and/or users.
60 * - For each proc:
61 * - Figure out proc's collectives (zone, project, task, and/or user).
62 * - For each seg in proc's address space:
63 * - If seg is private:
64 * - Lookup anons in the amp.
 * - For incore pages not previously visited for each of the
 * proc's collectives, add the incore pagesize to each
 * collective.
 * Anons with a refcnt of 1 can be assumed to be not
 * previously visited.
70 * - For address ranges without anons in the amp:
71 * - Lookup pages in underlying vnode.
 * - For incore pages not previously visited for
73 * each of the proc's collectives, add incore
74 * pagesize to each collective.
75 * - If seg is shared:
76 * - Lookup pages in the shared amp or vnode.
77 * - For incore pages not previously visited for each of
78 * the proc's collectives, add incore pagesize to each
79 * collective.
80 *
81 * Swap is reserved by private segments, and shared anonymous segments.
82 * The only shared anon segments which do not reserve swap are ISM segments
83 * and schedctl segments, both of which can be identified by having
84 * amp->swresv == 0.
85 *
86 * The swap calculation for each collective is as follows:
87 *
 * - Inspect flags, determine if counting swap for zones, projects, tasks,
89 * and/or users.
90 * - For each proc:
91 * - Figure out proc's collectives (zone, project, task, and/or user).
92 * - For each seg in proc's address space:
93 * - If seg is private:
94 * - Add svd->swresv pages to swap count for each of the
95 * proc's collectives.
96 * - If seg is anon, shared, and amp->swresv != 0
97 * - For address ranges in amp not previously visited for
98 * each of the proc's collectives, add size of address
99 * range to the swap count for each collective.
100 *
101 * These two calculations are done simultaneously, with most of the work
102 * being done in vmu_calculate_seg(). The results of the calculation are
103 * copied into "vmu_data.vmu_cache_results".
104 *
105 * To perform the calculation, various things are tracked and cached:
106 *
107 * - incore/not-incore page ranges for all vnodes.
108 * (vmu_data.vmu_all_vnodes_hash)
109 * This eliminates looking up the same page more than once.
110 *
111 * - incore/not-incore page ranges for all shared amps.
112 * (vmu_data.vmu_all_amps_hash)
113 * This eliminates looking up the same page more than once.
114 *
115 * - visited page ranges for each collective.
116 * - per vnode (entity->vme_vnode_hash)
117 * - per shared amp (entity->vme_amp_hash)
118 * For accurate counting of map-shared and COW-shared pages.
119 *
120 * - visited private anons (refcnt > 1) for each collective.
121 * (entity->vme_anon)
122 * For accurate counting of COW-shared pages.
123 *
124 * The common accounting structure is the vmu_entity_t, which represents
125 * collectives:
126 *
127 * - A zone.
128 * - A project, task, or user within a zone.
129 * - The entire system (vmu_data.vmu_system).
130 * - Each collapsed (col) project and user. This means a given projid or
131 * uid, regardless of which zone the process is in. For instance,
 * project 0 in the global zone and project 0 in a non-global zone are
133 * the same collapsed project.
134 *
135 * Each entity structure tracks which pages have been already visited for
136 * that entity (via previously inspected processes) so that these pages are
137 * not double counted.
138 */
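
/*
 * Illustrative only (this sketch is not part of this file's build): a
 * consumer such as prstat or rcapd is expected to reach this code through
 * the getvmusage(2) wrapper, roughly as sketched below.  The flags, the
 * 5-second maximum age, and the sizing logic are assumptions made for the
 * example, not the actual code of those utilities.
 *
 *	size_t nres = 0;
 *	vmusage_t *buf;
 *
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, NULL, &nres) != 0)
 *		err(1, "getvmusage");
 *	buf = malloc(nres * sizeof (vmusage_t));
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0)
 *		err(1, "getvmusage");
 */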
139
140 #include <sys/errno.h>
141 #include <sys/types.h>
142 #include <sys/zone.h>
143 #include <sys/proc.h>
144 #include <sys/project.h>
145 #include <sys/task.h>
146 #include <sys/thread.h>
147 #include <sys/time.h>
148 #include <sys/mman.h>
149 #include <sys/modhash.h>
150 #include <sys/modhash_impl.h>
151 #include <sys/shm.h>
152 #include <sys/swap.h>
153 #include <sys/synch.h>
154 #include <sys/systm.h>
155 #include <sys/var.h>
156 #include <sys/vm_usage.h>
157 #include <sys/zone.h>
158 #include <sys/sunddi.h>
159 #include <sys/sysmacros.h>
160 #include <sys/avl.h>
161 #include <vm/anon.h>
162 #include <vm/as.h>
163 #include <vm/seg_vn.h>
164 #include <vm/seg_spt.h>
165
166 #define VMUSAGE_HASH_SIZE 512
167
168 #define VMUSAGE_TYPE_VNODE 1
169 #define VMUSAGE_TYPE_AMP 2
170 #define VMUSAGE_TYPE_ANON 3
171
172 #define VMUSAGE_BOUND_UNKNOWN 0
173 #define VMUSAGE_BOUND_INCORE 1
174 #define VMUSAGE_BOUND_NOT_INCORE 2
175
#define	ISWITHIN(node, addr)	((node)->vmb_start <= (addr) && \
				(node)->vmb_end >= (addr) ? 1 : 0)
178
179 /*
 * Bounds for vnodes and shared amps.
 * Each bound is either entirely incore, entirely not incore, or entirely
 * unknown.  Bounds are stored in an AVL tree sorted by the vmb_start member
 * while in use; otherwise (on the free or temporary lists) they are strung
 * together off of vmb_next.
185 */
186 typedef struct vmu_bound {
187 avl_node_t vmb_node;
188 struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
189 pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
190 pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
191 char vmb_type; /* One of VMUSAGE_BOUND_* */
192 } vmu_bound_t;
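
/*
 * Note that vmb_start and vmb_end are both inclusive; for example, a bound
 * with vmb_start == 5 and vmb_end == 9 covers five pages, which is why page
 * counts below are computed as (vmb_end - vmb_start + 1).
 */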
193
194 /*
 * Hash of visited objects (vnodes or shared amps).
 * The key is the address of the vnode or amp.  The vmo_bounds tree holds the
 * known incore/not-incore bounds for that vnode/amp.
198 */
199 typedef struct vmu_object {
200 struct vmu_object *vmo_next; /* free list */
201 caddr_t vmo_key;
202 short vmo_type;
203 avl_tree_t vmo_bounds;
204 } vmu_object_t;
205
206 /*
207 * Node for tree of visited COW anons.
208 */
209 typedef struct vmu_anon {
210 avl_node_t vma_node;
211 uintptr_t vma_addr;
212 } vmu_anon_t;
213
214 /*
215 * Entity by which to count results.
216 *
217 * The entity structure keeps the current rss/swap counts for each entity
218 * (zone, project, etc), and hashes of vm structures that have already
219 * been visited for the entity.
220 *
221 * vme_next: links the list of all entities currently being counted by
222 * vmu_calculate().
223 *
224 * vme_next_calc: links the list of entities related to the current process
225 * being counted by vmu_calculate_proc().
226 *
227 * vmu_calculate_proc() walks all processes. For each process, it makes a
228 * list of the entities related to that process using vme_next_calc. This
229 * list changes each time vmu_calculate_proc() is called.
230 *
231 */
232 typedef struct vmu_entity {
233 struct vmu_entity *vme_next;
234 struct vmu_entity *vme_next_calc;
235 mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
236 mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
237 avl_tree_t vme_anon; /* COW anons visited for entity */
238 vmusage_t vme_result; /* identifies entity and results */
239 } vmu_entity_t;
240
241 /*
242 * Hash of entities visited within a zone, and an entity for the zone
243 * itself.
244 */
245 typedef struct vmu_zone {
246 struct vmu_zone *vmz_next; /* free list */
247 id_t vmz_id;
248 vmu_entity_t *vmz_zone;
249 mod_hash_t *vmz_projects_hash;
250 mod_hash_t *vmz_tasks_hash;
251 mod_hash_t *vmz_rusers_hash;
252 mod_hash_t *vmz_eusers_hash;
253 } vmu_zone_t;
254
255 /*
256 * Cache of results from last calculation
257 */
258 typedef struct vmu_cache {
259 vmusage_t *vmc_results; /* Results from last call to */
260 /* vm_getusage(). */
261 uint64_t vmc_nresults; /* Count of cached results */
262 uint64_t vmc_refcnt; /* refcnt for free */
263 uint_t vmc_flags; /* Flags for vm_getusage() */
264 hrtime_t vmc_timestamp; /* when cache was created */
265 } vmu_cache_t;
266
267 /*
268 * top level rss info for the system
269 */
270 typedef struct vmu_data {
271 kmutex_t vmu_lock; /* Protects vmu_data */
	kcondvar_t vmu_cv;		/* Used to signal threads */
					/* waiting for the calc */
					/* thread to finish */
275 vmu_entity_t *vmu_system; /* Entity for tracking */
276 /* rss/swap for all processes */
277 /* in all zones */
278 mod_hash_t *vmu_zones_hash; /* Zones visited */
279 mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
280 mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
281 mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
282 /* to implement VMUSAGE_COL_* */
283 /* flags, which aggregate by */
284 /* project or user regardless */
285 /* of zoneid. */
286 mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
287 /* to track incore/not-incore */
288 mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
289 /* amps to track incore/not- */
290 /* incore */
291 vmu_entity_t *vmu_entities; /* Linked list of entities */
292 size_t vmu_nentities; /* Count of entities in list */
293 vmu_cache_t *vmu_cache; /* Cached results */
294 kthread_t *vmu_calc_thread; /* NULL, or thread running */
295 /* vmu_calculate() */
	uint_t vmu_calc_flags;		/* Flags being used by */
297 /* currently running calc */
298 /* thread */
299 uint_t vmu_pending_flags; /* Flags of vm_getusage() */
300 /* threads waiting for */
301 /* calc thread to finish */
302 uint_t vmu_pending_waiters; /* Number of threads waiting */
303 /* for calc thread */
304 vmu_bound_t *vmu_free_bounds;
305 vmu_object_t *vmu_free_objects;
306 vmu_entity_t *vmu_free_entities;
307 vmu_zone_t *vmu_free_zones;
308 } vmu_data_t;
309
310 extern struct as kas;
311 extern proc_t *practive;
312 extern zone_t *global_zone;
313 extern struct seg_ops segvn_ops;
314 extern struct seg_ops segspt_shmops;
315
316 static vmu_data_t vmu_data;
317 static kmem_cache_t *vmu_bound_cache;
318 static kmem_cache_t *vmu_object_cache;
319
320 /*
321 * Comparison routine for AVL tree. We base our comparison on vmb_start.
322 */
323 static int
324 bounds_cmp(const void *bnd1, const void *bnd2)
325 {
326 const vmu_bound_t *bound1 = bnd1;
327 const vmu_bound_t *bound2 = bnd2;
328
329 if (bound1->vmb_start == bound2->vmb_start) {
330 return (0);
331 }
332 if (bound1->vmb_start < bound2->vmb_start) {
333 return (-1);
334 }
335
336 return (1);
337 }
338
339 /*
340 * Comparison routine for our AVL tree of anon structures.
341 */
342 static int
343 vmu_anon_cmp(const void *lhs, const void *rhs)
344 {
345 const vmu_anon_t *l = lhs, *r = rhs;
346
347 if (l->vma_addr == r->vma_addr)
348 return (0);
349
350 if (l->vma_addr < r->vma_addr)
351 return (-1);
352
353 return (1);
354 }
355
356 /*
357 * Save a bound on the free list.
358 */
359 static void
360 vmu_free_bound(vmu_bound_t *bound)
361 {
362 bound->vmb_next = vmu_data.vmu_free_bounds;
363 bound->vmb_start = 0;
364 bound->vmb_end = 0;
365 bound->vmb_type = 0;
366 vmu_data.vmu_free_bounds = bound;
367 }
368
369 /*
370 * Free an object, and all visited bound info.
371 */
372 static void
373 vmu_free_object(mod_hash_val_t val)
374 {
375 vmu_object_t *obj = (vmu_object_t *)val;
376 avl_tree_t *tree = &(obj->vmo_bounds);
377 vmu_bound_t *bound;
378 void *cookie = NULL;
379
380 while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
381 vmu_free_bound(bound);
382 avl_destroy(tree);
383
384 obj->vmo_type = 0;
385 obj->vmo_next = vmu_data.vmu_free_objects;
386 vmu_data.vmu_free_objects = obj;
387 }
388
389 /*
390 * Free an entity, and hashes of visited objects for that entity.
391 */
392 static void
393 vmu_free_entity(mod_hash_val_t val)
394 {
395 vmu_entity_t *entity = (vmu_entity_t *)val;
396 vmu_anon_t *anon;
397 void *cookie = NULL;
398
399 if (entity->vme_vnode_hash != NULL)
400 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
401 if (entity->vme_amp_hash != NULL)
402 i_mod_hash_clear_nosync(entity->vme_amp_hash);
403
404 while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
405 kmem_free(anon, sizeof (vmu_anon_t));
406
407 avl_destroy(&entity->vme_anon);
408
409 entity->vme_next = vmu_data.vmu_free_entities;
410 vmu_data.vmu_free_entities = entity;
411 }
412
413 /*
414 * Free zone entity, and all hashes of entities inside that zone,
415 * which are projects, tasks, and users.
416 */
417 static void
418 vmu_free_zone(mod_hash_val_t val)
419 {
420 vmu_zone_t *zone = (vmu_zone_t *)val;
421
422 if (zone->vmz_zone != NULL) {
423 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
424 zone->vmz_zone = NULL;
425 }
426 if (zone->vmz_projects_hash != NULL)
427 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
428 if (zone->vmz_tasks_hash != NULL)
429 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
430 if (zone->vmz_rusers_hash != NULL)
431 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
432 if (zone->vmz_eusers_hash != NULL)
433 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
434 zone->vmz_next = vmu_data.vmu_free_zones;
435 vmu_data.vmu_free_zones = zone;
436 }
437
438 /*
439 * Initialize synchronization primitives and hashes for system-wide tracking
440 * of visited vnodes and shared amps. Initialize results cache.
441 */
442 void
443 vm_usage_init()
444 {
445 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
446 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
447
448 vmu_data.vmu_system = NULL;
449 vmu_data.vmu_zones_hash = NULL;
450 vmu_data.vmu_projects_col_hash = NULL;
451 vmu_data.vmu_rusers_col_hash = NULL;
452 vmu_data.vmu_eusers_col_hash = NULL;
453
454 vmu_data.vmu_free_bounds = NULL;
455 vmu_data.vmu_free_objects = NULL;
456 vmu_data.vmu_free_entities = NULL;
457 vmu_data.vmu_free_zones = NULL;
458
459 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
460 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
461 sizeof (vnode_t));
462 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
463 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
464 sizeof (struct anon_map));
465 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
466 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
467 vmu_free_entity);
468 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
469 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
470 vmu_free_entity);
471 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
472 "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
473 vmu_free_entity);
474 vmu_data.vmu_zones_hash = mod_hash_create_idhash(
475 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
476
477 vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
478 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
479 vmu_object_cache = kmem_cache_create("vmu_object_cache",
480 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
481
482 vmu_data.vmu_entities = NULL;
483 vmu_data.vmu_nentities = 0;
484
485 vmu_data.vmu_cache = NULL;
486 vmu_data.vmu_calc_thread = NULL;
487 vmu_data.vmu_calc_flags = 0;
488 vmu_data.vmu_pending_flags = 0;
489 vmu_data.vmu_pending_waiters = 0;
490 }
491
492 /*
493 * Allocate hashes for tracking vm objects visited for an entity.
494 * Update list of entities.
495 */
496 static vmu_entity_t *
497 vmu_alloc_entity(id_t id, int type, id_t zoneid)
498 {
499 vmu_entity_t *entity;
500
501 if (vmu_data.vmu_free_entities != NULL) {
502 entity = vmu_data.vmu_free_entities;
503 vmu_data.vmu_free_entities =
504 vmu_data.vmu_free_entities->vme_next;
505 bzero(&entity->vme_result, sizeof (vmusage_t));
506 } else {
507 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
508 }
509 entity->vme_result.vmu_id = id;
510 entity->vme_result.vmu_zoneid = zoneid;
511 entity->vme_result.vmu_type = type;
512
513 if (entity->vme_vnode_hash == NULL)
514 entity->vme_vnode_hash = mod_hash_create_ptrhash(
515 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
516 sizeof (vnode_t));
517
518 if (entity->vme_amp_hash == NULL)
519 entity->vme_amp_hash = mod_hash_create_ptrhash(
520 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
521 sizeof (struct anon_map));
522
523 VERIFY(avl_first(&entity->vme_anon) == NULL);
524
525 avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
526 offsetof(struct vmu_anon, vma_node));
527
528 entity->vme_next = vmu_data.vmu_entities;
529 vmu_data.vmu_entities = entity;
530 vmu_data.vmu_nentities++;
531
532 return (entity);
533 }
534
535 /*
536 * Allocate a zone entity, and hashes for tracking visited vm objects
537 * for projects, tasks, and users within that zone.
538 */
539 static vmu_zone_t *
540 vmu_alloc_zone(id_t id)
541 {
542 vmu_zone_t *zone;
543
544 if (vmu_data.vmu_free_zones != NULL) {
545 zone = vmu_data.vmu_free_zones;
546 vmu_data.vmu_free_zones =
547 vmu_data.vmu_free_zones->vmz_next;
548 zone->vmz_next = NULL;
549 zone->vmz_zone = NULL;
550 } else {
551 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
552 }
553
554 zone->vmz_id = id;
555
556 if ((vmu_data.vmu_calc_flags &
557 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
558 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
559
560 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
561 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
562 zone->vmz_projects_hash = mod_hash_create_idhash(
563 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
564
565 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
566 != 0 && zone->vmz_tasks_hash == NULL)
567 zone->vmz_tasks_hash = mod_hash_create_idhash(
568 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
569
570 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
571 != 0 && zone->vmz_rusers_hash == NULL)
572 zone->vmz_rusers_hash = mod_hash_create_idhash(
573 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
574
575 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
576 != 0 && zone->vmz_eusers_hash == NULL)
577 zone->vmz_eusers_hash = mod_hash_create_idhash(
578 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
579
580 return (zone);
581 }
582
583 /*
584 * Allocate a structure for tracking visited bounds for a vm object.
585 */
586 static vmu_object_t *
587 vmu_alloc_object(caddr_t key, int type)
588 {
589 vmu_object_t *object;
590
591 if (vmu_data.vmu_free_objects != NULL) {
592 object = vmu_data.vmu_free_objects;
593 vmu_data.vmu_free_objects =
594 vmu_data.vmu_free_objects->vmo_next;
595 } else {
596 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
597 }
598
599 object->vmo_next = NULL;
600 object->vmo_key = key;
601 object->vmo_type = type;
602 avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
603
604 return (object);
605 }
606
607 /*
608 * Allocate and return a bound structure.
609 */
610 static vmu_bound_t *
611 vmu_alloc_bound()
612 {
613 vmu_bound_t *bound;
614
615 if (vmu_data.vmu_free_bounds != NULL) {
616 bound = vmu_data.vmu_free_bounds;
617 vmu_data.vmu_free_bounds =
618 vmu_data.vmu_free_bounds->vmb_next;
619 } else {
620 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
621 }
622
623 bound->vmb_next = NULL;
624 bound->vmb_start = 0;
625 bound->vmb_end = 0;
626 bound->vmb_type = 0;
627 return (bound);
628 }
629
630 /*
631 * vmu_find_insert_* functions implement hash lookup or allocate and
632 * insert operations.
633 */
634 static vmu_object_t *
635 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
636 {
637 int ret;
638 vmu_object_t *object;
639
640 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
641 (mod_hash_val_t *)&object);
642 if (ret != 0) {
643 object = vmu_alloc_object(key, type);
644 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
645 (mod_hash_val_t)object, (mod_hash_hndl_t)0);
646 ASSERT(ret == 0);
647 }
648 return (object);
649 }
650
651 static int
652 vmu_find_insert_anon(vmu_entity_t *entity, void *key)
653 {
654 vmu_anon_t anon, *ap;
655
656 anon.vma_addr = (uintptr_t)key;
657
658 if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
659 return (0);
660
661 ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
662 ap->vma_addr = (uintptr_t)key;
663
664 avl_add(&entity->vme_anon, ap);
665
666 return (1);
667 }
668
669 static vmu_entity_t *
670 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
671 {
672 int ret;
673 vmu_entity_t *entity;
674
675 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
676 (mod_hash_val_t *)&entity);
677 if (ret != 0) {
678 entity = vmu_alloc_entity(id, type, zoneid);
679 ret = i_mod_hash_insert_nosync(hash,
680 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
681 (mod_hash_hndl_t)0);
682 ASSERT(ret == 0);
683 }
684 return (entity);
685 }
686
690 /*
691 * Returns list of object bounds between start and end. New bounds inserted
692 * by this call are given type.
693 *
694 * Returns the number of pages covered if new bounds are created. Returns 0
695 * if region between start/end consists of all existing bounds.
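 *
 * For example, if the tree already holds a bound covering pages 0-9 and the
 * caller asks for pages 5-14, one new bound of the given type is created for
 * pages 10-14, *first points at the existing 0-9 bound, *last points at the
 * new 10-14 bound, and 5 is returned.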
696 */
697 static pgcnt_t
698 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
699 end, char type, vmu_bound_t **first, vmu_bound_t **last)
700 {
701 avl_tree_t *tree = &(ro->vmo_bounds);
702 avl_index_t where;
703 vmu_bound_t *walker, *tmp;
704 pgcnt_t ret = 0;
705
706 ASSERT(start <= end);
707
708 *first = *last = NULL;
709
710 tmp = vmu_alloc_bound();
711 tmp->vmb_start = start;
712 tmp->vmb_type = type;
713
714 /* Hopelessly optimistic case. */
	if ((walker = avl_find(tree, tmp, &where)) != NULL) {
716 /* We got lucky. */
717 vmu_free_bound(tmp);
718 *first = walker;
719 }
720
721 if (walker == NULL) {
722 /* Is start in the previous node? */
723 walker = avl_nearest(tree, where, AVL_BEFORE);
724 if (walker != NULL) {
725 if (ISWITHIN(walker, start)) {
726 /* We found start. */
727 vmu_free_bound(tmp);
728 *first = walker;
729 }
730 }
731 }
732
733 /*
734 * At this point, if *first is still NULL, then we
735 * didn't get a direct hit and start isn't covered
736 * by the previous node. We know that the next node
737 * must have a greater start value than we require
738 * because avl_find tells us where the AVL routines would
739 * insert our new node. We have some gap between the
740 * start we want and the next node.
741 */
742 if (*first == NULL) {
743 walker = avl_nearest(tree, where, AVL_AFTER);
744 if (walker != NULL && walker->vmb_start <= end) {
745 /* Fill the gap. */
746 tmp->vmb_end = walker->vmb_start - 1;
747 *first = tmp;
748 } else {
749 /* We have a gap over [start, end]. */
750 tmp->vmb_end = end;
751 *first = *last = tmp;
752 }
753 ret += tmp->vmb_end - tmp->vmb_start + 1;
754 avl_insert(tree, tmp, where);
755 }
756
757 ASSERT(*first != NULL);
758
759 if (*last != NULL) {
760 /* We're done. */
761 return (ret);
762 }
763
764 /*
765 * If we are here we still need to set *last and
766 * that may involve filling in some gaps.
767 */
768 *last = *first;
769 for (;;) {
770 if (ISWITHIN(*last, end)) {
771 /* We're done. */
772 break;
773 }
774 walker = AVL_NEXT(tree, *last);
775 if (walker == NULL || walker->vmb_start > end) {
776 /* Bottom or mid tree with gap. */
777 tmp = vmu_alloc_bound();
778 tmp->vmb_start = (*last)->vmb_end + 1;
779 tmp->vmb_end = end;
780 tmp->vmb_type = type;
781 ret += tmp->vmb_end - tmp->vmb_start + 1;
782 avl_insert_here(tree, tmp, *last, AVL_AFTER);
783 *last = tmp;
784 break;
785 } else {
786 if ((*last)->vmb_end + 1 != walker->vmb_start) {
787 /* Non-contiguous. */
788 tmp = vmu_alloc_bound();
789 tmp->vmb_start = (*last)->vmb_end + 1;
790 tmp->vmb_end = walker->vmb_start - 1;
791 tmp->vmb_type = type;
792 ret += tmp->vmb_end - tmp->vmb_start + 1;
793 avl_insert_here(tree, tmp, *last, AVL_AFTER);
794 *last = tmp;
795 } else {
796 *last = walker;
797 }
798 }
799 }
800
801 return (ret);
802 }
803
804 /*
805 * vmu_update_bounds()
806 *
807 * tree: avl_tree in which first and last hang.
808 *
 * first, last: list of contiguous bounds, of which zero or more are of
810 * type VMUSAGE_BOUND_UNKNOWN.
811 *
812 * new_tree: avl_tree in which new_first and new_last hang.
813 *
 * new_first, new_last: list of contiguous bounds, of which none are of
815 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
816 * update the types of bounds in (first,last) with
817 * type VMUSAGE_BOUND_UNKNOWN.
818 *
819 * For the list of bounds (first,last), this function updates any bounds
820 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
821 * the list (new_first, new_last).
822 *
823 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
824 * (new_first, new_last), it will be split into multiple bounds.
825 *
826 * Return value:
827 * The number of pages in the list of bounds (first,last) that were of
828 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
829 * VMUSAGE_BOUND_INCORE.
830 *
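 * For example, if (first, last) is a single VMUSAGE_BOUND_UNKNOWN bound
 * covering pages 10-20, and (new_first, new_last) holds an INCORE bound for
 * pages 10-14 followed by a NOT_INCORE bound for pages 15-25, the unknown
 * bound is split into bounds for 10-14 (INCORE) and 15-20 (NOT_INCORE), and
 * 5 is returned.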
831 */
832 static pgcnt_t
833 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
834 avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
835 {
836 vmu_bound_t *next, *new_next, *tmp;
837 pgcnt_t rss = 0;
838
839 next = *first;
840 new_next = new_first;
841
842 /*
	 * Verify that the first and last bounds are covered by the new
	 * bounds if they have unknown type.
845 */
846 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
847 (*first)->vmb_start >= new_first->vmb_start);
848 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
849 (*last)->vmb_end <= new_last->vmb_end);
850 for (;;) {
851 /* If bound already has type, proceed to next bound. */
852 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
853 if (next == *last)
854 break;
855 next = AVL_NEXT(tree, next);
856 continue;
857 }
858 while (new_next->vmb_end < next->vmb_start)
859 new_next = AVL_NEXT(new_tree, new_next);
860 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
861 next->vmb_type = new_next->vmb_type;
862 if (new_next->vmb_end < next->vmb_end) {
863 /* need to split bound */
864 tmp = vmu_alloc_bound();
865 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
866 tmp->vmb_start = new_next->vmb_end + 1;
867 tmp->vmb_end = next->vmb_end;
868 avl_insert_here(tree, tmp, next, AVL_AFTER);
869 next->vmb_end = new_next->vmb_end;
870 if (*last == next)
871 *last = tmp;
872 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
873 rss += next->vmb_end - next->vmb_start + 1;
874 next = tmp;
875 } else {
876 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
877 rss += next->vmb_end - next->vmb_start + 1;
878 if (next == *last)
879 break;
880 next = AVL_NEXT(tree, next);
881 }
882 }
883 return (rss);
884 }
885
886 /*
 * Merges adjacent bounds with the same type between the first and last
 * bounds.  After the merge, the last pointer may point to a different bound,
 * as the (incoming) last bound may have been merged away.
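 *
 * For example, adjacent INCORE bounds covering pages 10-14 and 15-20 are
 * collapsed into a single INCORE bound covering pages 10-20.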
890 */
891 static void
892 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
893 {
894 vmu_bound_t *current;
895 vmu_bound_t *next;
896
897 ASSERT(tree != NULL);
898 ASSERT(*first != NULL);
899 ASSERT(*last != NULL);
900
901 current = *first;
902 while (current != *last) {
903 next = AVL_NEXT(tree, current);
904 if ((current->vmb_end + 1) == next->vmb_start &&
905 current->vmb_type == next->vmb_type) {
906 current->vmb_end = next->vmb_end;
907 avl_remove(tree, next);
908 vmu_free_bound(next);
909 if (next == *last) {
910 *last = current;
911 }
912 } else {
913 current = AVL_NEXT(tree, current);
914 }
915 }
916 }
917
918 /*
919 * Given an amp and a list of bounds, updates each bound's type with
920 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
921 *
922 * If a bound is partially incore, it will be split into two bounds.
923 * first and last may be modified, as bounds may be split into multiple
924 * bounds if they are partially incore/not-incore.
925 *
 * Set incore to B_TRUE if the bounds are already known to be incore.
927 *
928 */
929 static void
930 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
931 vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
932 {
933 vmu_bound_t *next;
934 vmu_bound_t *tmp;
935 pgcnt_t index;
936 short bound_type;
937 short page_type;
938 vnode_t *vn;
939 anoff_t off;
940 struct anon *ap;
941
942 next = *first;
943 /* Shared anon slots don't change once set. */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
945 for (;;) {
946 if (incore == B_TRUE)
947 next->vmb_type = VMUSAGE_BOUND_INCORE;
948
949 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
950 if (next == *last)
951 break;
952 next = AVL_NEXT(tree, next);
953 continue;
954 }
955
956 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
957 bound_type = next->vmb_type;
958 index = next->vmb_start;
959 while (index <= next->vmb_end) {
960
961 /*
962 * These are used to determine how much to increment
963 * index when a large page is found.
964 */
965 page_t *page;
966 pgcnt_t pgcnt = 1;
967 uint_t pgshft;
968 pgcnt_t pgmsk;
969
970 ap = anon_get_ptr(amp->ahp, index);
971 if (ap != NULL)
972 swap_xlate(ap, &vn, &off);
973
974 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
975 (page = page_exists(vn, off)) != NULL) {
976 if (PP_ISFREE(page))
977 page_type = VMUSAGE_BOUND_NOT_INCORE;
978 else
979 page_type = VMUSAGE_BOUND_INCORE;
980 if (page->p_szc > 0) {
981 pgcnt = page_get_pagecnt(page->p_szc);
982 pgshft = page_get_shift(page->p_szc);
983 pgmsk = (0x1 << (pgshft - PAGESHIFT))
984 - 1;
985 }
986 } else {
987 page_type = VMUSAGE_BOUND_NOT_INCORE;
988 }
989
990 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
991 next->vmb_type = page_type;
992 bound_type = page_type;
993 } else if (next->vmb_type != page_type) {
994 /*
995 * If current bound type does not match page
996 * type, need to split off new bound.
997 */
998 tmp = vmu_alloc_bound();
999 tmp->vmb_type = page_type;
1000 tmp->vmb_start = index;
1001 tmp->vmb_end = next->vmb_end;
1002 avl_insert_here(tree, tmp, next, AVL_AFTER);
1003 next->vmb_end = index - 1;
1004 if (*last == next)
1005 *last = tmp;
1006 next = tmp;
1007 }
1008 if (pgcnt > 1) {
1009 /*
1010 * If inside large page, jump to next large
1011 * page
1012 */
1013 index = (index & ~pgmsk) + pgcnt;
1014 } else {
1015 index++;
1016 }
1017 }
1018 if (next == *last) {
1019 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1020 break;
1021 } else
1022 next = AVL_NEXT(tree, next);
1023 }
	ANON_LOCK_EXIT(&amp->a_rwlock);
1025 }
1026
1027 /*
 * Same as vmu_amp_update_incore_bounds(), except it tracks incore/not-incore
 * bounds for vnodes.
1030 */
1031 static void
1032 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
1033 vmu_bound_t **first, vmu_bound_t **last)
1034 {
1035 vmu_bound_t *next;
1036 vmu_bound_t *tmp;
1037 pgcnt_t index;
1038 short bound_type;
1039 short page_type;
1040
1041 next = *first;
1042 for (;;) {
1043 if (vnode->v_pages == NULL)
1044 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1045
1046 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1047 if (next == *last)
1048 break;
1049 next = AVL_NEXT(tree, next);
1050 continue;
1051 }
1052
1053 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1054 bound_type = next->vmb_type;
1055 index = next->vmb_start;
1056 while (index <= next->vmb_end) {
1057
1058 /*
1059 * These are used to determine how much to increment
1060 * index when a large page is found.
1061 */
1062 page_t *page;
1063 pgcnt_t pgcnt = 1;
1064 uint_t pgshft;
1065 pgcnt_t pgmsk;
1066
1067 if (vnode->v_pages != NULL &&
1068 (page = page_exists(vnode, ptob(index))) != NULL) {
1069 if (PP_ISFREE(page))
1070 page_type = VMUSAGE_BOUND_NOT_INCORE;
1071 else
1072 page_type = VMUSAGE_BOUND_INCORE;
1073 if (page->p_szc > 0) {
1074 pgcnt = page_get_pagecnt(page->p_szc);
1075 pgshft = page_get_shift(page->p_szc);
1076 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1077 - 1;
1078 }
1079 } else {
1080 page_type = VMUSAGE_BOUND_NOT_INCORE;
1081 }
1082
1083 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1084 next->vmb_type = page_type;
1085 bound_type = page_type;
1086 } else if (next->vmb_type != page_type) {
1087 /*
1088 * If current bound type does not match page
1089 * type, need to split off new bound.
1090 */
1091 tmp = vmu_alloc_bound();
1092 tmp->vmb_type = page_type;
1093 tmp->vmb_start = index;
1094 tmp->vmb_end = next->vmb_end;
1095 avl_insert_here(tree, tmp, next, AVL_AFTER);
1096 next->vmb_end = index - 1;
1097 if (*last == next)
1098 *last = tmp;
1099 next = tmp;
1100 }
1101 if (pgcnt > 1) {
1102 /*
1103 * If inside large page, jump to next large
1104 * page
1105 */
1106 index = (index & ~pgmsk) + pgcnt;
1107 } else {
1108 index++;
1109 }
1110 }
1111 if (next == *last) {
1112 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1113 break;
1114 } else
1115 next = AVL_NEXT(tree, next);
1116 }
1117 }
1118
1119 /*
1120 * Calculate the rss and swap consumed by a segment. vmu_entities is the
1121 * list of entities to visit. For shared segments, the vnode or amp
1122 * is looked up in each entity to see if it has been already counted. Private
1123 * anon pages are checked per entity to ensure that COW pages are not
1124 * double counted.
1125 *
1126 * For private mapped files, first the amp is checked for private pages.
1127 * Bounds not backed by the amp are looked up in the vnode for each entity
1128 * to avoid double counting of private COW vnode pages.
1129 */
1130 static void
1131 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1132 {
1133 struct segvn_data *svd;
1134 struct shm_data *shmd;
1135 struct spt_data *sptd;
1136 vmu_object_t *shared_object = NULL;
1137 vmu_object_t *entity_object = NULL;
1138 vmu_entity_t *entity;
1139 vmusage_t *result;
1140 vmu_bound_t *first = NULL;
1141 vmu_bound_t *last = NULL;
1142 vmu_bound_t *cur = NULL;
1143 vmu_bound_t *e_first = NULL;
1144 vmu_bound_t *e_last = NULL;
1145 vmu_bound_t *tmp;
1146 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1147 struct anon_map *private_amp = NULL;
1148 boolean_t incore = B_FALSE;
1149 boolean_t shared = B_FALSE;
1150 int file = 0;
1151 pgcnt_t swresv = 0;
1152 pgcnt_t panon = 0;
1153
1154 /* Can zero-length segments exist? Not sure, so paranoia. */
1155 if (seg->s_size <= 0)
1156 return;
1157
1158 /*
	 * Figure out if there is a shared object (such as a named vnode or
	 * a shared amp), then figure out if there is a private amp, which
	 * identifies private pages.
1162 */
1163 if (seg->s_ops == &segvn_ops) {
1164 svd = (struct segvn_data *)seg->s_data;
1165 if (svd->type == MAP_SHARED) {
1166 shared = B_TRUE;
1167 } else {
1168 swresv = svd->swresv;
1169
1170 if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1171 RW_READER) != 0) {
1172 /*
1173 * Text replication anon maps can be shared
1174 * across all zones. Space used for text
1175 * replication is typically capped as a small %
1176 * of memory. To keep it simple for now we
1177 * don't account for swap and memory space used
1178 * for text replication.
1179 */
1180 if (svd->tr_state == SEGVN_TR_OFF &&
1181 svd->amp != NULL) {
1182 private_amp = svd->amp;
1183 p_start = svd->anon_index;
1184 p_end = svd->anon_index +
1185 btop(seg->s_size) - 1;
1186 }
1187 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1188 }
1189 }
1190 if (svd->vp != NULL) {
1191 file = 1;
1192 shared_object = vmu_find_insert_object(
1193 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1194 VMUSAGE_TYPE_VNODE);
1195 s_start = btop(svd->offset);
1196 s_end = btop(svd->offset + seg->s_size) - 1;
1197 }
1198 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1199 ASSERT(shared_object == NULL);
1200 shared_object = vmu_find_insert_object(
1201 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1202 VMUSAGE_TYPE_AMP);
1203 s_start = svd->anon_index;
1204 s_end = svd->anon_index + btop(seg->s_size) - 1;
1205 /* schedctl mappings are always in core */
1206 if (svd->amp->swresv == 0)
1207 incore = B_TRUE;
1208 }
1209 } else if (seg->s_ops == &segspt_shmops) {
1210 shared = B_TRUE;
1211 shmd = (struct shm_data *)seg->s_data;
1212 shared_object = vmu_find_insert_object(
1213 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1214 VMUSAGE_TYPE_AMP);
1215 s_start = 0;
1216 s_end = btop(seg->s_size) - 1;
1217 sptd = shmd->shm_sptseg->s_data;
1218
1219 /* ism segments are always incore and do not reserve swap */
1220 if (sptd->spt_flags & SHM_SHARE_MMU)
1221 incore = B_TRUE;
1222
1223 } else {
1224 return;
1225 }
1226
1227 /*
1228 * If there is a private amp, count anon pages that exist. If an
1229 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1230 * hash so that it is not double counted.
1231 *
1232 * If there is also a shared object, then figure out the bounds
1233 * which are not mapped by the private amp.
1234 */
1235 if (private_amp != NULL) {
1236
1237 /* Enter as writer to prevent COW anons from being freed */
1238 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1239
1240 p_index = p_start;
1241 s_index = s_start;
1242
1243 while (p_index <= p_end) {
1244
1245 pgcnt_t p_index_next;
1246 pgcnt_t p_bound_size;
1247 int cnt;
1248 anoff_t off;
1249 struct vnode *vn;
1250 struct anon *ap;
1251 page_t *page; /* For handling of large */
1252 pgcnt_t pgcnt = 1; /* pages */
1253 pgcnt_t pgstart;
1254 pgcnt_t pgend;
1255 uint_t pgshft;
1256 pgcnt_t pgmsk;
1257
1258 p_index_next = p_index;
1259 ap = anon_get_next_ptr(private_amp->ahp,
1260 &p_index_next);
1261
1262 /*
1263 * If next anon is past end of mapping, simulate
1264 * end of anon so loop terminates.
1265 */
1266 if (p_index_next > p_end) {
1267 p_index_next = p_end + 1;
1268 ap = NULL;
1269 }
1270 /*
1271 * For COW segments, keep track of bounds not
1272 * backed by private amp so they can be looked
1273 * up in the backing vnode
1274 */
1275 if (p_index_next != p_index) {
1276
1277 /*
1278 * Compute index difference between anon and
1279 * previous anon.
1280 */
1281 p_bound_size = p_index_next - p_index - 1;
1282
1283 if (shared_object != NULL) {
1284 cur = vmu_alloc_bound();
1285 cur->vmb_start = s_index;
1286 cur->vmb_end = s_index + p_bound_size;
1287 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1288 if (first == NULL) {
1289 first = cur;
1290 last = cur;
1291 } else {
1292 last->vmb_next = cur;
1293 last = cur;
1294 }
1295 }
1296 p_index = p_index + p_bound_size + 1;
1297 s_index = s_index + p_bound_size + 1;
1298 }
1299
1300 /* Detect end of anons in amp */
1301 if (ap == NULL)
1302 break;
1303
1304 cnt = ap->an_refcnt;
1305 swap_xlate(ap, &vn, &off);
1306
1307 if (vn == NULL || vn->v_pages == NULL ||
1308 (page = page_exists(vn, off)) == NULL) {
1309 p_index++;
1310 s_index++;
1311 continue;
1312 }
1313
1314 /*
1315 * If large page is found, compute portion of large
			 * page in the mapping, and increment indices to the next
1317 * large page.
1318 */
1319 if (page->p_szc > 0) {
1320
1321 pgcnt = page_get_pagecnt(page->p_szc);
1322 pgshft = page_get_shift(page->p_szc);
1323 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1324
1325 /* First page in large page */
1326 pgstart = p_index & ~pgmsk;
1327 /* Last page in large page */
1328 pgend = pgstart + pgcnt - 1;
1329 /*
				 * Artificially end the page if it extends
				 * past the end of the mapping.
1332 */
1333 if (pgend > p_end)
1334 pgend = p_end;
1335
1336 /*
1337 * Compute number of pages from large page
1338 * which are mapped.
1339 */
1340 pgcnt = pgend - p_index + 1;
1341
1342 /*
				 * Point indices at the page after the large page,
1344 * or at page after end of mapping.
1345 */
1346 p_index += pgcnt;
1347 s_index += pgcnt;
1348 } else {
1349 p_index++;
1350 s_index++;
1351 }
1352
1353 /*
1354 * Pages on the free list aren't counted for the rss.
1355 */
1356 if (PP_ISFREE(page))
1357 continue;
1358
1359 /*
1360 * Assume anon structs with a refcnt
1361 * of 1 are not COW shared, so there
1362 * is no reason to track them per entity.
1363 */
1364 if (cnt == 1) {
1365 panon += pgcnt;
1366 continue;
1367 }
1368 for (entity = vmu_entities; entity != NULL;
1369 entity = entity->vme_next_calc) {
1370
1371 result = &entity->vme_result;
1372 /*
1373 * Track COW anons per entity so
1374 * they are not double counted.
1375 */
1376 if (vmu_find_insert_anon(entity, ap) == 0)
1377 continue;
1378
1379 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1380 result->vmu_rss_private +=
1381 (pgcnt << PAGESHIFT);
1382 }
1383 }
1384 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1385 }
1386
1387 /* Add up resident anon and swap reserved for private mappings */
1388 if (swresv > 0 || panon > 0) {
1389 for (entity = vmu_entities; entity != NULL;
1390 entity = entity->vme_next_calc) {
1391 result = &entity->vme_result;
1392 result->vmu_swap_all += swresv;
1393 result->vmu_swap_private += swresv;
1394 result->vmu_rss_all += (panon << PAGESHIFT);
1395 result->vmu_rss_private += (panon << PAGESHIFT);
1396 }
1397 }
1398
1399 /* Compute resident pages backing shared amp or named vnode */
1400 if (shared_object != NULL) {
1401 avl_tree_t *tree = &(shared_object->vmo_bounds);
1402
1403 if (first == NULL) {
1404 /*
1405 * No private amp, or private amp has no anon
1406 * structs. This means entire segment is backed by
1407 * the shared object.
1408 */
1409 first = vmu_alloc_bound();
1410 first->vmb_start = s_start;
1411 first->vmb_end = s_end;
1412 first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1413 }
1414 /*
1415 * Iterate bounds not backed by private amp, and compute
1416 * resident pages.
1417 */
1418 cur = first;
1419 while (cur != NULL) {
1420
1421 if (vmu_insert_lookup_object_bounds(shared_object,
1422 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1423 &first, &last) > 0) {
1424 /* new bounds, find incore/not-incore */
1425 if (shared_object->vmo_type ==
1426 VMUSAGE_TYPE_VNODE) {
1427 vmu_vnode_update_incore_bounds(
1428 tree,
1429 (vnode_t *)
1430 shared_object->vmo_key, &first,
1431 &last);
1432 } else {
1433 vmu_amp_update_incore_bounds(
1434 tree,
1435 (struct anon_map *)
1436 shared_object->vmo_key, &first,
1437 &last, incore);
1438 }
1439 vmu_merge_bounds(tree, &first, &last);
1440 }
1441 for (entity = vmu_entities; entity != NULL;
1442 entity = entity->vme_next_calc) {
1443 avl_tree_t *e_tree;
1444
1445 result = &entity->vme_result;
1446
1447 entity_object = vmu_find_insert_object(
1448 shared_object->vmo_type ==
1449 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1450 entity->vme_amp_hash,
1451 shared_object->vmo_key,
1452 shared_object->vmo_type);
1453
1454 virt = vmu_insert_lookup_object_bounds(
1455 entity_object, cur->vmb_start, cur->vmb_end,
1456 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1457
1458 if (virt == 0)
1459 continue;
1460 /*
1461 * Range visited for this entity
1462 */
1463 e_tree = &(entity_object->vmo_bounds);
1464 rss = vmu_update_bounds(e_tree, &e_first,
1465 &e_last, tree, first, last);
1466 result->vmu_rss_all += (rss << PAGESHIFT);
1467 if (shared == B_TRUE && file == B_FALSE) {
1468 /* shared anon mapping */
1469 result->vmu_swap_all +=
1470 (virt << PAGESHIFT);
1471 result->vmu_swap_shared +=
1472 (virt << PAGESHIFT);
1473 result->vmu_rss_shared +=
1474 (rss << PAGESHIFT);
1475 } else if (shared == B_TRUE && file == B_TRUE) {
1476 /* shared file mapping */
1477 result->vmu_rss_shared +=
1478 (rss << PAGESHIFT);
1479 } else if (shared == B_FALSE &&
1480 file == B_TRUE) {
1481 /* private file mapping */
1482 result->vmu_rss_private +=
1483 (rss << PAGESHIFT);
1484 }
1485 vmu_merge_bounds(e_tree, &e_first, &e_last);
1486 }
1487 tmp = cur;
1488 cur = cur->vmb_next;
1489 vmu_free_bound(tmp);
1490 }
1491 }
1492 }
1493
1494 /*
 * Based on the current calculation flags, find the entities which are
 * relevant to the process.  Then calculate each segment in the process's
 * address space for each relevant entity.
1498 */
1499 static void
1500 vmu_calculate_proc(proc_t *p)
1501 {
1502 vmu_entity_t *entities = NULL;
1503 vmu_zone_t *zone;
1504 vmu_entity_t *tmp;
1505 struct as *as;
1506 struct seg *seg;
1507 int ret;
1508
1509 /* Figure out which entities are being computed */
1510 if ((vmu_data.vmu_system) != NULL) {
1511 tmp = vmu_data.vmu_system;
1512 tmp->vme_next_calc = entities;
1513 entities = tmp;
1514 }
1515 if (vmu_data.vmu_calc_flags &
1516 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1517 VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1518 VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1519 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1520 VMUSAGE_ALL_EUSERS)) {
1521 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1522 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1523 (mod_hash_val_t *)&zone);
1524 if (ret != 0) {
1525 zone = vmu_alloc_zone(p->p_zone->zone_id);
1526 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1527 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1528 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1529 ASSERT(ret == 0);
1530 }
1531 if (zone->vmz_zone != NULL) {
1532 tmp = zone->vmz_zone;
1533 tmp->vme_next_calc = entities;
1534 entities = tmp;
1535 }
1536 if (vmu_data.vmu_calc_flags &
1537 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1538 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1539 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1540 zone->vmz_id);
1541 tmp->vme_next_calc = entities;
1542 entities = tmp;
1543 }
1544 if (vmu_data.vmu_calc_flags &
1545 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1546 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1547 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1548 tmp->vme_next_calc = entities;
1549 entities = tmp;
1550 }
1551 if (vmu_data.vmu_calc_flags &
1552 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1553 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1554 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1555 tmp->vme_next_calc = entities;
1556 entities = tmp;
1557 }
1558 if (vmu_data.vmu_calc_flags &
1559 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1560 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1561 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1562 tmp->vme_next_calc = entities;
1563 entities = tmp;
1564 }
1565 }
1566 /* Entities which collapse projects and users for all zones */
1567 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1568 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1569 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1570 tmp->vme_next_calc = entities;
1571 entities = tmp;
1572 }
1573 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1574 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1575 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1576 tmp->vme_next_calc = entities;
1577 entities = tmp;
1578 }
1579 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1580 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1581 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1582 tmp->vme_next_calc = entities;
1583 entities = tmp;
1584 }
1585
1586 ASSERT(entities != NULL);
1587 /* process all segs in process's address space */
1588 as = p->p_as;
1589 AS_LOCK_ENTER(as, RW_READER);
1590 for (seg = AS_SEGFIRST(as); seg != NULL;
1591 seg = AS_SEGNEXT(as, seg)) {
1592 vmu_calculate_seg(entities, seg);
1593 }
1594 AS_LOCK_EXIT(as);
1595 }
1596
1597 /*
1598 * Free data created by previous call to vmu_calculate().
1599 */
1600 static void
1601 vmu_clear_calc()
1602 {
1603 if (vmu_data.vmu_system != NULL)
1604 vmu_free_entity(vmu_data.vmu_system);
1605 vmu_data.vmu_system = NULL;
1606 if (vmu_data.vmu_zones_hash != NULL)
1607 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1608 if (vmu_data.vmu_projects_col_hash != NULL)
1609 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1610 if (vmu_data.vmu_rusers_col_hash != NULL)
1611 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1612 if (vmu_data.vmu_eusers_col_hash != NULL)
1613 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1614
1615 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1616 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1617 }
1618
1619 /*
1620 * Free unused data structures. These can result if the system workload
1621 * decreases between calculations.
1622 */
1623 static void
1624 vmu_free_extra()
1625 {
1626 vmu_bound_t *tb;
1627 vmu_object_t *to;
1628 vmu_entity_t *te;
1629 vmu_zone_t *tz;
1630
1631 while (vmu_data.vmu_free_bounds != NULL) {
1632 tb = vmu_data.vmu_free_bounds;
1633 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1634 kmem_cache_free(vmu_bound_cache, tb);
1635 }
1636 while (vmu_data.vmu_free_objects != NULL) {
1637 to = vmu_data.vmu_free_objects;
1638 vmu_data.vmu_free_objects =
1639 vmu_data.vmu_free_objects->vmo_next;
1640 kmem_cache_free(vmu_object_cache, to);
1641 }
1642 while (vmu_data.vmu_free_entities != NULL) {
1643 te = vmu_data.vmu_free_entities;
1644 vmu_data.vmu_free_entities =
1645 vmu_data.vmu_free_entities->vme_next;
1646 if (te->vme_vnode_hash != NULL)
1647 mod_hash_destroy_hash(te->vme_vnode_hash);
1648 if (te->vme_amp_hash != NULL)
1649 mod_hash_destroy_hash(te->vme_amp_hash);
1650 VERIFY(avl_first(&te->vme_anon) == NULL);
1651 kmem_free(te, sizeof (vmu_entity_t));
1652 }
1653 while (vmu_data.vmu_free_zones != NULL) {
1654 tz = vmu_data.vmu_free_zones;
1655 vmu_data.vmu_free_zones =
1656 vmu_data.vmu_free_zones->vmz_next;
1657 if (tz->vmz_projects_hash != NULL)
1658 mod_hash_destroy_hash(tz->vmz_projects_hash);
1659 if (tz->vmz_tasks_hash != NULL)
1660 mod_hash_destroy_hash(tz->vmz_tasks_hash);
1661 if (tz->vmz_rusers_hash != NULL)
1662 mod_hash_destroy_hash(tz->vmz_rusers_hash);
1663 if (tz->vmz_eusers_hash != NULL)
1664 mod_hash_destroy_hash(tz->vmz_eusers_hash);
1665 kmem_free(tz, sizeof (vmu_zone_t));
1666 }
1667 }
1668
1669 extern kcondvar_t *pr_pid_cv;
1670
1671 /*
1672 * Determine which entity types are relevant and allocate the hashes to
1673 * track them. Then walk the process table and count rss and swap
 * for each process's address space.  Address space objects such as
 * vnodes, amps and anons are tracked per entity, so that they are
 * not double counted in the results.
1678 */
1679 static void
1680 vmu_calculate()
1681 {
1682 int i = 0;
1683 int ret;
1684 proc_t *p;
1685
1686 vmu_clear_calc();
1687
1688 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1689 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1690 ALL_ZONES);
1691
1692 /*
1693 * Walk process table and calculate rss of each proc.
1694 *
1695 * Pidlock and p_lock cannot be held while doing the rss calculation.
1696 * This is because:
1697 * 1. The calculation allocates using KM_SLEEP.
1698 * 2. The calculation grabs a_lock, which cannot be grabbed
1699 * after p_lock.
1700 *
1701 * Since pidlock must be dropped, we cannot simply just walk the
1702 * practive list. Instead, we walk the process table, and sprlock
1703 * each process to ensure that it does not exit during the
1704 * calculation.
1705 */
1706
1707 mutex_enter(&pidlock);
1708 for (i = 0; i < v.v_proc; i++) {
1709 again:
1710 p = pid_entry(i);
1711 if (p == NULL)
1712 continue;
1713
1714 mutex_enter(&p->p_lock);
1715 mutex_exit(&pidlock);
1716
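		/* Bail out if the system has panicked. */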
1717 if (panicstr) {
1718 mutex_exit(&p->p_lock);
1719 return;
1720 }
1721
1722 /* Try to set P_PR_LOCK */
1723 ret = sprtrylock_proc(p);
1724 if (ret == -1) {
1725 /* Process in invalid state */
1726 mutex_exit(&p->p_lock);
1727 mutex_enter(&pidlock);
1728 continue;
1729 } else if (ret == 1) {
1730 /*
1731 * P_PR_LOCK is already set. Wait and try again.
1732 * This also drops p_lock.
1733 */
1734 sprwaitlock_proc(p);
1735 mutex_enter(&pidlock);
1736 goto again;
1737 }
1738 mutex_exit(&p->p_lock);
1739
1740 vmu_calculate_proc(p);
1741
1742 mutex_enter(&p->p_lock);
1743 sprunlock(p);
1744 mutex_enter(&pidlock);
1745 }
1746 mutex_exit(&pidlock);
1747
1748 vmu_free_extra();
1749 }
1750
1751 /*
1752 * allocate a new cache for N results satisfying flags
1753 */
1754 vmu_cache_t *
1755 vmu_cache_alloc(size_t nres, uint_t flags)
1756 {
1757 vmu_cache_t *cache;
1758
1759 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1760 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1761 cache->vmc_nresults = nres;
1762 cache->vmc_flags = flags;
1763 cache->vmc_refcnt = 1;
1764 return (cache);
1765 }
1766
1767 /*
1768 * Make sure cached results are not freed
1769 */
1770 static void
1771 vmu_cache_hold(vmu_cache_t *cache)
1772 {
1773 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1774 cache->vmc_refcnt++;
1775 }
1776
1777 /*
1778 * free cache data
1779 */
1780 static void
1781 vmu_cache_rele(vmu_cache_t *cache)
1782 {
1783 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1784 ASSERT(cache->vmc_refcnt > 0);
1785 cache->vmc_refcnt--;
1786 if (cache->vmc_refcnt == 0) {
1787 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1788 cache->vmc_nresults);
1789 kmem_free(cache, sizeof (vmu_cache_t));
1790 }
1791 }
1792
1793 /*
1794 * When new data is calculated, update the phys_mem rctl usage value in the
1795 * zones.
1796 */
1797 static void
1798 vmu_update_zone_rctls(vmu_cache_t *cache)
1799 {
1800 vmusage_t *rp;
1801 size_t i = 0;
1802 zone_t *zp;
1803
1804 for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1805 if (rp->vmu_type == VMUSAGE_ZONE &&
1806 rp->vmu_zoneid != ALL_ZONES) {
1807 if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1808 zp->zone_phys_mem = rp->vmu_rss_all;
1809 zone_rele(zp);
1810 }
1811 }
1812 }
1813 }
1814
1815 /*
 * Copy out the cached results to a caller.  Inspect the caller's flags
1817 * and zone to determine which cached results should be copied.
1818 */
1819 static int
1820 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1821 uint_t flags, id_t req_zone_id, int cpflg)
1822 {
1823 vmusage_t *result, *out_result;
1824 vmusage_t dummy;
1825 size_t i, count = 0;
1826 size_t bufsize;
1827 int ret = 0;
1828 uint_t types = 0;
1829
1830 if (nres != NULL) {
1831 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1832 return (set_errno(EFAULT));
1833 } else {
1834 bufsize = 0;
1835 }
1836
1837 /* figure out what results the caller is interested in. */
1838 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1839 types |= VMUSAGE_SYSTEM;
1840 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1841 types |= VMUSAGE_ZONE;
1842 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1843 VMUSAGE_COL_PROJECTS))
1844 types |= VMUSAGE_PROJECTS;
1845 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1846 types |= VMUSAGE_TASKS;
1847 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1848 types |= VMUSAGE_RUSERS;
1849 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1850 types |= VMUSAGE_EUSERS;
1851
	/* Count (and optionally copy out) the results visible to the caller */
1853 out_result = buf;
1854 for (result = cache->vmc_results, i = 0;
1855 i < cache->vmc_nresults; result++, i++) {
1856
1857 /* Do not return "other-zone" results to non-global zones */
1858 if (curproc->p_zone != global_zone &&
1859 curproc->p_zone->zone_id != result->vmu_zoneid)
1860 continue;
1861
1862 /*
1863 * If non-global zone requests VMUSAGE_SYSTEM, fake
1864 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1865 */
1866 if (curproc->p_zone != global_zone &&
1867 (flags & VMUSAGE_SYSTEM) != 0 &&
1868 result->vmu_type == VMUSAGE_ZONE) {
1869 count++;
1870 if (out_result != NULL) {
1871 if (bufsize < count) {
1872 ret = set_errno(EOVERFLOW);
1873 } else {
1874 dummy = *result;
1875 dummy.vmu_zoneid = ALL_ZONES;
1876 dummy.vmu_id = 0;
1877 dummy.vmu_type = VMUSAGE_SYSTEM;
1878 if (ddi_copyout(&dummy, out_result,
1879 sizeof (vmusage_t), cpflg))
1880 return (set_errno(EFAULT));
1881 out_result++;
1882 }
1883 }
1884 }
1885
1886 /* Skip results that do not match requested type */
1887 if ((result->vmu_type & types) == 0)
1888 continue;
1889
1890 /* Skip collated results if not requested */
1891 if (result->vmu_zoneid == ALL_ZONES) {
1892 if (result->vmu_type == VMUSAGE_PROJECTS &&
1893 (flags & VMUSAGE_COL_PROJECTS) == 0)
1894 continue;
1895 if (result->vmu_type == VMUSAGE_EUSERS &&
1896 (flags & VMUSAGE_COL_EUSERS) == 0)
1897 continue;
1898 if (result->vmu_type == VMUSAGE_RUSERS &&
1899 (flags & VMUSAGE_COL_RUSERS) == 0)
1900 continue;
1901 }
1902
1903 if (result->vmu_type == VMUSAGE_ZONE &&
1904 flags & VMUSAGE_A_ZONE) {
1905 /* Skip non-requested zone results */
1906 if (result->vmu_zoneid != req_zone_id)
1907 continue;
1908 } else {
1909 /* Skip "other zone" results if not requested */
1910 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1911 if (result->vmu_type == VMUSAGE_ZONE &&
1912 (flags & VMUSAGE_ALL_ZONES) == 0)
1913 continue;
1914 if (result->vmu_type == VMUSAGE_PROJECTS &&
1915 (flags & (VMUSAGE_ALL_PROJECTS |
1916 VMUSAGE_COL_PROJECTS)) == 0)
1917 continue;
1918 if (result->vmu_type == VMUSAGE_TASKS &&
1919 (flags & VMUSAGE_ALL_TASKS) == 0)
1920 continue;
1921 if (result->vmu_type == VMUSAGE_RUSERS &&
1922 (flags & (VMUSAGE_ALL_RUSERS |
1923 VMUSAGE_COL_RUSERS)) == 0)
1924 continue;
1925 if (result->vmu_type == VMUSAGE_EUSERS &&
1926 (flags & (VMUSAGE_ALL_EUSERS |
1927 VMUSAGE_COL_EUSERS)) == 0)
1928 continue;
1929 }
1930 }
1931 count++;
1932 if (out_result != NULL) {
1933 if (bufsize < count) {
1934 ret = set_errno(EOVERFLOW);
1935 } else {
1936 if (ddi_copyout(result, out_result,
1937 sizeof (vmusage_t), cpflg))
1938 return (set_errno(EFAULT));
1939 out_result++;
1940 }
1941 }
1942 }
1943 if (nres != NULL)
1944 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1945 return (set_errno(EFAULT));
1946
1947 return (ret);
1948 }
1949
1950 /*
1951 * vm_getusage()
1952 *
1953 * Counts rss and swap by zone, project, task, and/or user. The flags argument
1954 * determines the types of result structures returned. Flags requesting
1955 * results from more than one zone are "flattened" to the local zone if the
1956 * caller is not the global zone.
1957 *
1958 * args:
1959 * flags: bitmap consisting of one or more of VMUSAGE_*.
1960 * age: maximum allowable age (in seconds) of the results, i.e. the
1961 * time since the counting was done. Results from previous callers
1962 * are cached in the kernel.
1963 * buf: pointer to buffer array of vmusage_t. If NULL, then only nres
1964 * is set on success.
1965 * nres: on entry, the number of vmusage_t structures that buf can
1966 * hold.
1967 * On return 0 (success) or EOVERFLOW, set to the number of
1968 * result structures returned or attempted to return.
1969 *
1970 * returns 0 on success, -1 on failure:
1971 * EINTR (interrupted)
1972 * EOVERFLOW (nres too small for results, nres set to needed value)
1973 * EINVAL (flags invalid)
1974 * EFAULT (bad address for buf or nres)
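 *
 * Illustrative usage sketch (userland consumers normally reach this code
 * through the getvmusage(2) wrapper; the buffer size below is arbitrary):
 *
 *	size_t nres = 32;
 *	vmusage_t buf[32];
 *
 *	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, buf, &nres) != 0 &&
 *	    errno == EOVERFLOW) {
 *		grow buf to hold at least nres entries and retry
 *	}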
1975 */
1976 int
1977 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1978 {
1979 vmu_entity_t *entity;
1980 vmusage_t *result;
1981 int ret = 0;
1982 int cacherecent = 0;
1983 hrtime_t now;
1984 uint_t flags_orig;
1985 id_t req_zone_id;
1986
1987 /*
1988 * Non-global zones cannot request system-wide or collated results, the
1989 * VMUSAGE_SYSTEM result, or the usage of another zone, so munge the
1990 * flags accordingly.
1991 */
1992 flags_orig = flags;
1993 if (curproc->p_zone != global_zone) {
1994 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1995 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1996 flags |= VMUSAGE_PROJECTS;
1997 }
1998 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1999 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
2000 flags |= VMUSAGE_RUSERS;
2001 }
2002 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
2003 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
2004 flags |= VMUSAGE_EUSERS;
2005 }
2006 if (flags & VMUSAGE_SYSTEM) {
2007 flags &= ~VMUSAGE_SYSTEM;
2008 flags |= VMUSAGE_ZONE;
2009 }
2010 if (flags & VMUSAGE_A_ZONE) {
2011 flags &= ~VMUSAGE_A_ZONE;
2012 flags |= VMUSAGE_ZONE;
2013 }
2014 }
2015
2016 /* Check for unknown flags */
2017 if ((flags & (~VMUSAGE_MASK)) != 0)
2018 return (set_errno(EINVAL));
2019
2020 /* Check for no flags */
2021 if ((flags & VMUSAGE_MASK) == 0)
2022 return (set_errno(EINVAL));
2023
2024 /* If requesting results for a specific zone, get the zone ID */
2025 if (flags & VMUSAGE_A_ZONE) {
2026 size_t bufsize;
2027 vmusage_t zreq;
2028
2029 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2030 return (set_errno(EFAULT));
2031 /* Requested zone ID is passed in buf, so 0 len not allowed */
2032 if (bufsize == 0)
2033 return (set_errno(EINVAL));
2034 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2035 return (set_errno(EFAULT));
2036 req_zone_id = zreq.vmu_id;
2037 }
2038
2039 mutex_enter(&vmu_data.vmu_lock);
2040 now = gethrtime();
2041
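	/*
	 * Three paths are possible from here: copy out sufficiently recent
	 * cached results, become the calculating thread if none is running,
	 * or wait for the in-progress calculation to finish and then
	 * re-evaluate from start.
	 */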
2042 start:
2043 if (vmu_data.vmu_cache != NULL) {
2044
2045 vmu_cache_t *cache;
2046
2047 if ((vmu_data.vmu_cache->vmc_timestamp +
2048 ((hrtime_t)age * NANOSEC)) > now)
2049 cacherecent = 1;
2050
2051 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2052 cacherecent == 1) {
2053 cache = vmu_data.vmu_cache;
2054 vmu_cache_hold(cache);
2055 mutex_exit(&vmu_data.vmu_lock);
2056
2057 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2058 req_zone_id, cpflg);
2059 mutex_enter(&vmu_data.vmu_lock);
2060 vmu_cache_rele(cache);
2061 if (vmu_data.vmu_pending_waiters > 0)
2062 cv_broadcast(&vmu_data.vmu_cv);
2063 mutex_exit(&vmu_data.vmu_lock);
2064 return (ret);
2065 }
2066 /*
2067 * If the cache is recent, it is likely that there are other
2068 * consumers of vm_getusage running, so add their flags to the
2069 * desired flags for the calculation.
2070 */
2071 if (cacherecent == 1)
2072 flags = vmu_data.vmu_cache->vmc_flags | flags;
2073 }
2074 if (vmu_data.vmu_calc_thread == NULL) {
2075
2076 vmu_cache_t *cache;
2077
2078 vmu_data.vmu_calc_thread = curthread;
2079 vmu_data.vmu_calc_flags = flags;
2080 vmu_data.vmu_entities = NULL;
2081 vmu_data.vmu_nentities = 0;
2082 if (vmu_data.vmu_pending_waiters > 0)
2083 vmu_data.vmu_calc_flags |=
2084 vmu_data.vmu_pending_flags;
2085
2086 vmu_data.vmu_pending_flags = 0;
2087 mutex_exit(&vmu_data.vmu_lock);
2088 vmu_calculate();
2089 mutex_enter(&vmu_data.vmu_lock);
2090 /* copy results to cache */
2091 if (vmu_data.vmu_cache != NULL)
2092 vmu_cache_rele(vmu_data.vmu_cache);
2093 cache = vmu_data.vmu_cache =
2094 vmu_cache_alloc(vmu_data.vmu_nentities,
2095 vmu_data.vmu_calc_flags);
2096
2097 result = cache->vmc_results;
2098 for (entity = vmu_data.vmu_entities; entity != NULL;
2099 entity = entity->vme_next) {
2100 *result = entity->vme_result;
2101 result++;
2102 }
2103 cache->vmc_timestamp = gethrtime();
2104 vmu_cache_hold(cache);
2105
2106 vmu_data.vmu_calc_flags = 0;
2107 vmu_data.vmu_calc_thread = NULL;
2108
2109 if (vmu_data.vmu_pending_waiters > 0)
2110 cv_broadcast(&vmu_data.vmu_cv);
2111
2112 mutex_exit(&vmu_data.vmu_lock);
2113
2114 /* update zone's phys. mem. rctl usage */
2115 vmu_update_zone_rctls(cache);
2116 /* copy cache */
2117 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2118 req_zone_id, cpflg);
2119 mutex_enter(&vmu_data.vmu_lock);
2120 vmu_cache_rele(cache);
2121 mutex_exit(&vmu_data.vmu_lock);
2122
2123 return (ret);
2124 }
2125 vmu_data.vmu_pending_flags |= flags;
2126 vmu_data.vmu_pending_waiters++;
2127 while (vmu_data.vmu_calc_thread != NULL) {
2128 if (cv_wait_sig(&vmu_data.vmu_cv,
2129 &vmu_data.vmu_lock) == 0) {
2130 vmu_data.vmu_pending_waiters--;
2131 mutex_exit(&vmu_data.vmu_lock);
2132 return (set_errno(EINTR));
2133 }
2134 }
2135 vmu_data.vmu_pending_waiters--;
2136 goto start;
2137 }
2138
2139 #if defined(__x86)
2140 /*
2141 * Attempt to invalidate all of the pages in the mapping for the given process.
2142 */
2143 static void
2144 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2145 {
2146 page_t *pp;
2147 size_t psize;
2148 u_offset_t off;
2149 caddr_t eaddr;
2150 struct vnode *vp;
2151 struct segvn_data *svd;
2152 struct hat *victim_hat;
2153
2154 ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2155
2156 victim_hat = p->p_as->a_hat;
2157 svd = (struct segvn_data *)seg->s_data;
2158 vp = svd->vp;
2159 psize = page_get_pagesize(seg->s_szc);
2160
2161 off = svd->offset + (uintptr_t)(addr - seg->s_base);
2162
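	/*
	 * Walk the mapping in steps of the segment's (possibly large) page
	 * size, skipping pages that are held (nonzero p_lckcnt or p_cowcnt).
	 */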
2163 for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2164 pp = page_lookup_nowait(vp, off, SE_SHARED);
2165
2166 if (pp != NULL) {
2167 /* following logic based on pvn_getdirty() */
2168
2169 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2170 page_unlock(pp);
2171 continue;
2172 }
2173
2174 page_io_lock(pp);
2175 hat_page_inval(pp, 0, victim_hat);
2176 page_io_unlock(pp);
2177
2178 /*
2179 * For B_INVALCURONLY-style handling we let
2180 * page_release call VN_DISPOSE if no one else is using
2181 * the page.
2182 *
2183 * A hat_ismod() check would be useless because:
2184 * (1) we are not holding the SE_EXCL lock
2185 * (2) we've not unloaded _all_ translations
2186 *
2187 * Let page_release() do the heavy-lifting.
2188 */
2189 (void) page_release(pp, 1);
2190 }
2191 }
2192 }
2193
2194 /*
2195 * vm_map_inval()
2196 *
2197 * Invalidate as many pages as possible within the given mapping for the given
2198 * process. addr is expected to be the base address of the mapping and size is
2199 * the length of the mapping. In some cases a mapping will encompass an
2200 * entire segment, but at least for anon or stack mappings, these will be
2201 * regions within a single large segment. Thus, the invalidation is oriented
2202 * around a single mapping and not an entire segment.
2203 *
2204 * The SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling,
2205 * so this code is only applicable to x86.
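 *
 * Illustrative call sketch (hypothetical values; addr must be page aligned
 * and the range [addr, addr + size) must lie within a single segment of
 * the target process):
 *
 *	error = vm_map_inval(pid, mapbase, maplen);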
2206 */
2207 int
2208 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2209 {
2210 int ret;
2211 int error = 0;
2212 proc_t *p; /* target proc */
2213 struct as *as; /* target proc's address space */
2214 struct seg *seg; /* working segment */
2215
2216 if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2217 return (set_errno(EPERM));
2218
2219 /* If not a valid mapping address, return an error */
2220 if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2221 return (set_errno(EINVAL));
2222
2223 again:
2224 mutex_enter(&pidlock);
2225 p = prfind(pid);
2226 if (p == NULL) {
2227 mutex_exit(&pidlock);
2228 return (set_errno(ESRCH));
2229 }
2230
2231 mutex_enter(&p->p_lock);
2232 mutex_exit(&pidlock);
2233
2234 if (panicstr != NULL) {
2235 mutex_exit(&p->p_lock);
2236 return (0);
2237 }
2238
2239 as = p->p_as;
2240
2241 /*
2242 * Try to set P_PR_LOCK - prevents the process from "changing shape"
2243 * - blocks fork
2244 * - blocks SIGKILL
2245 * - cannot be a system proc
2246 * - must be a fully created proc
2247 */
2248 ret = sprtrylock_proc(p);
2249 if (ret == -1) {
2250 /* Process in invalid state */
2251 mutex_exit(&p->p_lock);
2252 return (set_errno(ESRCH));
2253 }
2254
2255 if (ret == 1) {
2256 /*
2257 * P_PR_LOCK is already set. Wait and try again. This also
2258 * drops p_lock so p may no longer be valid since the proc may
2259 * have exited.
2260 */
2261 sprwaitlock_proc(p);
2262 goto again;
2263 }
2264
2265 /* P_PR_LOCK is now set */
2266 mutex_exit(&p->p_lock);
2267
2268 AS_LOCK_ENTER(as, RW_READER);
2269 if ((seg = as_segat(as, addr)) == NULL) {
2270 AS_LOCK_EXIT(as);
2271 mutex_enter(&p->p_lock);
2272 sprunlock(p);
2273 return (set_errno(ENOMEM));
2274 }
2275
2276 /*
2277 * The invalidation behavior only makes sense for vnode-backed segments.
2278 */
2279 if (seg->s_ops != &segvn_ops) {
2280 AS_LOCK_EXIT(as);
2281 mutex_enter(&p->p_lock);
2282 sprunlock(p);
2283 return (0);
2284 }
2285
2286 /*
2287 * If the mapping is out of bounds of the segment, return an error.
2288 */
2289 if ((addr + size) > (seg->s_base + seg->s_size)) {
2290 AS_LOCK_EXIT(as);
2291 mutex_enter(&p->p_lock);
2292 sprunlock(p);
2293 return (set_errno(EINVAL));
2294 }
2295
2296 /*
2297 * Don't use the MS_INVALCURPROC flag here since that would eventually
2298 * initiate hat invalidation based on curthread. Since we're doing this
2299 * on behalf of a different process, that would erroneously invalidate
2300 * our own process mappings.
2301 */
2302 error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2303 if (error == 0) {
2304 /*
2305 * Since we didn't invalidate during the sync above, we now
2306 * try to invalidate all of the pages in the mapping.
2307 */
2308 map_inval(p, seg, addr, size);
2309 }
2310 AS_LOCK_EXIT(as);
2311
2312 mutex_enter(&p->p_lock);
2313 sprunlock(p);
2314
2315 if (error)
2316 (void) set_errno(error);
2317 return (error);
2318 }
2319 #endif