1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright 2016, Joyent, Inc.
29 */
30
31 /*
32 * vm_usage
33 *
34 * This file implements the getvmusage() private system call.
35 * getvmusage() counts the amount of resident memory pages and swap
36 * reserved by the specified process collective. A "process collective" is
37 * the set of processes owned by a particular zone, project, task, or user.
38 *
39 * rss and swap are counted so that for a given process collective, a page is
40 * only counted once. For example, this means that if multiple processes in
41 * the same project map the same page, then the project will only be charged
42 * once for that page. On the other hand, if two processes in different
43 * projects map the same page, then both projects will be charged
44 * for the page.
45 *
46 * The vm_getusage() calculation is implemented so that the first thread
47 * performs the rss/swap counting. Other callers will wait for that thread to
48 * finish and then copy its results. This enables multiple rcapds and prstats to
49 * consume data from the same calculation. The results are also cached so that
50 * a caller interested in recent results can just copy them instead of starting
51 * a new calculation. The caller passes the maximum age (in seconds) of the
52 * data. If the cached data is young enough, the cache is copied, otherwise,
53 * a new calculation is executed and the cache is replaced with the new
54 * data.
55 *
56 * The rss calculation for each process collective is as follows:
57 *
58 * - Inspect flags, determine if counting rss for zones, projects, tasks,
59 * and/or users.
60 * - For each proc:
61 * - Figure out proc's collectives (zone, project, task, and/or user).
62 * - For each seg in proc's address space:
63 * - If seg is private:
64 * - Lookup anons in the amp.
65 * - For incore pages not previously visited for each of the
66 * proc's collectives, add incore pagesize to each
67 * collective.
68 * Anons with a refcnt of 1 can be assumed to be not
69 * previously visited.
70 * - For address ranges without anons in the amp:
71 * - Lookup pages in underlying vnode.
72 * - For incore pages not previously visited for
73 * each of the proc's collectives, add incore
74 * pagesize to each collective.
75 * - If seg is shared:
76 * - Lookup pages in the shared amp or vnode.
77 * - For incore pages not previously visited for each of
78 * the proc's collectives, add incore pagesize to each
79 * collective.
80 *
81 * Swap is reserved by private segments, and shared anonymous segments.
82 * The only shared anon segments which do not reserve swap are ISM segments
83 * and schedctl segments, both of which can be identified by having
84 * amp->swresv == 0.
85 *
86 * The swap calculation for each collective is as follows:
87 *
88 * - Inspect flags, determine if counting swap for zones, projects, tasks,
89 * and/or users.
90 * - For each proc:
91 * - Figure out proc's collectives (zone, project, task, and/or user).
92 * - For each seg in proc's address space:
93 * - If seg is private:
94 * - Add svd->swresv pages to swap count for each of the
95 * proc's collectives.
96 * - If seg is anon, shared, and amp->swresv != 0
97 * - For address ranges in amp not previously visited for
98 * each of the proc's collectives, add size of address
99 * range to the swap count for each collective.
100 *
101 * These two calculations are done simultaneously, with most of the work
102 * being done in vmu_calculate_seg(). The results of the calculation are
103 * copied into "vmu_data.vmu_cache_results".
104 *
105 * To perform the calculation, various things are tracked and cached:
106 *
107 * - incore/not-incore page ranges for all vnodes.
108 * (vmu_data.vmu_all_vnodes_hash)
109 * This eliminates looking up the same page more than once.
110 *
111 * - incore/not-incore page ranges for all shared amps.
112 * (vmu_data.vmu_all_amps_hash)
113 * This eliminates looking up the same page more than once.
114 *
115 * - visited page ranges for each collective.
116 * - per vnode (entity->vme_vnode_hash)
117 * - per shared amp (entity->vme_amp_hash)
118 * For accurate counting of map-shared and COW-shared pages.
119 *
120 * - visited private anons (refcnt > 1) for each collective.
121 * (entity->vme_anon_hash)
122 * For accurate counting of COW-shared pages.
123 *
124 * The common accounting structure is the vmu_entity_t, which represents
125 * collectives:
126 *
127 * - A zone.
128 * - A project, task, or user within a zone.
129 * - The entire system (vmu_data.vmu_system).
130 * - Each collapsed (col) project and user. This means a given projid or
131 * uid, regardless of which zone the process is in. For instance,
132 *    project 0 in the global zone and project 0 in a non-global zone are
133 * the same collapsed project.
134 *
135 * Each entity structure tracks which pages have been already visited for
136 * that entity (via previously inspected processes) so that these pages are
137 * not double counted.
138 */
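/*
 * Illustrative sketch (not part of this file): userland consumers such as
 * rcapd and prstat reach this code through the getvmusage(2) wrapper. A
 * typical caller probes for the number of results with a NULL buffer, then
 * allocates and fetches them; if the result set grows in between, the second
 * call fails with EOVERFLOW and updates *nres, so callers commonly retry
 * with a larger buffer. The function name below is hypothetical and error
 * and privilege handling is trimmed for brevity.
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	show_zone_rss(void)
 *	{
 *		size_t nres = 0;
 *		size_t i;
 *		vmusage_t *buf;
 *
 *		if (getvmusage(VMUSAGE_ALL_ZONES, 5, NULL, &nres) != 0)
 *			return (-1);
 *		if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
 *			return (-1);
 *		if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0)
 *			return (-1);
 *		for (i = 0; i < nres; i++)
 *			(void) printf("zone %d: rss %llu bytes\n",
 *			    (int)buf[i].vmu_zoneid,
 *			    (unsigned long long)buf[i].vmu_rss_all);
 *		free(buf);
 *		return (0);
 *	}
 */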
139
140 #include <sys/errno.h>
141 #include <sys/types.h>
142 #include <sys/zone.h>
143 #include <sys/proc.h>
144 #include <sys/project.h>
145 #include <sys/task.h>
146 #include <sys/thread.h>
147 #include <sys/time.h>
148 #include <sys/mman.h>
149 #include <sys/modhash.h>
150 #include <sys/modhash_impl.h>
151 #include <sys/shm.h>
152 #include <sys/swap.h>
153 #include <sys/synch.h>
154 #include <sys/systm.h>
155 #include <sys/var.h>
156 #include <sys/vm_usage.h>
157 #include <sys/zone.h>
158 #include <sys/sunddi.h>
159 #include <sys/avl.h>
160 #include <vm/anon.h>
161 #include <vm/as.h>
162 #include <vm/seg_vn.h>
163 #include <vm/seg_spt.h>
164
165 #define VMUSAGE_HASH_SIZE 512
166
167 #define VMUSAGE_TYPE_VNODE 1
168 #define VMUSAGE_TYPE_AMP 2
169 #define VMUSAGE_TYPE_ANON 3
170
171 #define VMUSAGE_BOUND_UNKNOWN 0
172 #define VMUSAGE_BOUND_INCORE 1
173 #define VMUSAGE_BOUND_NOT_INCORE 2
174
175 #define ISWITHIN(node, addr) ((node)->vmb_start <= (addr) && \
176 (node)->vmb_end >= (addr) ? 1 : 0)
177
178 /*
179 * bounds for vnodes and shared amps
180 * Each bound is either entirely incore, entirely not incore, or
181 * entirely unknown. Bounds are stored in an AVL tree sorted by the start
182 * member when in use; otherwise (on the free or temporary lists) they're
183 * strung together off of vmb_next.
184 */
185 typedef struct vmu_bound {
186 avl_node_t vmb_node;
187 struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
188 pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
189 pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
190 char vmb_type; /* One of VMUSAGE_BOUND_* */
191 } vmu_bound_t;
192
193 /*
194 * hash of visited objects (vnodes or shared amps)
195 * key is address of vnode or amp. Bounds lists known incore/non-incore
196 * bounds for vnode/amp.
197 */
198 typedef struct vmu_object {
199 struct vmu_object *vmo_next; /* free list */
200 caddr_t vmo_key;
201 short vmo_type;
202 avl_tree_t vmo_bounds;
203 } vmu_object_t;
204
205 /*
206 * Entity by which to count results.
207 *
208 * The entity structure keeps the current rss/swap counts for each entity
209 * (zone, project, etc), and hashes of vm structures that have already
210 * been visited for the entity.
211 *
212 * vme_next: links the list of all entities currently being counted by
213 * vmu_calculate().
214 *
215 * vme_next_calc: links the list of entities related to the current process
216 * being counted by vmu_calculate_proc().
217 *
218 * vmu_calculate_proc() walks all processes. For each process, it makes a
219 * list of the entities related to that process using vme_next_calc. This
220 * list changes each time vmu_calculate_proc() is called.
221 *
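 * For example (illustrative): when counting with VMUSAGE_ALL_ZONES |
 * VMUSAGE_ALL_PROJECTS, a process in zone 1, project 10 yields a
 * vme_next_calc list holding the zone-1 entity and the zone-1/project-10
 * entity; each of the process's segments is then charged to both entities.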
222 */
223 typedef struct vmu_entity {
224 struct vmu_entity *vme_next;
225 struct vmu_entity *vme_next_calc;
226 mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
227 mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
228 mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
229 vmusage_t vme_result; /* identifies entity and results */
230 } vmu_entity_t;
231
232 /*
233 * Hash of entities visited within a zone, and an entity for the zone
234 * itself.
235 */
236 typedef struct vmu_zone {
237 struct vmu_zone *vmz_next; /* free list */
238 id_t vmz_id;
239 vmu_entity_t *vmz_zone;
240 mod_hash_t *vmz_projects_hash;
241 mod_hash_t *vmz_tasks_hash;
242 mod_hash_t *vmz_rusers_hash;
243 mod_hash_t *vmz_eusers_hash;
244 } vmu_zone_t;
245
246 /*
247 * Cache of results from last calculation
248 */
249 typedef struct vmu_cache {
250 vmusage_t *vmc_results; /* Results from last call to */
251 /* vm_getusage(). */
252 uint64_t vmc_nresults; /* Count of cached results */
253 uint64_t vmc_refcnt; /* refcnt for free */
254 uint_t vmc_flags; /* Flags for vm_getusage() */
255 hrtime_t vmc_timestamp; /* when cache was created */
256 } vmu_cache_t;
257
258 /*
259 * top level rss info for the system
260 */
261 typedef struct vmu_data {
262 kmutex_t vmu_lock; /* Protects vmu_data */
263 kcondvar_t vmu_cv; /* Used to signal threads */
264 /* waiting for the rss */
265 /* calc thread to finish */
266 vmu_entity_t *vmu_system; /* Entity for tracking */
267 /* rss/swap for all processes */
268 /* in all zones */
269 mod_hash_t *vmu_zones_hash; /* Zones visited */
270 mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
271 mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
272 mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
273 /* to implement VMUSAGE_COL_* */
274 /* flags, which aggregate by */
275 /* project or user regardless */
276 /* of zoneid. */
277 mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
278 /* to track incore/not-incore */
279 mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
280 /* amps to track incore/not- */
281 /* incore */
282 vmu_entity_t *vmu_entities; /* Linked list of entities */
283 size_t vmu_nentities; /* Count of entities in list */
284 vmu_cache_t *vmu_cache; /* Cached results */
285 kthread_t *vmu_calc_thread; /* NULL, or thread running */
286 /* vmu_calculate() */
287 uint_t vmu_calc_flags; /* Flags being used by */
288 /* currently running calc */
289 /* thread */
290 uint_t vmu_pending_flags; /* Flags of vm_getusage() */
291 /* threads waiting for */
292 /* calc thread to finish */
293 uint_t vmu_pending_waiters; /* Number of threads waiting */
294 /* for calc thread */
295 vmu_bound_t *vmu_free_bounds;
296 vmu_object_t *vmu_free_objects;
297 vmu_entity_t *vmu_free_entities;
298 vmu_zone_t *vmu_free_zones;
299 } vmu_data_t;
300
301 extern struct as kas;
302 extern proc_t *practive;
303 extern zone_t *global_zone;
304 extern struct seg_ops segvn_ops;
305 extern struct seg_ops segspt_shmops;
306
307 static vmu_data_t vmu_data;
308 static kmem_cache_t *vmu_bound_cache;
309 static kmem_cache_t *vmu_object_cache;
310
311 /*
312 * Comparison routine for AVL tree. We base our comparison on vmb_start.
313 */
314 static int
315 bounds_cmp(const void *bnd1, const void *bnd2)
316 {
317 const vmu_bound_t *bound1 = bnd1;
318 const vmu_bound_t *bound2 = bnd2;
319
320 if (bound1->vmb_start == bound2->vmb_start) {
321 return (0);
322 }
323 if (bound1->vmb_start < bound2->vmb_start) {
324 return (-1);
325 }
326
327 return (1);
328 }
329
330 /*
331 * Save a bound on the free list.
332 */
333 static void
334 vmu_free_bound(vmu_bound_t *bound)
335 {
336 bound->vmb_next = vmu_data.vmu_free_bounds;
337 bound->vmb_start = 0;
338 bound->vmb_end = 0;
339 bound->vmb_type = 0;
340 vmu_data.vmu_free_bounds = bound;
341 }
342
343 /*
344 * Free an object, and all visited bound info.
345 */
346 static void
347 vmu_free_object(mod_hash_val_t val)
348 {
349 vmu_object_t *obj = (vmu_object_t *)val;
350 avl_tree_t *tree = &(obj->vmo_bounds);
351 vmu_bound_t *bound;
352 void *cookie = NULL;
353
354 while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
355 vmu_free_bound(bound);
356 avl_destroy(tree);
357
358 obj->vmo_type = 0;
359 obj->vmo_next = vmu_data.vmu_free_objects;
360 vmu_data.vmu_free_objects = obj;
361 }
362
363 /*
364 * Free an entity, and hashes of visited objects for that entity.
365 */
366 static void
367 vmu_free_entity(mod_hash_val_t val)
368 {
369 vmu_entity_t *entity = (vmu_entity_t *)val;
370
371 if (entity->vme_vnode_hash != NULL)
372 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
373 if (entity->vme_amp_hash != NULL)
374 i_mod_hash_clear_nosync(entity->vme_amp_hash);
375 if (entity->vme_anon_hash != NULL)
376 i_mod_hash_clear_nosync(entity->vme_anon_hash);
377
378 entity->vme_next = vmu_data.vmu_free_entities;
379 vmu_data.vmu_free_entities = entity;
380 }
381
382 /*
383 * Free zone entity, and all hashes of entities inside that zone,
384 * which are projects, tasks, and users.
385 */
386 static void
387 vmu_free_zone(mod_hash_val_t val)
388 {
389 vmu_zone_t *zone = (vmu_zone_t *)val;
390
391 if (zone->vmz_zone != NULL) {
392 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
393 zone->vmz_zone = NULL;
394 }
395 if (zone->vmz_projects_hash != NULL)
396 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
397 if (zone->vmz_tasks_hash != NULL)
398 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
399 if (zone->vmz_rusers_hash != NULL)
400 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
401 if (zone->vmz_eusers_hash != NULL)
402 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
403 zone->vmz_next = vmu_data.vmu_free_zones;
404 vmu_data.vmu_free_zones = zone;
405 }
406
407 /*
408 * Initialize synchronization primitives and hashes for system-wide tracking
409 * of visited vnodes and shared amps. Initialize results cache.
410 */
411 void
412 vm_usage_init()
413 {
414 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
415 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
416
417 vmu_data.vmu_system = NULL;
418 vmu_data.vmu_zones_hash = NULL;
419 vmu_data.vmu_projects_col_hash = NULL;
420 vmu_data.vmu_rusers_col_hash = NULL;
421 vmu_data.vmu_eusers_col_hash = NULL;
422
423 vmu_data.vmu_free_bounds = NULL;
424 vmu_data.vmu_free_objects = NULL;
425 vmu_data.vmu_free_entities = NULL;
426 vmu_data.vmu_free_zones = NULL;
427
428 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
429 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
430 sizeof (vnode_t));
431 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
432 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
433 sizeof (struct anon_map));
434 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
435 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
436 vmu_free_entity);
437 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
438 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
439 vmu_free_entity);
440 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
441 "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
442 vmu_free_entity);
443 vmu_data.vmu_zones_hash = mod_hash_create_idhash(
444 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
445
446 vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
447 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
448 vmu_object_cache = kmem_cache_create("vmu_object_cache",
449 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
450
451 vmu_data.vmu_entities = NULL;
452 vmu_data.vmu_nentities = 0;
453
454 vmu_data.vmu_cache = NULL;
455 vmu_data.vmu_calc_thread = NULL;
456 vmu_data.vmu_calc_flags = 0;
457 vmu_data.vmu_pending_flags = 0;
458 vmu_data.vmu_pending_waiters = 0;
459 }
460
461 /*
462 * Allocate hashes for tracking vm objects visited for an entity.
463 * Update list of entities.
464 */
465 static vmu_entity_t *
466 vmu_alloc_entity(id_t id, int type, id_t zoneid)
467 {
468 vmu_entity_t *entity;
469
470 if (vmu_data.vmu_free_entities != NULL) {
471 entity = vmu_data.vmu_free_entities;
472 vmu_data.vmu_free_entities =
473 vmu_data.vmu_free_entities->vme_next;
474 bzero(&entity->vme_result, sizeof (vmusage_t));
475 } else {
476 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
477 }
478 entity->vme_result.vmu_id = id;
479 entity->vme_result.vmu_zoneid = zoneid;
480 entity->vme_result.vmu_type = type;
481
482 if (entity->vme_vnode_hash == NULL)
483 entity->vme_vnode_hash = mod_hash_create_ptrhash(
484 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
485 sizeof (vnode_t));
486
487 if (entity->vme_amp_hash == NULL)
488 entity->vme_amp_hash = mod_hash_create_ptrhash(
489 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
490 sizeof (struct anon_map));
491
492 if (entity->vme_anon_hash == NULL)
493 entity->vme_anon_hash = mod_hash_create_ptrhash(
494 "vmusage anon hash", VMUSAGE_HASH_SIZE,
495 mod_hash_null_valdtor, sizeof (struct anon));
496
497 entity->vme_next = vmu_data.vmu_entities;
498 vmu_data.vmu_entities = entity;
499 vmu_data.vmu_nentities++;
500
501 return (entity);
502 }
503
504 /*
505 * Allocate a zone entity, and hashes for tracking visited vm objects
506 * for projects, tasks, and users within that zone.
507 */
508 static vmu_zone_t *
509 vmu_alloc_zone(id_t id)
510 {
511 vmu_zone_t *zone;
512
513 if (vmu_data.vmu_free_zones != NULL) {
514 zone = vmu_data.vmu_free_zones;
515 vmu_data.vmu_free_zones =
516 vmu_data.vmu_free_zones->vmz_next;
517 zone->vmz_next = NULL;
518 zone->vmz_zone = NULL;
519 } else {
520 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
521 }
522
523 zone->vmz_id = id;
524
525 if ((vmu_data.vmu_calc_flags &
526 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
527 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
528
529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
530 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
531 zone->vmz_projects_hash = mod_hash_create_idhash(
532 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533
534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
535 != 0 && zone->vmz_tasks_hash == NULL)
536 zone->vmz_tasks_hash = mod_hash_create_idhash(
537 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538
539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
540 != 0 && zone->vmz_rusers_hash == NULL)
541 zone->vmz_rusers_hash = mod_hash_create_idhash(
542 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
543
544 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
545 != 0 && zone->vmz_eusers_hash == NULL)
546 zone->vmz_eusers_hash = mod_hash_create_idhash(
547 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
548
549 return (zone);
550 }
551
552 /*
553 * Allocate a structure for tracking visited bounds for a vm object.
554 */
555 static vmu_object_t *
556 vmu_alloc_object(caddr_t key, int type)
557 {
558 vmu_object_t *object;
559
560 if (vmu_data.vmu_free_objects != NULL) {
561 object = vmu_data.vmu_free_objects;
562 vmu_data.vmu_free_objects =
563 vmu_data.vmu_free_objects->vmo_next;
564 } else {
565 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
566 }
567
568 object->vmo_next = NULL;
569 object->vmo_key = key;
570 object->vmo_type = type;
571 avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
572
573 return (object);
574 }
575
576 /*
577 * Allocate and return a bound structure.
578 */
579 static vmu_bound_t *
580 vmu_alloc_bound()
581 {
582 vmu_bound_t *bound;
583
584 if (vmu_data.vmu_free_bounds != NULL) {
585 bound = vmu_data.vmu_free_bounds;
586 vmu_data.vmu_free_bounds =
587 vmu_data.vmu_free_bounds->vmb_next;
588 } else {
589 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
590 }
591
592 bound->vmb_next = NULL;
593 bound->vmb_start = 0;
594 bound->vmb_end = 0;
595 bound->vmb_type = 0;
596 return (bound);
597 }
598
599 /*
600 * vmu_find_insert_* functions implement hash lookup or allocate and
601 * insert operations.
602 */
603 static vmu_object_t *
604 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
605 {
606 int ret;
607 vmu_object_t *object;
608
609 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
610 (mod_hash_val_t *)&object);
611 if (ret != 0) {
612 object = vmu_alloc_object(key, type);
613 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
614 (mod_hash_val_t)object, (mod_hash_hndl_t)0);
615 ASSERT(ret == 0);
616 }
617 return (object);
618 }
619
620 static int
621 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
622 {
623 int ret;
624 caddr_t val;
625
626 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
627 (mod_hash_val_t *)&val);
628
629 if (ret == 0)
630 return (0);
631
632 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
633 (mod_hash_val_t)key, (mod_hash_hndl_t)0);
634
635 ASSERT(ret == 0);
636
637 return (1);
638 }
639
640 static vmu_entity_t *
641 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
642 {
643 int ret;
644 vmu_entity_t *entity;
645
646 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
647 (mod_hash_val_t *)&entity);
648 if (ret != 0) {
649 entity = vmu_alloc_entity(id, type, zoneid);
650 ret = i_mod_hash_insert_nosync(hash,
651 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
652 (mod_hash_hndl_t)0);
653 ASSERT(ret == 0);
654 }
655 return (entity);
656 }
657
658
659
660
661 /*
662 * Returns list of object bounds between start and end. New bounds inserted
663 * by this call are given type.
664 *
665 * Returns the number of pages covered if new bounds are created. Returns 0
666 * if region between start/end consists of all existing bounds.
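 *
 * For example (illustrative): if the tree already holds bounds [0, 4] and
 * [10, 14], a call with start = 2 and end = 12 creates a new bound [5, 9]
 * of the given type, sets *first to [0, 4] and *last to [10, 14], and
 * returns 5 (the number of newly covered pages).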
667 */
668 static pgcnt_t
669 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
670 end, char type, vmu_bound_t **first, vmu_bound_t **last)
671 {
672 avl_tree_t *tree = &(ro->vmo_bounds);
673 avl_index_t where;
674 vmu_bound_t *walker, *tmp;
675 pgcnt_t ret = 0;
676
677 ASSERT(start <= end);
678
679 *first = *last = NULL;
680
681 tmp = vmu_alloc_bound();
682 tmp->vmb_start = start;
683 tmp->vmb_type = type;
684
685 /* Hopelessly optimistic case. */
686 if ((walker = avl_find(tree, tmp, &where)) != NULL) {
687 /* We got lucky. */
688 vmu_free_bound(tmp);
689 *first = walker;
690 }
691
692 if (walker == NULL) {
693 /* Is start in the previous node? */
694 walker = avl_nearest(tree, where, AVL_BEFORE);
695 if (walker != NULL) {
696 if (ISWITHIN(walker, start)) {
697 /* We found start. */
698 vmu_free_bound(tmp);
699 *first = walker;
700 }
701 }
702 }
703
704 /*
705 * At this point, if *first is still NULL, then we
706 * didn't get a direct hit and start isn't covered
707 * by the previous node. We know that the next node
708 * must have a greater start value than we require
709 * because avl_find tells us where the AVL routines would
710 * insert our new node. We have some gap between the
711 * start we want and the next node.
712 */
713 if (*first == NULL) {
714 walker = avl_nearest(tree, where, AVL_AFTER);
715 if (walker != NULL && walker->vmb_start <= end) {
716 /* Fill the gap. */
717 tmp->vmb_end = walker->vmb_start - 1;
718 *first = tmp;
719 } else {
720 /* We have a gap over [start, end]. */
721 tmp->vmb_end = end;
722 *first = *last = tmp;
723 }
724 ret += tmp->vmb_end - tmp->vmb_start + 1;
725 avl_insert(tree, tmp, where);
726 }
727
728 ASSERT(*first != NULL);
729
730 if (*last != NULL) {
731 /* We're done. */
732 return (ret);
733 }
734
735 /*
736 * If we are here we still need to set *last and
737 * that may involve filling in some gaps.
738 */
739 *last = *first;
740 for (;;) {
741 if (ISWITHIN(*last, end)) {
742 /* We're done. */
743 break;
744 }
745 walker = AVL_NEXT(tree, *last);
746 if (walker == NULL || walker->vmb_start > end) {
747 /* Bottom or mid tree with gap. */
748 tmp = vmu_alloc_bound();
749 tmp->vmb_start = (*last)->vmb_end + 1;
750 tmp->vmb_end = end;
751 tmp->vmb_type = type;
752 ret += tmp->vmb_end - tmp->vmb_start + 1;
753 avl_insert_here(tree, tmp, *last, AVL_AFTER);
754 *last = tmp;
755 break;
756 } else {
757 if ((*last)->vmb_end + 1 != walker->vmb_start) {
758 /* Non-contiguous. */
759 tmp = vmu_alloc_bound();
760 tmp->vmb_start = (*last)->vmb_end + 1;
761 tmp->vmb_end = walker->vmb_start - 1;
762 tmp->vmb_type = type;
763 ret += tmp->vmb_end - tmp->vmb_start + 1;
764 avl_insert_here(tree, tmp, *last, AVL_AFTER);
765 *last = tmp;
766 } else {
767 *last = walker;
768 }
769 }
770 }
771
772 return (ret);
773 }
774
775 /*
776 * vmu_update_bounds()
777 *
778 * tree: avl_tree in which first and last hang.
779 *
780 * first, last: list of contiguous bounds, of which zero or more are of
781 * type VMUSAGE_BOUND_UNKNOWN.
782 *
783 * new_tree: avl_tree in which new_first and new_last hang.
784 *
785 * new_first, new_last: list of contiguous bounds, of which none are of
786 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
787 * update the types of bounds in (first,last) with
788 * type VMUSAGE_BOUND_UNKNOWN.
789 *
790 * For the list of bounds (first,last), this function updates any bounds
791 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
792 * the list (new_first, new_last).
793 *
794 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
795 * (new_first, new_last), it will be split into multiple bounds.
796 *
797 * Return value:
798 * The number of pages in the list of bounds (first,last) that were of
799 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
800 * VMUSAGE_BOUND_INCORE.
801 *
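 * For example (illustrative): if (first, last) is a single bound [0, 9] of
 * type VMUSAGE_BOUND_UNKNOWN and (new_first, new_last) is [0, 3] INCORE
 * followed by [4, 9] NOT_INCORE, then [0, 9] is split into [0, 3] INCORE
 * and [4, 9] NOT_INCORE, *last is updated to point at the new [4, 9] bound,
 * and 4 is returned.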
802 */
803 static pgcnt_t
804 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
805 avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
806 {
807 vmu_bound_t *next, *new_next, *tmp;
808 pgcnt_t rss = 0;
809
810 next = *first;
811 new_next = new_first;
812
813 /*
814 * Verify first and last bound are covered by new bounds if they
815 * have unknown type.
816 */
817 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
818 (*first)->vmb_start >= new_first->vmb_start);
819 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
820 (*last)->vmb_end <= new_last->vmb_end);
821 for (;;) {
822 /* If bound already has type, proceed to next bound. */
823 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
824 if (next == *last)
825 break;
826 next = AVL_NEXT(tree, next);
827 continue;
828 }
829 while (new_next->vmb_end < next->vmb_start)
830 new_next = AVL_NEXT(new_tree, new_next);
831 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
832 next->vmb_type = new_next->vmb_type;
833 if (new_next->vmb_end < next->vmb_end) {
834 /* need to split bound */
835 tmp = vmu_alloc_bound();
836 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
837 tmp->vmb_start = new_next->vmb_end + 1;
838 tmp->vmb_end = next->vmb_end;
839 avl_insert_here(tree, tmp, next, AVL_AFTER);
840 next->vmb_end = new_next->vmb_end;
841 if (*last == next)
842 *last = tmp;
843 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
844 rss += next->vmb_end - next->vmb_start + 1;
845 next = tmp;
846 } else {
847 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
848 rss += next->vmb_end - next->vmb_start + 1;
849 if (next == *last)
850 break;
851 next = AVL_NEXT(tree, next);
852 }
853 }
854 return (rss);
855 }
856
857 /*
858 * Merges adjacent bounds with same type between first and last bound.
859 * After merge, last pointer may point to a different bound, as (incoming)
860 * last bound may have been merged away.
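 *
 * For example (illustrative): adjacent bounds [0, 3] INCORE and [4, 9]
 * INCORE are merged into a single [0, 9] INCORE bound.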
861 */
862 static void
863 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
864 {
865 vmu_bound_t *current;
866 vmu_bound_t *next;
867
868 ASSERT(tree != NULL);
869 ASSERT(*first != NULL);
870 ASSERT(*last != NULL);
871
872 current = *first;
873 while (current != *last) {
874 next = AVL_NEXT(tree, current);
875 if ((current->vmb_end + 1) == next->vmb_start &&
876 current->vmb_type == next->vmb_type) {
877 current->vmb_end = next->vmb_end;
878 avl_remove(tree, next);
879 vmu_free_bound(next);
880 if (next == *last) {
881 *last = current;
882 }
883 } else {
884 current = AVL_NEXT(tree, current);
885 }
886 }
887 }
888
889 /*
890 * Given an amp and a list of bounds, updates each bound's type with
891 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
892 *
893 * If a bound is partially incore, it will be split into two bounds.
894 * first and last may be modified, as bounds may be split into multiple
895 * bounds if they are partially incore/not-incore.
896 *
897 * Pass incore as B_TRUE if the bounds are already known to be incore.
898 *
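 * For example (illustrative): a bound [0, 7] of type VMUSAGE_BOUND_UNKNOWN
 * whose anon pages 0-3 are resident and whose pages 4-7 are not is split
 * into [0, 3] INCORE and [4, 7] NOT_INCORE.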
899 */
900 static void
901 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
902 vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
903 {
904 vmu_bound_t *next;
905 vmu_bound_t *tmp;
906 pgcnt_t index;
907 short bound_type;
908 short page_type;
909 vnode_t *vn;
910 anoff_t off;
911 struct anon *ap;
912
913 next = *first;
914 /* Shared anon slots don't change once set. */
915 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
916 for (;;) {
917 if (incore == B_TRUE)
918 next->vmb_type = VMUSAGE_BOUND_INCORE;
919
920 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
921 if (next == *last)
922 break;
923 next = AVL_NEXT(tree, next);
924 continue;
925 }
926
927 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
928 bound_type = next->vmb_type;
929 index = next->vmb_start;
930 while (index <= next->vmb_end) {
931
932 /*
933 * These are used to determine how much to increment
934 * index when a large page is found.
935 */
936 page_t *page;
937 pgcnt_t pgcnt = 1;
938 uint_t pgshft;
939 pgcnt_t pgmsk;
940
941 ap = anon_get_ptr(amp->ahp, index);
942 if (ap != NULL)
943 swap_xlate(ap, &vn, &off);
944
945 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
946 (page = page_exists(vn, off)) != NULL) {
947 if (PP_ISFREE(page))
948 page_type = VMUSAGE_BOUND_NOT_INCORE;
949 else
950 page_type = VMUSAGE_BOUND_INCORE;
951 if (page->p_szc > 0) {
952 pgcnt = page_get_pagecnt(page->p_szc);
953 pgshft = page_get_shift(page->p_szc);
954 pgmsk = (0x1 << (pgshft - PAGESHIFT))
955 - 1;
956 }
957 } else {
958 page_type = VMUSAGE_BOUND_NOT_INCORE;
959 }
960
961 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
962 next->vmb_type = page_type;
963 bound_type = page_type;
964 } else if (next->vmb_type != page_type) {
965 /*
966 * If current bound type does not match page
967 * type, need to split off new bound.
968 */
969 tmp = vmu_alloc_bound();
970 tmp->vmb_type = page_type;
971 tmp->vmb_start = index;
972 tmp->vmb_end = next->vmb_end;
973 avl_insert_here(tree, tmp, next, AVL_AFTER);
974 next->vmb_end = index - 1;
975 if (*last == next)
976 *last = tmp;
977 next = tmp;
978 }
979 if (pgcnt > 1) {
980 /*
981 * If inside large page, jump to next large
982 * page
983 */
984 index = (index & ~pgmsk) + pgcnt;
985 } else {
986 index++;
987 }
988 }
989 if (next == *last) {
990 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
991 break;
992 } else
993 next = AVL_NEXT(tree, next);
994 }
995 ANON_LOCK_EXIT(&amp->a_rwlock);
996 }
997
998 /*
999 * Same as vmu_amp_update_incore_bounds(), except for tracking
1000 * incore/not-incore for vnodes.
1001 */
1002 static void
1003 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
1004 vmu_bound_t **first, vmu_bound_t **last)
1005 {
1006 vmu_bound_t *next;
1007 vmu_bound_t *tmp;
1008 pgcnt_t index;
1009 short bound_type;
1010 short page_type;
1011
1012 next = *first;
1013 for (;;) {
1014 if (vnode->v_pages == NULL)
1015 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1016
1017 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1018 if (next == *last)
1019 break;
1020 next = AVL_NEXT(tree, next);
1021 continue;
1022 }
1023
1024 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1025 bound_type = next->vmb_type;
1026 index = next->vmb_start;
1027 while (index <= next->vmb_end) {
1028
1029 /*
1030 * These are used to determine how much to increment
1031 * index when a large page is found.
1032 */
1033 page_t *page;
1034 pgcnt_t pgcnt = 1;
1035 uint_t pgshft;
1036 pgcnt_t pgmsk;
1037
1038 if (vnode->v_pages != NULL &&
1039 (page = page_exists(vnode, ptob(index))) != NULL) {
1040 if (PP_ISFREE(page))
1041 page_type = VMUSAGE_BOUND_NOT_INCORE;
1042 else
1043 page_type = VMUSAGE_BOUND_INCORE;
1044 if (page->p_szc > 0) {
1045 pgcnt = page_get_pagecnt(page->p_szc);
1046 pgshft = page_get_shift(page->p_szc);
1047 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1048 - 1;
1049 }
1050 } else {
1051 page_type = VMUSAGE_BOUND_NOT_INCORE;
1052 }
1053
1054 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1055 next->vmb_type = page_type;
1056 bound_type = page_type;
1057 } else if (next->vmb_type != page_type) {
1058 /*
1059 * If current bound type does not match page
1060 * type, need to split off new bound.
1061 */
1062 tmp = vmu_alloc_bound();
1063 tmp->vmb_type = page_type;
1064 tmp->vmb_start = index;
1065 tmp->vmb_end = next->vmb_end;
1066 avl_insert_here(tree, tmp, next, AVL_AFTER);
1067 next->vmb_end = index - 1;
1068 if (*last == next)
1069 *last = tmp;
1070 next = tmp;
1071 }
1072 if (pgcnt > 1) {
1073 /*
1074 * If inside large page, jump to next large
1075 * page
1076 */
1077 index = (index & ~pgmsk) + pgcnt;
1078 } else {
1079 index++;
1080 }
1081 }
1082 if (next == *last) {
1083 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1084 break;
1085 } else
1086 next = AVL_NEXT(tree, next);
1087 }
1088 }
1089
1090 /*
1091 * Calculate the rss and swap consumed by a segment. vmu_entities is the
1092 * list of entities to visit. For shared segments, the vnode or amp
1093 * is looked up in each entity to see if it has been already counted. Private
1094 * anon pages are checked per entity to ensure that COW pages are not
1095 * double counted.
1096 *
1097 * For private mapped files, first the amp is checked for private pages.
1098 * Bounds not backed by the amp are looked up in the vnode for each entity
1099 * to avoid double counting of private COW vnode pages.
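 *
 * For example (illustrative): for a MAP_PRIVATE mapping of a file in which
 * one page has been COW-modified, the modified page is found through the
 * private amp and charged to vmu_rss_private, svd->swresv is charged to
 * vmu_swap_private, and any remaining resident pages are found through the
 * vnode bounds and charged to vmu_rss_private as well.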
1100 */
1101 static void
1102 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1103 {
1104 struct segvn_data *svd;
1105 struct shm_data *shmd;
1106 struct spt_data *sptd;
1107 vmu_object_t *shared_object = NULL;
1108 vmu_object_t *entity_object = NULL;
1109 vmu_entity_t *entity;
1110 vmusage_t *result;
1111 vmu_bound_t *first = NULL;
1112 vmu_bound_t *last = NULL;
1113 vmu_bound_t *cur = NULL;
1114 vmu_bound_t *e_first = NULL;
1115 vmu_bound_t *e_last = NULL;
1116 vmu_bound_t *tmp;
1117 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1118 struct anon_map *private_amp = NULL;
1119 boolean_t incore = B_FALSE;
1120 boolean_t shared = B_FALSE;
1121 int file = 0;
1122 pgcnt_t swresv = 0;
1123 pgcnt_t panon = 0;
1124
1125 /* Can zero-length segments exist? Not sure, so paranoia. */
1126 if (seg->s_size <= 0)
1127 return;
1128
1129 /*
1130 * Figure out if there is a shared object (such as a named vnode or
1131 * a shared amp), then figure out if there is a private amp, which
1132 * identifies private pages.
1133 */
1134 if (seg->s_ops == &segvn_ops) {
1135 svd = (struct segvn_data *)seg->s_data;
1136 if (svd->type == MAP_SHARED) {
1137 shared = B_TRUE;
1138 } else {
1139 swresv = svd->swresv;
1140
1141 if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1142 RW_READER) != 0) {
1143 /*
1144 * Text replication anon maps can be shared
1145 * across all zones. Space used for text
1146 * replication is typically capped as a small %
1147 * of memory. To keep it simple for now we
1148 * don't account for swap and memory space used
1149 * for text replication.
1150 */
1151 if (svd->tr_state == SEGVN_TR_OFF &&
1152 svd->amp != NULL) {
1153 private_amp = svd->amp;
1154 p_start = svd->anon_index;
1155 p_end = svd->anon_index +
1156 btop(seg->s_size) - 1;
1157 }
1158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1159 }
1160 }
1161 if (svd->vp != NULL) {
1162 file = 1;
1163 shared_object = vmu_find_insert_object(
1164 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1165 VMUSAGE_TYPE_VNODE);
1166 s_start = btop(svd->offset);
1167 s_end = btop(svd->offset + seg->s_size) - 1;
1168 }
1169 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1170 ASSERT(shared_object == NULL);
1171 shared_object = vmu_find_insert_object(
1172 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1173 VMUSAGE_TYPE_AMP);
1174 s_start = svd->anon_index;
1175 s_end = svd->anon_index + btop(seg->s_size) - 1;
1176 /* schedctl mappings are always in core */
1177 if (svd->amp->swresv == 0)
1178 incore = B_TRUE;
1179 }
1180 } else if (seg->s_ops == &segspt_shmops) {
1181 shared = B_TRUE;
1182 shmd = (struct shm_data *)seg->s_data;
1183 shared_object = vmu_find_insert_object(
1184 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1185 VMUSAGE_TYPE_AMP);
1186 s_start = 0;
1187 s_end = btop(seg->s_size) - 1;
1188 sptd = shmd->shm_sptseg->s_data;
1189
1190 /* ism segments are always incore and do not reserve swap */
1191 if (sptd->spt_flags & SHM_SHARE_MMU)
1192 incore = B_TRUE;
1193
1194 } else {
1195 return;
1196 }
1197
1198 /*
1199 * If there is a private amp, count anon pages that exist. If an
1200 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1201 * hash so that it is not double counted.
1202 *
1203 * If there is also a shared object, then figure out the bounds
1204 * which are not mapped by the private amp.
1205 */
1206 if (private_amp != NULL) {
1207
1208 /* Enter as writer to prevent COW anons from being freed */
1209 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1210
1211 p_index = p_start;
1212 s_index = s_start;
1213
1214 while (p_index <= p_end) {
1215
1216 pgcnt_t p_index_next;
1217 pgcnt_t p_bound_size;
1218 int cnt;
1219 anoff_t off;
1220 struct vnode *vn;
1221 struct anon *ap;
1222 page_t *page; /* For handling of large */
1223 pgcnt_t pgcnt = 1; /* pages */
1224 pgcnt_t pgstart;
1225 pgcnt_t pgend;
1226 uint_t pgshft;
1227 pgcnt_t pgmsk;
1228
1229 p_index_next = p_index;
1230 ap = anon_get_next_ptr(private_amp->ahp,
1231 &p_index_next);
1232
1233 /*
1234 * If next anon is past end of mapping, simulate
1235 * end of anon so loop terminates.
1236 */
1237 if (p_index_next > p_end) {
1238 p_index_next = p_end + 1;
1239 ap = NULL;
1240 }
1241 /*
1242 * For COW segments, keep track of bounds not
1243 * backed by private amp so they can be looked
1244 * up in the backing vnode
1245 */
1246 if (p_index_next != p_index) {
1247
1248 /*
1249 * Compute index difference between anon and
1250 * previous anon.
1251 */
1252 p_bound_size = p_index_next - p_index - 1;
1253
1254 if (shared_object != NULL) {
1255 cur = vmu_alloc_bound();
1256 cur->vmb_start = s_index;
1257 cur->vmb_end = s_index + p_bound_size;
1258 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1259 if (first == NULL) {
1260 first = cur;
1261 last = cur;
1262 } else {
1263 last->vmb_next = cur;
1264 last = cur;
1265 }
1266 }
1267 p_index = p_index + p_bound_size + 1;
1268 s_index = s_index + p_bound_size + 1;
1269 }
1270
1271 /* Detect end of anons in amp */
1272 if (ap == NULL)
1273 break;
1274
1275 cnt = ap->an_refcnt;
1276 swap_xlate(ap, &vn, &off);
1277
1278 if (vn == NULL || vn->v_pages == NULL ||
1279 (page = page_exists(vn, off)) == NULL) {
1280 p_index++;
1281 s_index++;
1282 continue;
1283 }
1284
1285 /*
1286 * If large page is found, compute portion of large
1287 * page in mapping, and increment indices to the next
1288 * large page.
1289 */
1290 if (page->p_szc > 0) {
1291
1292 pgcnt = page_get_pagecnt(page->p_szc);
1293 pgshft = page_get_shift(page->p_szc);
1294 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1295
1296 /* First page in large page */
1297 pgstart = p_index & ~pgmsk;
1298 /* Last page in large page */
1299 pgend = pgstart + pgcnt - 1;
1300 /*
1301 * Artificially end page if page extends past
1302 * end of mapping.
1303 */
1304 if (pgend > p_end)
1305 pgend = p_end;
1306
1307 /*
1308 * Compute number of pages from large page
1309 * which are mapped.
1310 */
1311 pgcnt = pgend - p_index + 1;
1312
1313 /*
1314 * Point indices at page after large page,
1315 * or at page after end of mapping.
1316 */
1317 p_index += pgcnt;
1318 s_index += pgcnt;
1319 } else {
1320 p_index++;
1321 s_index++;
1322 }
1323
1324 /*
1325 * Pages on the free list aren't counted for the rss.
1326 */
1327 if (PP_ISFREE(page))
1328 continue;
1329
1330 /*
1331 * Assume anon structs with a refcnt
1332 * of 1 are not COW shared, so there
1333 * is no reason to track them per entity.
1334 */
1335 if (cnt == 1) {
1336 panon += pgcnt;
1337 continue;
1338 }
1339 for (entity = vmu_entities; entity != NULL;
1340 entity = entity->vme_next_calc) {
1341
1342 result = &entity->vme_result;
1343 /*
1344 * Track COW anons per entity so
1345 * they are not double counted.
1346 */
1347 if (vmu_find_insert_anon(entity->vme_anon_hash,
1348 (caddr_t)ap) == 0)
1349 continue;
1350
1351 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1352 result->vmu_rss_private +=
1353 (pgcnt << PAGESHIFT);
1354 }
1355 }
1356 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1357 }
1358
1359 /* Add up resident anon and swap reserved for private mappings */
1360 if (swresv > 0 || panon > 0) {
1361 for (entity = vmu_entities; entity != NULL;
1362 entity = entity->vme_next_calc) {
1363 result = &entity->vme_result;
1364 result->vmu_swap_all += swresv;
1365 result->vmu_swap_private += swresv;
1366 result->vmu_rss_all += (panon << PAGESHIFT);
1367 result->vmu_rss_private += (panon << PAGESHIFT);
1368 }
1369 }
1370
1371 /* Compute resident pages backing shared amp or named vnode */
1372 if (shared_object != NULL) {
1373 avl_tree_t *tree = &(shared_object->vmo_bounds);
1374
1375 if (first == NULL) {
1376 /*
1377 * No private amp, or private amp has no anon
1378 * structs. This means entire segment is backed by
1379 * the shared object.
1380 */
1381 first = vmu_alloc_bound();
1382 first->vmb_start = s_start;
1383 first->vmb_end = s_end;
1384 first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1385 }
1386 /*
1387 * Iterate bounds not backed by private amp, and compute
1388 * resident pages.
1389 */
1390 cur = first;
1391 while (cur != NULL) {
1392
1393 if (vmu_insert_lookup_object_bounds(shared_object,
1394 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1395 &first, &last) > 0) {
1396 /* new bounds, find incore/not-incore */
1397 if (shared_object->vmo_type ==
1398 VMUSAGE_TYPE_VNODE) {
1399 vmu_vnode_update_incore_bounds(
1400 tree,
1401 (vnode_t *)
1402 shared_object->vmo_key, &first,
1403 &last);
1404 } else {
1405 vmu_amp_update_incore_bounds(
1406 tree,
1407 (struct anon_map *)
1408 shared_object->vmo_key, &first,
1409 &last, incore);
1410 }
1411 vmu_merge_bounds(tree, &first, &last);
1412 }
1413 for (entity = vmu_entities; entity != NULL;
1414 entity = entity->vme_next_calc) {
1415 avl_tree_t *e_tree;
1416
1417 result = &entity->vme_result;
1418
1419 entity_object = vmu_find_insert_object(
1420 shared_object->vmo_type ==
1421 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1422 entity->vme_amp_hash,
1423 shared_object->vmo_key,
1424 shared_object->vmo_type);
1425
1426 virt = vmu_insert_lookup_object_bounds(
1427 entity_object, cur->vmb_start, cur->vmb_end,
1428 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1429
1430 if (virt == 0)
1431 continue;
1432 /*
1433 * Range visited for this entity
1434 */
1435 e_tree = &(entity_object->vmo_bounds);
1436 rss = vmu_update_bounds(e_tree, &e_first,
1437 &e_last, tree, first, last);
1438 result->vmu_rss_all += (rss << PAGESHIFT);
1439 if (shared == B_TRUE && file == B_FALSE) {
1440 /* shared anon mapping */
1441 result->vmu_swap_all +=
1442 (virt << PAGESHIFT);
1443 result->vmu_swap_shared +=
1444 (virt << PAGESHIFT);
1445 result->vmu_rss_shared +=
1446 (rss << PAGESHIFT);
1447 } else if (shared == B_TRUE && file == B_TRUE) {
1448 /* shared file mapping */
1449 result->vmu_rss_shared +=
1450 (rss << PAGESHIFT);
1451 } else if (shared == B_FALSE &&
1452 file == B_TRUE) {
1453 /* private file mapping */
1454 result->vmu_rss_private +=
1455 (rss << PAGESHIFT);
1456 }
1457 vmu_merge_bounds(e_tree, &e_first, &e_last);
1458 }
1459 tmp = cur;
1460 cur = cur->vmb_next;
1461 vmu_free_bound(tmp);
1462 }
1463 }
1464 }
1465
1466 /*
1467 * Based on the current calculation flags, find the entities
1468 * relevant to the process. Then calculate each segment
1469 * in the process's address space for each relevant entity.
1470 */
1471 static void
1472 vmu_calculate_proc(proc_t *p)
1473 {
1474 vmu_entity_t *entities = NULL;
1475 vmu_zone_t *zone;
1476 vmu_entity_t *tmp;
1477 struct as *as;
1478 struct seg *seg;
1479 int ret;
1480
1481 /* Figure out which entities are being computed */
1482 if ((vmu_data.vmu_system) != NULL) {
1483 tmp = vmu_data.vmu_system;
1484 tmp->vme_next_calc = entities;
1485 entities = tmp;
1486 }
1487 if (vmu_data.vmu_calc_flags &
1488 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1489 VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1490 VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1491 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1492 VMUSAGE_ALL_EUSERS)) {
1493 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1494 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1495 (mod_hash_val_t *)&zone);
1496 if (ret != 0) {
1497 zone = vmu_alloc_zone(p->p_zone->zone_id);
1498 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1499 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1500 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1501 ASSERT(ret == 0);
1502 }
1503 if (zone->vmz_zone != NULL) {
1504 tmp = zone->vmz_zone;
1505 tmp->vme_next_calc = entities;
1506 entities = tmp;
1507 }
1508 if (vmu_data.vmu_calc_flags &
1509 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1510 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1511 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1512 zone->vmz_id);
1513 tmp->vme_next_calc = entities;
1514 entities = tmp;
1515 }
1516 if (vmu_data.vmu_calc_flags &
1517 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1518 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1519 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1520 tmp->vme_next_calc = entities;
1521 entities = tmp;
1522 }
1523 if (vmu_data.vmu_calc_flags &
1524 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1525 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1526 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1527 tmp->vme_next_calc = entities;
1528 entities = tmp;
1529 }
1530 if (vmu_data.vmu_calc_flags &
1531 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1532 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1533 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1534 tmp->vme_next_calc = entities;
1535 entities = tmp;
1536 }
1537 }
1538 /* Entities which collapse projects and users for all zones */
1539 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1540 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1541 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1542 tmp->vme_next_calc = entities;
1543 entities = tmp;
1544 }
1545 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1546 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1547 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1548 tmp->vme_next_calc = entities;
1549 entities = tmp;
1550 }
1551 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1552 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1553 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1554 tmp->vme_next_calc = entities;
1555 entities = tmp;
1556 }
1557
1558 ASSERT(entities != NULL);
1559 /* process all segs in process's address space */
1560 as = p->p_as;
1561 AS_LOCK_ENTER(as, RW_READER);
1562 for (seg = AS_SEGFIRST(as); seg != NULL;
1563 seg = AS_SEGNEXT(as, seg)) {
1564 vmu_calculate_seg(entities, seg);
1565 }
1566 AS_LOCK_EXIT(as);
1567 }
1568
1569 /*
1570 * Free data created by previous call to vmu_calculate().
1571 */
1572 static void
1573 vmu_clear_calc()
1574 {
1575 if (vmu_data.vmu_system != NULL)
1576 vmu_free_entity(vmu_data.vmu_system);
1577 vmu_data.vmu_system = NULL;
1578 if (vmu_data.vmu_zones_hash != NULL)
1579 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1580 if (vmu_data.vmu_projects_col_hash != NULL)
1581 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1582 if (vmu_data.vmu_rusers_col_hash != NULL)
1583 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1584 if (vmu_data.vmu_eusers_col_hash != NULL)
1585 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1586
1587 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1588 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1589 }
1590
1591 /*
1592 * Free unused data structures. These can result if the system workload
1593 * decreases between calculations.
1594 */
1595 static void
1596 vmu_free_extra()
1597 {
1598 vmu_bound_t *tb;
1599 vmu_object_t *to;
1600 vmu_entity_t *te;
1601 vmu_zone_t *tz;
1602
1603 while (vmu_data.vmu_free_bounds != NULL) {
1604 tb = vmu_data.vmu_free_bounds;
1605 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1606 kmem_cache_free(vmu_bound_cache, tb);
1607 }
1608 while (vmu_data.vmu_free_objects != NULL) {
1609 to = vmu_data.vmu_free_objects;
1610 vmu_data.vmu_free_objects =
1611 vmu_data.vmu_free_objects->vmo_next;
1612 kmem_cache_free(vmu_object_cache, to);
1613 }
1614 while (vmu_data.vmu_free_entities != NULL) {
1615 te = vmu_data.vmu_free_entities;
1616 vmu_data.vmu_free_entities =
1617 vmu_data.vmu_free_entities->vme_next;
1618 if (te->vme_vnode_hash != NULL)
1619 mod_hash_destroy_hash(te->vme_vnode_hash);
1620 if (te->vme_amp_hash != NULL)
1621 mod_hash_destroy_hash(te->vme_amp_hash);
1622 if (te->vme_anon_hash != NULL)
1623 mod_hash_destroy_hash(te->vme_anon_hash);
1624 kmem_free(te, sizeof (vmu_entity_t));
1625 }
1626 while (vmu_data.vmu_free_zones != NULL) {
1627 tz = vmu_data.vmu_free_zones;
1628 vmu_data.vmu_free_zones =
1629 vmu_data.vmu_free_zones->vmz_next;
1630 if (tz->vmz_projects_hash != NULL)
1631 mod_hash_destroy_hash(tz->vmz_projects_hash);
1632 if (tz->vmz_tasks_hash != NULL)
1633 mod_hash_destroy_hash(tz->vmz_tasks_hash);
1634 if (tz->vmz_rusers_hash != NULL)
1635 mod_hash_destroy_hash(tz->vmz_rusers_hash);
1636 if (tz->vmz_eusers_hash != NULL)
1637 mod_hash_destroy_hash(tz->vmz_eusers_hash);
1638 kmem_free(tz, sizeof (vmu_zone_t));
1639 }
1640 }
1641
1642 extern kcondvar_t *pr_pid_cv;
1643
1644 /*
1645 * Determine which entity types are relevant and allocate the hashes to
1646 * track them. Then walk the process table and count rss and swap
1647 * for each process's address space. Address space objects such as
1648 * vnodes, amps and anons are tracked per entity, so that they are
1649 * not double counted in the results.
1650 *
1651 */
1652 static void
1653 vmu_calculate()
1654 {
1655 int i = 0;
1656 int ret;
1657 proc_t *p;
1658
1659 vmu_clear_calc();
1660
1661 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1662 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1663 ALL_ZONES);
1664
1665 /*
1666 * Walk process table and calculate rss of each proc.
1667 *
1668 * Pidlock and p_lock cannot be held while doing the rss calculation.
1669 * This is because:
1670 * 1. The calculation allocates using KM_SLEEP.
1671 * 2. The calculation grabs a_lock, which cannot be grabbed
1672 * after p_lock.
1673 *
1674 * Since pidlock must be dropped, we cannot simply just walk the
1675 * practive list. Instead, we walk the process table, and sprlock
1676 * each process to ensure that it does not exit during the
1677 * calculation.
1678 */
1679
1680 mutex_enter(&pidlock);
1681 for (i = 0; i < v.v_proc; i++) {
1682 again:
1683 p = pid_entry(i);
1684 if (p == NULL)
1685 continue;
1686
1687 mutex_enter(&p->p_lock);
1688 mutex_exit(&pidlock);
1689
1690 if (panicstr) {
1691 mutex_exit(&p->p_lock);
1692 return;
1693 }
1694
1695 /* Try to set P_PR_LOCK */
1696 ret = sprtrylock_proc(p);
1697 if (ret == -1) {
1698 /* Process in invalid state */
1699 mutex_exit(&p->p_lock);
1700 mutex_enter(&pidlock);
1701 continue;
1702 } else if (ret == 1) {
1703 /*
1704 * P_PR_LOCK is already set. Wait and try again.
1705 * This also drops p_lock.
1706 */
1707 sprwaitlock_proc(p);
1708 mutex_enter(&pidlock);
1709 goto again;
1710 }
1711 mutex_exit(&p->p_lock);
1712
1713 vmu_calculate_proc(p);
1714
1715 mutex_enter(&p->p_lock);
1716 sprunlock(p);
1717 mutex_enter(&pidlock);
1718 }
1719 mutex_exit(&pidlock);
1720
1721 vmu_free_extra();
1722 }
1723
1724 /*
1725 * allocate a new cache for N results satisfying flags
1726 */
1727 vmu_cache_t *
1728 vmu_cache_alloc(size_t nres, uint_t flags)
1729 {
1730 vmu_cache_t *cache;
1731
1732 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1733 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1734 cache->vmc_nresults = nres;
1735 cache->vmc_flags = flags;
1736 cache->vmc_refcnt = 1;
1737 return (cache);
1738 }
1739
1740 /*
1741 * Make sure cached results are not freed
1742 */
1743 static void
1744 vmu_cache_hold(vmu_cache_t *cache)
1745 {
1746 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1747 cache->vmc_refcnt++;
1748 }
1749
1750 /*
1751 * free cache data
1752 */
1753 static void
1754 vmu_cache_rele(vmu_cache_t *cache)
1755 {
1756 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1757 ASSERT(cache->vmc_refcnt > 0);
1758 cache->vmc_refcnt--;
1759 if (cache->vmc_refcnt == 0) {
1760 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1761 cache->vmc_nresults);
1762 kmem_free(cache, sizeof (vmu_cache_t));
1763 }
1764 }
1765
1766 /*
1767 * When new data is calculated, update the phys_mem rctl usage value in the
1768 * zones.
1769 */
1770 static void
1771 vmu_update_zone_rctls(vmu_cache_t *cache)
1772 {
1773 vmusage_t *rp;
1774 size_t i = 0;
1775 zone_t *zp;
1776
1777 for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1778 if (rp->vmu_type == VMUSAGE_ZONE &&
1779 rp->vmu_zoneid != ALL_ZONES) {
1780 if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1781 zp->zone_phys_mem = rp->vmu_rss_all;
1782 zone_rele(zp);
1783 }
1784 }
1785 }
1786 }
1787
1788 /*
1789 * Copy out the cached results to a caller. Inspect the caller's flags
1790 * and zone to determine which cached results should be copied.
1791 */
1792 static int
1793 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1794 uint_t flags, id_t req_zone_id, int cpflg)
1795 {
1796 vmusage_t *result, *out_result;
1797 vmusage_t dummy;
1798 size_t i, count = 0;
1799 size_t bufsize;
1800 int ret = 0;
1801 uint_t types = 0;
1802
1803 if (nres != NULL) {
1804 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1805 return (set_errno(EFAULT));
1806 } else {
1807 bufsize = 0;
1808 }
1809
1810 /* figure out what results the caller is interested in. */
1811 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1812 types |= VMUSAGE_SYSTEM;
1813 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1814 types |= VMUSAGE_ZONE;
1815 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1816 VMUSAGE_COL_PROJECTS))
1817 types |= VMUSAGE_PROJECTS;
1818 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1819 types |= VMUSAGE_TASKS;
1820 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1821 types |= VMUSAGE_RUSERS;
1822 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1823 types |= VMUSAGE_EUSERS;
1824
1825 /* count results for current zone */
1826 out_result = buf;
1827 for (result = cache->vmc_results, i = 0;
1828 i < cache->vmc_nresults; result++, i++) {
1829
1830 /* Do not return "other-zone" results to non-global zones */
1831 if (curproc->p_zone != global_zone &&
1832 curproc->p_zone->zone_id != result->vmu_zoneid)
1833 continue;
1834
1835 /*
1836 * If non-global zone requests VMUSAGE_SYSTEM, fake
1837 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1838 */
1839 if (curproc->p_zone != global_zone &&
1840 (flags & VMUSAGE_SYSTEM) != 0 &&
1841 result->vmu_type == VMUSAGE_ZONE) {
1842 count++;
1843 if (out_result != NULL) {
1844 if (bufsize < count) {
1845 ret = set_errno(EOVERFLOW);
1846 } else {
1847 dummy = *result;
1848 dummy.vmu_zoneid = ALL_ZONES;
1849 dummy.vmu_id = 0;
1850 dummy.vmu_type = VMUSAGE_SYSTEM;
1851 if (ddi_copyout(&dummy, out_result,
1852 sizeof (vmusage_t), cpflg))
1853 return (set_errno(EFAULT));
1854 out_result++;
1855 }
1856 }
1857 }
1858
1859 /* Skip results that do not match requested type */
1860 if ((result->vmu_type & types) == 0)
1861 continue;
1862
1863 /* Skip collated results if not requested */
1864 if (result->vmu_zoneid == ALL_ZONES) {
1865 if (result->vmu_type == VMUSAGE_PROJECTS &&
1866 (flags & VMUSAGE_COL_PROJECTS) == 0)
1867 continue;
1868 if (result->vmu_type == VMUSAGE_EUSERS &&
1869 (flags & VMUSAGE_COL_EUSERS) == 0)
1870 continue;
1871 if (result->vmu_type == VMUSAGE_RUSERS &&
1872 (flags & VMUSAGE_COL_RUSERS) == 0)
1873 continue;
1874 }
1875
1876 if (result->vmu_type == VMUSAGE_ZONE &&
1877 flags & VMUSAGE_A_ZONE) {
1878 /* Skip non-requested zone results */
1879 if (result->vmu_zoneid != req_zone_id)
1880 continue;
1881 } else {
1882 /* Skip "other zone" results if not requested */
1883 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1884 if (result->vmu_type == VMUSAGE_ZONE &&
1885 (flags & VMUSAGE_ALL_ZONES) == 0)
1886 continue;
1887 if (result->vmu_type == VMUSAGE_PROJECTS &&
1888 (flags & (VMUSAGE_ALL_PROJECTS |
1889 VMUSAGE_COL_PROJECTS)) == 0)
1890 continue;
1891 if (result->vmu_type == VMUSAGE_TASKS &&
1892 (flags & VMUSAGE_ALL_TASKS) == 0)
1893 continue;
1894 if (result->vmu_type == VMUSAGE_RUSERS &&
1895 (flags & (VMUSAGE_ALL_RUSERS |
1896 VMUSAGE_COL_RUSERS)) == 0)
1897 continue;
1898 if (result->vmu_type == VMUSAGE_EUSERS &&
1899 (flags & (VMUSAGE_ALL_EUSERS |
1900 VMUSAGE_COL_EUSERS)) == 0)
1901 continue;
1902 }
1903 }
1904 count++;
1905 if (out_result != NULL) {
1906 if (bufsize < count) {
1907 ret = set_errno(EOVERFLOW);
1908 } else {
1909 if (ddi_copyout(result, out_result,
1910 sizeof (vmusage_t), cpflg))
1911 return (set_errno(EFAULT));
1912 out_result++;
1913 }
1914 }
1915 }
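	/*
	 * Report the number of matching results even when the caller's
	 * buffer was too small (EOVERFLOW), so the caller can determine the
	 * size needed.
	 */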
1916 if (nres != NULL)
1917 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1918 return (set_errno(EFAULT));
1919
1920 return (ret);
1921 }
1922
/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user. The flags argument
 * determines the types of result structures returned. Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age, in seconds, of the results (time since
 *		the counting was done). Results from previous callers are
 *		cached in the kernel.
 *	buf:	pointer to buffer array of vmusage_t. If NULL, then only nres
 *		is set on success.
 *	nres:	set by the caller to the number of vmusage_t structures that
 *		buf can hold. On a return of 0 (success) or EOVERFLOW, it is
 *		set to the number of result structures returned or required.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	EOVERFLOW (nres too small for the results; nres is set to the value
 *	    needed for success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
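/*
 * Illustrative userland sketch (not part of this file) of the two-pass
 * pattern the interface supports, via the getvmusage(2) wrapper; the flag
 * and age values below are arbitrary examples and error handling is
 * omitted:
 *
 *	size_t nres = 0;
 *	vmusage_t *buf;
 *
 *	(void) getvmusage(VMUSAGE_ZONE, 60, NULL, &nres);
 *	buf = malloc(nres * sizeof (vmusage_t));
 *	(void) getvmusage(VMUSAGE_ZONE, 60, buf, &nres);
 *
 * The first call, with buf == NULL, only sets nres to the number of
 * results; the second copies out up to nres result structures.
 */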
1949 int
1950 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1951 {
1952 vmu_entity_t *entity;
1953 vmusage_t *result;
1954 int ret = 0;
1955 int cacherecent = 0;
1956 hrtime_t now;
1957 uint_t flags_orig;
1958 id_t req_zone_id;
1959
	/*
	 * Non-global zones cannot request system-wide or collated results,
	 * the overall system result, or the usage of another zone, so munge
	 * the flags accordingly.
	 */
1965 flags_orig = flags;
1966 if (curproc->p_zone != global_zone) {
1967 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1968 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1969 flags |= VMUSAGE_PROJECTS;
1970 }
1971 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1972 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1973 flags |= VMUSAGE_RUSERS;
1974 }
1975 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1976 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1977 flags |= VMUSAGE_EUSERS;
1978 }
1979 if (flags & VMUSAGE_SYSTEM) {
1980 flags &= ~VMUSAGE_SYSTEM;
1981 flags |= VMUSAGE_ZONE;
1982 }
1983 if (flags & VMUSAGE_A_ZONE) {
1984 flags &= ~VMUSAGE_A_ZONE;
1985 flags |= VMUSAGE_ZONE;
1986 }
1987 }
1988
1989 /* Check for unknown flags */
1990 if ((flags & (~VMUSAGE_MASK)) != 0)
1991 return (set_errno(EINVAL));
1992
1993 /* Check for no flags */
1994 if ((flags & VMUSAGE_MASK) == 0)
1995 return (set_errno(EINVAL));
1996
1997 /* If requesting results for a specific zone, get the zone ID */
1998 if (flags & VMUSAGE_A_ZONE) {
1999 size_t bufsize;
2000 vmusage_t zreq;
2001
2002 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2003 return (set_errno(EFAULT));
2004 /* Requested zone ID is passed in buf, so 0 len not allowed */
2005 if (bufsize == 0)
2006 return (set_errno(EINVAL));
2007 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2008 return (set_errno(EFAULT));
2009 req_zone_id = zreq.vmu_id;
2010 }
2011
2012 mutex_enter(&vmu_data.vmu_lock);
2013 now = gethrtime();
2014
2015 start:
2016 if (vmu_data.vmu_cache != NULL) {
2017
2018 vmu_cache_t *cache;
2019
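		/*
		 * The cached results are reusable only if they are younger
		 * than the caller-supplied age (given in seconds, compared
		 * here in nanoseconds) and cover every flag the caller
		 * requested.
		 */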
2020 if ((vmu_data.vmu_cache->vmc_timestamp +
2021 ((hrtime_t)age * NANOSEC)) > now)
2022 cacherecent = 1;
2023
2024 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2025 cacherecent == 1) {
2026 cache = vmu_data.vmu_cache;
2027 vmu_cache_hold(cache);
2028 mutex_exit(&vmu_data.vmu_lock);
2029
2030 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2031 req_zone_id, cpflg);
2032 mutex_enter(&vmu_data.vmu_lock);
2033 vmu_cache_rele(cache);
2034 if (vmu_data.vmu_pending_waiters > 0)
2035 cv_broadcast(&vmu_data.vmu_cv);
2036 mutex_exit(&vmu_data.vmu_lock);
2037 return (ret);
2038 }
2039 /*
2040 * If the cache is recent, it is likely that there are other
2041 * consumers of vm_getusage running, so add their flags to the
2042 * desired flags for the calculation.
2043 */
2044 if (cacherecent == 1)
2045 flags = vmu_data.vmu_cache->vmc_flags | flags;
2046 }
2047 if (vmu_data.vmu_calc_thread == NULL) {
2048
2049 vmu_cache_t *cache;
2050
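		/*
		 * No calculation is in progress: this thread becomes the
		 * calculation thread, folding in the flags of any pending
		 * waiters before dropping the lock to do the counting.
		 */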
2051 vmu_data.vmu_calc_thread = curthread;
2052 vmu_data.vmu_calc_flags = flags;
2053 vmu_data.vmu_entities = NULL;
2054 vmu_data.vmu_nentities = 0;
2055 if (vmu_data.vmu_pending_waiters > 0)
2056 vmu_data.vmu_calc_flags |=
2057 vmu_data.vmu_pending_flags;
2058
2059 vmu_data.vmu_pending_flags = 0;
2060 mutex_exit(&vmu_data.vmu_lock);
2061 vmu_calculate();
2062 mutex_enter(&vmu_data.vmu_lock);
2063 /* copy results to cache */
2064 if (vmu_data.vmu_cache != NULL)
2065 vmu_cache_rele(vmu_data.vmu_cache);
2066 cache = vmu_data.vmu_cache =
2067 vmu_cache_alloc(vmu_data.vmu_nentities,
2068 vmu_data.vmu_calc_flags);
2069
2070 result = cache->vmc_results;
2071 for (entity = vmu_data.vmu_entities; entity != NULL;
2072 entity = entity->vme_next) {
2073 *result = entity->vme_result;
2074 result++;
2075 }
2076 cache->vmc_timestamp = gethrtime();
2077 vmu_cache_hold(cache);
2078
2079 vmu_data.vmu_calc_flags = 0;
2080 vmu_data.vmu_calc_thread = NULL;
2081
2082 if (vmu_data.vmu_pending_waiters > 0)
2083 cv_broadcast(&vmu_data.vmu_cv);
2084
2085 mutex_exit(&vmu_data.vmu_lock);
2086
2087 /* update zone's phys. mem. rctl usage */
2088 vmu_update_zone_rctls(cache);
2089 /* copy cache */
2090 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2091 req_zone_id, cpflg);
2092 mutex_enter(&vmu_data.vmu_lock);
2093 vmu_cache_rele(cache);
2094 mutex_exit(&vmu_data.vmu_lock);
2095
2096 return (ret);
2097 }
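	/*
	 * Another thread is performing the calculation.  Record this
	 * caller's flags as pending so a subsequent calculation includes
	 * them, then wait for the calculation to finish (or for a signal)
	 * and retry from the top.
	 */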
2098 vmu_data.vmu_pending_flags |= flags;
2099 vmu_data.vmu_pending_waiters++;
2100 while (vmu_data.vmu_calc_thread != NULL) {
2101 if (cv_wait_sig(&vmu_data.vmu_cv,
2102 &vmu_data.vmu_lock) == 0) {
2103 vmu_data.vmu_pending_waiters--;
2104 mutex_exit(&vmu_data.vmu_lock);
2105 return (set_errno(EINTR));
2106 }
2107 }
2108 vmu_data.vmu_pending_waiters--;
2109 goto start;
2110 }
2111
2112 #if defined(__x86)
2113 /*
2114 * Attempt to invalidate all of the pages in the mapping for the given process.
2115 */
2116 static void
2117 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2118 {
2119 page_t *pp;
2120 size_t psize;
2121 u_offset_t off;
2122 caddr_t eaddr;
2123 struct vnode *vp;
2124 struct segvn_data *svd;
2125 struct hat *victim_hat;
2126
2127 ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2128
2129 victim_hat = p->p_as->a_hat;
2130 svd = (struct segvn_data *)seg->s_data;
2131 vp = svd->vp;
2132 psize = page_get_pagesize(seg->s_szc);
2133
2134 off = svd->offset + (uintptr_t)(addr - seg->s_base);
2135
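	/*
	 * Walk the mapping one page at a time (using the segment's page
	 * size), looking up each in-core page by its vnode and offset.
	 */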
2136 for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2137 pp = page_lookup_nowait(vp, off, SE_SHARED);
2138
2139 if (pp != NULL) {
2140 /* following logic based on pvn_getdirty() */
2141
2142 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2143 page_unlock(pp);
2144 continue;
2145 }
2146
2147 page_io_lock(pp);
2148 hat_page_inval(pp, 0, victim_hat);
2149 page_io_unlock(pp);
2150
2151 /*
2152 * For B_INVALCURONLY-style handling we let
2153 * page_release call VN_DISPOSE if no one else is using
2154 * the page.
2155 *
2156 * A hat_ismod() check would be useless because:
			 * (1) we are not holding the SE_EXCL lock
2158 * (2) we've not unloaded _all_ translations
2159 *
2160 * Let page_release() do the heavy-lifting.
2161 */
2162 (void) page_release(pp, 1);
2163 }
2164 }
2165 }
2166
2167 /*
2168 * vm_map_inval()
2169 *
2170 * Invalidate as many pages as possible within the given mapping for the given
2171 * process. addr is expected to be the base address of the mapping and size is
2172 * the length of the mapping. In some cases a mapping will encompass an
2173 * entire segment, but at least for anon or stack mappings, these will be
2174 * regions within a single large segment. Thus, the invalidation is oriented
2175 * around a single mapping and not an entire segment.
2176 *
 * The SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling,
 * so this code is only applicable to x86.
2179 */
2180 int
2181 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2182 {
2183 int ret;
2184 int error = 0;
2185 proc_t *p; /* target proc */
2186 struct as *as; /* target proc's address space */
2187 struct seg *seg; /* working segment */
2188
2189 if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2190 return (set_errno(EPERM));
2191
2192 /* If not a valid mapping address, return an error */
2193 if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2194 return (set_errno(EINVAL));
2195
2196 again:
2197 mutex_enter(&pidlock);
2198 p = prfind(pid);
2199 if (p == NULL) {
2200 mutex_exit(&pidlock);
2201 return (set_errno(ESRCH));
2202 }
2203
2204 mutex_enter(&p->p_lock);
2205 mutex_exit(&pidlock);
2206
2207 if (panicstr != NULL) {
2208 mutex_exit(&p->p_lock);
2209 return (0);
2210 }
2211
2212 as = p->p_as;
2213
2214 /*
2215 * Try to set P_PR_LOCK - prevents process "changing shape"
2216 * - blocks fork
2217 * - blocks sigkill
2218 * - cannot be a system proc
2219 * - must be fully created proc
2220 */
2221 ret = sprtrylock_proc(p);
2222 if (ret == -1) {
2223 /* Process in invalid state */
2224 mutex_exit(&p->p_lock);
2225 return (set_errno(ESRCH));
2226 }
2227
2228 if (ret == 1) {
2229 /*
2230 * P_PR_LOCK is already set. Wait and try again. This also
2231 * drops p_lock so p may no longer be valid since the proc may
2232 * have exited.
2233 */
2234 sprwaitlock_proc(p);
2235 goto again;
2236 }
2237
2238 /* P_PR_LOCK is now set */
2239 mutex_exit(&p->p_lock);
2240
2241 AS_LOCK_ENTER(as, RW_READER);
2242 if ((seg = as_segat(as, addr)) == NULL) {
2243 AS_LOCK_EXIT(as);
2244 mutex_enter(&p->p_lock);
2245 sprunlock(p);
2246 return (set_errno(ENOMEM));
2247 }
2248
2249 /*
2250 * The invalidation behavior only makes sense for vnode-backed segments.
2251 */
2252 if (seg->s_ops != &segvn_ops) {
2253 AS_LOCK_EXIT(as);
2254 mutex_enter(&p->p_lock);
2255 sprunlock(p);
2256 return (0);
2257 }
2258
2259 /*
	 * If the mapping is out of bounds of the segment, return an error.
2261 */
2262 if ((addr + size) > (seg->s_base + seg->s_size)) {
2263 AS_LOCK_EXIT(as);
2264 mutex_enter(&p->p_lock);
2265 sprunlock(p);
2266 return (set_errno(EINVAL));
2267 }
2268
2269 /*
2270 * Don't use MS_INVALCURPROC flag here since that would eventually
2271 * initiate hat invalidation based on curthread. Since we're doing this
2272 * on behalf of a different process, that would erroneously invalidate
2273 * our own process mappings.
2274 */
2275 error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2276 if (error == 0) {
2277 /*
2278 * Since we didn't invalidate during the sync above, we now
2279 * try to invalidate all of the pages in the mapping.
2280 */
2281 map_inval(p, seg, addr, size);
2282 }
2283 AS_LOCK_EXIT(as);
2284
2285 mutex_enter(&p->p_lock);
2286 sprunlock(p);
2287
2288 if (error)
2289 (void) set_errno(error);
2290 return (error);
2291 }
2292 #endif