--- old/usr/src/uts/common/vm/vm_usage.c
+++ new/usr/src/uts/common/vm/vm_usage.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2016, Joyent, Inc.
29 29 */
30 30
31 31 /*
32 32 * vm_usage
33 33 *
34 34 * This file implements the getvmusage() private system call.
35 35 * getvmusage() counts the resident memory pages and the swap
36 36 * reserved by the specified process collective. A "process collective" is
37 37 * the set of processes owned by a particular zone, project, task, or user.
38 38 *
39 39 * rss and swap are counted so that for a given process collective, a page is
40 40 * only counted once. For example, this means that if multiple processes in
41 41 * the same project map the same page, then the project will only be charged
42 42 * once for that page. On the other hand, if two processes in different
43 43 * projects map the same page, then both projects will be charged
44 44 * for the page.
45 45 *
46 46 * The vm_getusage() calculation is implemented so that the first thread
47 47 * performs the rss/swap counting. Other callers will wait for that thread to
48 48 * finish, copying the results. This enables multiple rcapds and prstats to
49 49 * consume data from the same calculation. The results are also cached so that
50 50 * a caller interested in recent results can just copy them instead of starting
51 51 * a new calculation. The caller passes the maximum age (in seconds) of the
52 52 * data. If the cached data is young enough, the cache is copied, otherwise,
53 53 * a new calculation is executed and the cache is replaced with the new
54 54 * data.
55 55 *
56 56 * The rss calculation for each process collective is as follows:
57 57 *
58 58 * - Inspect flags, determine if counting rss for zones, projects, tasks,
59 59 * and/or users.
60 60 * - For each proc:
61 61 * - Figure out proc's collectives (zone, project, task, and/or user).
62 62 * - For each seg in proc's address space:
63 63 * - If seg is private:
64 64 * - Lookup anons in the amp.
65 65 * - For incore pages not previously visited for each of
66 66 * the proc's collectives, add incore pagesize to each
67 67 * collective.
68 68 * Anons with a refcnt of 1 can be assumed not to have
69 69 * been previously visited.
70 70 * - For address ranges without anons in the amp:
71 71 * - Lookup pages in underlying vnode.
72 72 * - For incore pages not previously visited for
73 73 * each of the proc's collectives, add incore
74 74 * pagesize to each collective.
75 75 * - If seg is shared:
76 76 * - Lookup pages in the shared amp or vnode.
77 77 * - For incore pages not previously visited for each of
78 78 * the proc's collectives, add incore pagesize to each
79 79 * collective.
80 80 *
81 81 * Swap is reserved by private segments, and shared anonymous segments.
82 82 * The only shared anon segments which do not reserve swap are ISM segments
83 83 * and schedctl segments, both of which can be identified by having
84 84 * amp->swresv == 0.
85 85 *
86 86 * The swap calculation for each collective is as follows:
87 87 *
88 88 * - Inspect flags, determine if counting swap for zones, projects, tasks,
89 89 * and/or users.
90 90 * - For each proc:
91 91 * - Figure out proc's collectives (zone, project, task, and/or user).
92 92 * - For each seg in proc's address space:
93 93 * - If seg is private:
94 94 * - Add svd->swresv pages to swap count for each of the
95 95 * proc's collectives.
96 96 * - If seg is anon, shared, and amp->swresv != 0
97 97 * - For address ranges in amp not previously visited for
98 98 * each of the proc's collectives, add size of address
99 99 * range to the swap count for each collective.
100 100 *
101 101 * These two calculations are done simultaneously, with most of the work
102 102 * being done in vmu_calculate_seg(). The results of the calculation are
103 103 * copied into "vmu_data.vmu_cache_results".
104 104 *
105 105 * To perform the calculation, various things are tracked and cached:
106 106 *
107 107 * - incore/not-incore page ranges for all vnodes.
108 108 * (vmu_data.vmu_all_vnodes_hash)
109 109 * This eliminates looking up the same page more than once.
110 110 *
111 111 * - incore/not-incore page ranges for all shared amps.
112 112 * (vmu_data.vmu_all_amps_hash)
113 113 * This eliminates looking up the same page more than once.
114 114 *
115 115 * - visited page ranges for each collective.
116 116 * - per vnode (entity->vme_vnode_hash)
117 117 * - per shared amp (entity->vme_amp_hash)
118 118 * For accurate counting of map-shared and COW-shared pages.
119 119 *
120 120 * - visited private anons (refcnt > 1) for each collective.
121 - * (entity->vme_anon)
121 + * (entity->vme_anon_hash)
122 122 * For accurate counting of COW-shared pages.
123 123 *
124 124 * The common accounting structure is the vmu_entity_t, which represents
125 125 * collectives:
126 126 *
127 127 * - A zone.
128 128 * - A project, task, or user within a zone.
129 129 * - The entire system (vmu_data.vmu_system).
130 130 * - Each collapsed (col) project and user. This means a given projid or
131 131 * uid, regardless of which zone the process is in. For instance,
132 132 * project 0 in the global zone and project 0 in a non global zone are
133 133 * the same collapsed project.
134 134 *
135 135 * Each entity structure tracks which pages have already been visited for
136 136 * that entity (via previously inspected processes) so that these pages are
137 137 * not double counted.
138 138 */
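
As an illustration of the calculation and caching contract described above, here is a
minimal userland sketch of how a consumer such as prstat or rcapd might call
getvmusage(2), the public face of this code. It is not part of the patch under review;
the 5-second maximum age, the initial capacity of 1024 results, and the bare-bones
error handling are arbitrary choices for the example, and VMUSAGE_ALL_ZONES generally
requires sufficient privilege.

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/vm_usage.h>

    int
    main(void)
    {
        size_t nres = 1024;             /* arbitrary initial capacity */
        vmusage_t *buf = calloc(nres, sizeof (vmusage_t));

        if (buf == NULL)
            return (1);

        /*
         * Ask for per-zone usage no more than 5 seconds old; if the
         * kernel's cached results are younger than that, they are
         * copied instead of triggering a new calculation.
         */
        if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0) {
            perror("getvmusage");
            free(buf);
            return (1);
        }

        for (size_t i = 0; i < nres; i++) {
            (void) printf("zone %d: rss %llu bytes, swap %llu bytes\n",
                (int)buf[i].vmu_zoneid,
                (unsigned long long)buf[i].vmu_rss_all,
                (unsigned long long)buf[i].vmu_swap_all);
        }
        free(buf);
        return (0);
    }
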
139 139
140 140 #include <sys/errno.h>
141 141 #include <sys/types.h>
142 142 #include <sys/zone.h>
143 143 #include <sys/proc.h>
144 144 #include <sys/project.h>
145 145 #include <sys/task.h>
146 146 #include <sys/thread.h>
147 147 #include <sys/time.h>
148 148 #include <sys/mman.h>
149 149 #include <sys/modhash.h>
150 150 #include <sys/modhash_impl.h>
151 151 #include <sys/shm.h>
152 152 #include <sys/swap.h>
153 153 #include <sys/synch.h>
154 154 #include <sys/systm.h>
155 155 #include <sys/var.h>
156 156 #include <sys/vm_usage.h>
157 157 #include <sys/zone.h>
158 158 #include <sys/sunddi.h>
159 -#include <sys/sysmacros.h>
160 159 #include <sys/avl.h>
161 160 #include <vm/anon.h>
162 161 #include <vm/as.h>
163 162 #include <vm/seg_vn.h>
164 163 #include <vm/seg_spt.h>
165 164
166 165 #define VMUSAGE_HASH_SIZE 512
167 166
168 167 #define VMUSAGE_TYPE_VNODE 1
169 168 #define VMUSAGE_TYPE_AMP 2
170 169 #define VMUSAGE_TYPE_ANON 3
171 170
172 171 #define VMUSAGE_BOUND_UNKNOWN 0
173 172 #define VMUSAGE_BOUND_INCORE 1
174 173 #define VMUSAGE_BOUND_NOT_INCORE 2
175 174
176 175 #define ISWITHIN(node, addr) ((node)->vmb_start <= addr && \
177 176 (node)->vmb_end >= addr ? 1 : 0)
178 177
179 178 /*
180 179 * bounds for vnodes and shared amps
181 180 * Each bound is either entirely incore, entirely not in core, or
182 181 * entirely unknown. Bounds are stored in an AVL tree sorted by the start
183 182 * member when in use; otherwise (free or temporary lists) they are strung
184 183 * together off of vmb_next.
185 184 */
186 185 typedef struct vmu_bound {
187 186 avl_node_t vmb_node;
188 187 struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
189 188 pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
190 189 pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
191 190 char vmb_type; /* One of VMUSAGE_BOUND_* */
192 191 } vmu_bound_t;
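
The bound representation is easiest to picture with a small standalone sketch, shown
below; it is illustrative only and not part of the patch under review. It uses the same
libavl interfaces the kernel code relies on, with a trimmed-down demo_bound_t standing
in for vmu_bound_t (the demo_ names are invented for the example). On illumos it builds
with cc demo.c -lavl.

    #include <stdio.h>
    #include <stddef.h>
    #include <sys/avl.h>

    /* Trimmed-down stand-in for vmu_bound_t; the demo_ names are invented. */
    typedef struct demo_bound {
        avl_node_t      db_node;
        unsigned long   db_start;   /* first page offset in the bound */
        unsigned long   db_end;     /* last page offset in the bound */
        int             db_incore;  /* 1 = incore, 0 = not incore */
    } demo_bound_t;

    /* Same ordering rule as bounds_cmp(): sort by starting page offset. */
    static int
    demo_cmp(const void *a, const void *b)
    {
        const demo_bound_t *l = a, *r = b;

        if (l->db_start == r->db_start)
            return (0);
        return (l->db_start < r->db_start ? -1 : 1);
    }

    int
    main(void)
    {
        avl_tree_t tree;
        demo_bound_t *b;
        void *cookie = NULL;
        /* Three bounds tiling page offsets 0-9 of some imaginary object. */
        demo_bound_t bounds[3] = {
            { .db_start = 0, .db_end = 3, .db_incore = 1 },
            { .db_start = 4, .db_end = 7, .db_incore = 0 },
            { .db_start = 8, .db_end = 9, .db_incore = 1 }
        };

        avl_create(&tree, demo_cmp, sizeof (demo_bound_t),
            offsetof(demo_bound_t, db_node));
        for (int i = 0; i < 3; i++)
            avl_add(&tree, &bounds[i]);

        /* An in-order walk visits the bounds by ascending start offset. */
        for (b = avl_first(&tree); b != NULL; b = AVL_NEXT(&tree, b)) {
            (void) printf("[%lu, %lu] %s\n", b->db_start, b->db_end,
                b->db_incore ? "incore" : "not incore");
        }

        /* Unlink the nodes before destroying the tree. */
        while (avl_destroy_nodes(&tree, &cookie) != NULL)
            ;
        avl_destroy(&tree);
        return (0);
    }
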
193 192
194 193 /*
195 194 * hash of visited objects (vnodes or shared amps)
196 195 * key is address of vnode or amp. Bounds lists known incore/non-incore
197 196 * bounds for vnode/amp.
198 197 */
199 198 typedef struct vmu_object {
200 199 struct vmu_object *vmo_next; /* free list */
201 200 caddr_t vmo_key;
202 201 short vmo_type;
203 202 avl_tree_t vmo_bounds;
204 203 } vmu_object_t;
205 204
206 205 /*
207 - * Node for tree of visited COW anons.
208 - */
209 -typedef struct vmu_anon {
210 - avl_node_t vma_node;
211 - uintptr_t vma_addr;
212 -} vmu_anon_t;
213 -
214 -/*
215 206 * Entity by which to count results.
216 207 *
217 208 * The entity structure keeps the current rss/swap counts for each entity
218 209 * (zone, project, etc), and hashes of vm structures that have already
219 210 * been visited for the entity.
220 211 *
221 212 * vme_next: links the list of all entities currently being counted by
222 213 * vmu_calculate().
223 214 *
224 215 * vme_next_calc: links the list of entities related to the current process
225 216 * being counted by vmu_calculate_proc().
226 217 *
227 218 * vmu_calculate_proc() walks all processes. For each process, it makes a
228 219 * list of the entities related to that process using vme_next_calc. This
229 220 * list changes each time vmu_calculate_proc() is called.
230 221 *
231 222 */
232 223 typedef struct vmu_entity {
233 224 struct vmu_entity *vme_next;
234 225 struct vmu_entity *vme_next_calc;
235 226 mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
236 227 mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
237 - avl_tree_t vme_anon; /* COW anons visited for entity */
228 + mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
238 229 vmusage_t vme_result; /* identifies entity and results */
239 230 } vmu_entity_t;
240 231
241 232 /*
242 233 * Hash of entities visited within a zone, and an entity for the zone
243 234 * itself.
244 235 */
245 236 typedef struct vmu_zone {
246 237 struct vmu_zone *vmz_next; /* free list */
247 238 id_t vmz_id;
248 239 vmu_entity_t *vmz_zone;
249 240 mod_hash_t *vmz_projects_hash;
250 241 mod_hash_t *vmz_tasks_hash;
251 242 mod_hash_t *vmz_rusers_hash;
252 243 mod_hash_t *vmz_eusers_hash;
253 244 } vmu_zone_t;
254 245
255 246 /*
256 247 * Cache of results from last calculation
257 248 */
258 249 typedef struct vmu_cache {
259 250 vmusage_t *vmc_results; /* Results from last call to */
260 251 /* vm_getusage(). */
261 252 uint64_t vmc_nresults; /* Count of cached results */
262 253 uint64_t vmc_refcnt; /* refcnt for free */
263 254 uint_t vmc_flags; /* Flags for vm_getusage() */
264 255 hrtime_t vmc_timestamp; /* when cache was created */
265 256 } vmu_cache_t;
266 257
267 258 /*
268 259 * top level rss info for the system
269 260 */
270 261 typedef struct vmu_data {
271 262 kmutex_t vmu_lock; /* Protects vmu_data */
272 263 kcondvar_t vmu_cv; /* Used to signal threads */
273 264 /* Waiting for */
274 265 /* Rss_calc_thread to finish */
275 266 vmu_entity_t *vmu_system; /* Entity for tracking */
276 267 /* rss/swap for all processes */
277 268 /* in all zones */
278 269 mod_hash_t *vmu_zones_hash; /* Zones visited */
279 270 mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
280 271 mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
281 272 mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
282 273 /* to implement VMUSAGE_COL_* */
283 274 /* flags, which aggregate by */
284 275 /* project or user regardless */
285 276 /* of zoneid. */
286 277 mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
287 278 /* to track incore/not-incore */
288 279 mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
289 280 /* amps to track incore/not- */
290 281 /* incore */
291 282 vmu_entity_t *vmu_entities; /* Linked list of entities */
292 283 size_t vmu_nentities; /* Count of entities in list */
293 284 vmu_cache_t *vmu_cache; /* Cached results */
294 285 kthread_t *vmu_calc_thread; /* NULL, or thread running */
295 286 /* vmu_calculate() */
296 287 uint_t vmu_calc_flags; /* Flags being used by */
297 288 /* currently running calc */
298 289 /* thread */
299 290 uint_t vmu_pending_flags; /* Flags of vm_getusage() */
300 291 /* threads waiting for */
301 292 /* calc thread to finish */
302 293 uint_t vmu_pending_waiters; /* Number of threads waiting */
303 294 /* for calc thread */
304 295 vmu_bound_t *vmu_free_bounds;
305 296 vmu_object_t *vmu_free_objects;
306 297 vmu_entity_t *vmu_free_entities;
307 298 vmu_zone_t *vmu_free_zones;
308 299 } vmu_data_t;
309 300
310 301 extern struct as kas;
311 302 extern proc_t *practive;
312 303 extern zone_t *global_zone;
313 304 extern struct seg_ops segvn_ops;
314 305 extern struct seg_ops segspt_shmops;
315 306
316 307 static vmu_data_t vmu_data;
317 308 static kmem_cache_t *vmu_bound_cache;
318 309 static kmem_cache_t *vmu_object_cache;
319 310
320 311 /*
321 312 * Comparison routine for AVL tree. We base our comparison on vmb_start.
322 313 */
323 314 static int
324 315 bounds_cmp(const void *bnd1, const void *bnd2)
325 316 {
326 317 const vmu_bound_t *bound1 = bnd1;
327 318 const vmu_bound_t *bound2 = bnd2;
328 319
329 320 if (bound1->vmb_start == bound2->vmb_start) {
330 321 return (0);
331 322 }
332 323 if (bound1->vmb_start < bound2->vmb_start) {
333 324 return (-1);
334 325 }
335 326
336 327 return (1);
337 328 }
338 329
339 330 /*
340 - * Comparison routine for our AVL tree of anon structures.
341 - */
342 -static int
343 -vmu_anon_cmp(const void *lhs, const void *rhs)
344 -{
345 - const vmu_anon_t *l = lhs, *r = rhs;
346 -
347 - if (l->vma_addr == r->vma_addr)
348 - return (0);
349 -
350 - if (l->vma_addr < r->vma_addr)
351 - return (-1);
352 -
353 - return (1);
354 -}
355 -
356 -/*
357 331 * Save a bound on the free list.
358 332 */
359 333 static void
360 334 vmu_free_bound(vmu_bound_t *bound)
361 335 {
362 336 bound->vmb_next = vmu_data.vmu_free_bounds;
363 337 bound->vmb_start = 0;
364 338 bound->vmb_end = 0;
365 339 bound->vmb_type = 0;
366 340 vmu_data.vmu_free_bounds = bound;
367 341 }
368 342
369 343 /*
370 344 * Free an object, and all visited bound info.
371 345 */
372 346 static void
373 347 vmu_free_object(mod_hash_val_t val)
374 348 {
375 349 vmu_object_t *obj = (vmu_object_t *)val;
376 350 avl_tree_t *tree = &(obj->vmo_bounds);
377 351 vmu_bound_t *bound;
378 352 void *cookie = NULL;
379 353
380 354 while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
381 355 vmu_free_bound(bound);
382 356 avl_destroy(tree);
383 357
384 358 obj->vmo_type = 0;
385 359 obj->vmo_next = vmu_data.vmu_free_objects;
386 360 vmu_data.vmu_free_objects = obj;
387 361 }
388 362
389 363 /*
390 364 * Free an entity, and hashes of visited objects for that entity.
391 365 */
392 366 static void
393 367 vmu_free_entity(mod_hash_val_t val)
394 368 {
395 369 vmu_entity_t *entity = (vmu_entity_t *)val;
396 - vmu_anon_t *anon;
397 - void *cookie = NULL;
398 370
399 371 if (entity->vme_vnode_hash != NULL)
400 372 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
401 373 if (entity->vme_amp_hash != NULL)
402 374 i_mod_hash_clear_nosync(entity->vme_amp_hash);
375 + if (entity->vme_anon_hash != NULL)
376 + i_mod_hash_clear_nosync(entity->vme_anon_hash);
403 377
404 - while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
405 - kmem_free(anon, sizeof (vmu_anon_t));
406 -
407 - avl_destroy(&entity->vme_anon);
408 -
409 378 entity->vme_next = vmu_data.vmu_free_entities;
410 379 vmu_data.vmu_free_entities = entity;
411 380 }
412 381
413 382 /*
414 383 * Free zone entity, and all hashes of entities inside that zone,
415 384 * which are projects, tasks, and users.
416 385 */
417 386 static void
418 387 vmu_free_zone(mod_hash_val_t val)
419 388 {
420 389 vmu_zone_t *zone = (vmu_zone_t *)val;
421 390
422 391 if (zone->vmz_zone != NULL) {
423 392 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
424 393 zone->vmz_zone = NULL;
425 394 }
426 395 if (zone->vmz_projects_hash != NULL)
427 396 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
428 397 if (zone->vmz_tasks_hash != NULL)
429 398 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
430 399 if (zone->vmz_rusers_hash != NULL)
431 400 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
432 401 if (zone->vmz_eusers_hash != NULL)
433 402 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
434 403 zone->vmz_next = vmu_data.vmu_free_zones;
435 404 vmu_data.vmu_free_zones = zone;
436 405 }
437 406
438 407 /*
439 408 * Initialize synchronization primitives and hashes for system-wide tracking
440 409 * of visited vnodes and shared amps. Initialize results cache.
441 410 */
442 411 void
443 412 vm_usage_init()
444 413 {
445 414 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
446 415 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
447 416
448 417 vmu_data.vmu_system = NULL;
449 418 vmu_data.vmu_zones_hash = NULL;
450 419 vmu_data.vmu_projects_col_hash = NULL;
451 420 vmu_data.vmu_rusers_col_hash = NULL;
452 421 vmu_data.vmu_eusers_col_hash = NULL;
453 422
454 423 vmu_data.vmu_free_bounds = NULL;
455 424 vmu_data.vmu_free_objects = NULL;
456 425 vmu_data.vmu_free_entities = NULL;
457 426 vmu_data.vmu_free_zones = NULL;
458 427
459 428 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
460 429 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
461 430 sizeof (vnode_t));
462 431 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
463 432 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
464 433 sizeof (struct anon_map));
465 434 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
466 435 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
467 436 vmu_free_entity);
468 437 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
469 438 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
470 439 vmu_free_entity);
471 440 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
472 441 "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
473 442 vmu_free_entity);
474 443 vmu_data.vmu_zones_hash = mod_hash_create_idhash(
475 444 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
476 445
477 446 vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
478 447 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
479 448 vmu_object_cache = kmem_cache_create("vmu_object_cache",
480 449 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
481 450
482 451 vmu_data.vmu_entities = NULL;
483 452 vmu_data.vmu_nentities = 0;
484 453
485 454 vmu_data.vmu_cache = NULL;
486 455 vmu_data.vmu_calc_thread = NULL;
487 456 vmu_data.vmu_calc_flags = 0;
488 457 vmu_data.vmu_pending_flags = 0;
489 458 vmu_data.vmu_pending_waiters = 0;
490 459 }
491 460
492 461 /*
493 462 * Allocate hashes for tracking vm objects visited for an entity.
494 463 * Update list of entities.
495 464 */
496 465 static vmu_entity_t *
497 466 vmu_alloc_entity(id_t id, int type, id_t zoneid)
498 467 {
499 468 vmu_entity_t *entity;
500 469
501 470 if (vmu_data.vmu_free_entities != NULL) {
502 471 entity = vmu_data.vmu_free_entities;
503 472 vmu_data.vmu_free_entities =
504 473 vmu_data.vmu_free_entities->vme_next;
505 474 bzero(&entity->vme_result, sizeof (vmusage_t));
506 475 } else {
507 476 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
508 477 }
509 478 entity->vme_result.vmu_id = id;
510 479 entity->vme_result.vmu_zoneid = zoneid;
511 480 entity->vme_result.vmu_type = type;
512 481
513 482 if (entity->vme_vnode_hash == NULL)
514 483 entity->vme_vnode_hash = mod_hash_create_ptrhash(
515 484 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
516 485 sizeof (vnode_t));
517 486
518 487 if (entity->vme_amp_hash == NULL)
519 488 entity->vme_amp_hash = mod_hash_create_ptrhash(
520 489 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
521 490 sizeof (struct anon_map));
522 491
523 - VERIFY(avl_first(&entity->vme_anon) == NULL);
492 + if (entity->vme_anon_hash == NULL)
493 + entity->vme_anon_hash = mod_hash_create_ptrhash(
494 + "vmusage anon hash", VMUSAGE_HASH_SIZE,
495 + mod_hash_null_valdtor, sizeof (struct anon));
524 496
525 - avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
526 - offsetof(struct vmu_anon, vma_node));
527 -
528 497 entity->vme_next = vmu_data.vmu_entities;
529 498 vmu_data.vmu_entities = entity;
530 499 vmu_data.vmu_nentities++;
531 500
532 501 return (entity);
533 502 }
534 503
535 504 /*
536 505 * Allocate a zone entity, and hashes for tracking visited vm objects
537 506 * for projects, tasks, and users within that zone.
538 507 */
539 508 static vmu_zone_t *
540 509 vmu_alloc_zone(id_t id)
541 510 {
542 511 vmu_zone_t *zone;
543 512
544 513 if (vmu_data.vmu_free_zones != NULL) {
545 514 zone = vmu_data.vmu_free_zones;
546 515 vmu_data.vmu_free_zones =
547 516 vmu_data.vmu_free_zones->vmz_next;
548 517 zone->vmz_next = NULL;
549 518 zone->vmz_zone = NULL;
550 519 } else {
551 520 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
552 521 }
553 522
554 523 zone->vmz_id = id;
555 524
556 525 if ((vmu_data.vmu_calc_flags &
557 526 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
558 527 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
559 528
560 529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
561 530 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
562 531 zone->vmz_projects_hash = mod_hash_create_idhash(
563 532 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
564 533
565 534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
566 535 != 0 && zone->vmz_tasks_hash == NULL)
567 536 zone->vmz_tasks_hash = mod_hash_create_idhash(
568 537 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
569 538
570 539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
571 540 != 0 && zone->vmz_rusers_hash == NULL)
572 541 zone->vmz_rusers_hash = mod_hash_create_idhash(
573 542 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
574 543
575 544 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
576 545 != 0 && zone->vmz_eusers_hash == NULL)
577 546 zone->vmz_eusers_hash = mod_hash_create_idhash(
578 547 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
579 548
580 549 return (zone);
581 550 }
582 551
583 552 /*
584 553 * Allocate a structure for tracking visited bounds for a vm object.
585 554 */
586 555 static vmu_object_t *
587 556 vmu_alloc_object(caddr_t key, int type)
588 557 {
589 558 vmu_object_t *object;
590 559
591 560 if (vmu_data.vmu_free_objects != NULL) {
592 561 object = vmu_data.vmu_free_objects;
593 562 vmu_data.vmu_free_objects =
594 563 vmu_data.vmu_free_objects->vmo_next;
595 564 } else {
596 565 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
597 566 }
598 567
599 568 object->vmo_next = NULL;
600 569 object->vmo_key = key;
601 570 object->vmo_type = type;
602 571 avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
603 572
604 573 return (object);
605 574 }
606 575
607 576 /*
608 577 * Allocate and return a bound structure.
609 578 */
610 579 static vmu_bound_t *
611 580 vmu_alloc_bound()
612 581 {
613 582 vmu_bound_t *bound;
614 583
615 584 if (vmu_data.vmu_free_bounds != NULL) {
616 585 bound = vmu_data.vmu_free_bounds;
617 586 vmu_data.vmu_free_bounds =
618 587 vmu_data.vmu_free_bounds->vmb_next;
619 588 } else {
620 589 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
621 590 }
622 591
623 592 bound->vmb_next = NULL;
624 593 bound->vmb_start = 0;
625 594 bound->vmb_end = 0;
626 595 bound->vmb_type = 0;
627 596 return (bound);
628 597 }
629 598
630 599 /*
631 600 * vmu_find_insert_* functions implement hash lookup or allocate and
632 601 * insert operations.
633 602 */
634 603 static vmu_object_t *
635 604 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
636 605 {
637 606 int ret;
638 607 vmu_object_t *object;
639 608
640 609 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
641 610 (mod_hash_val_t *)&object);
642 611 if (ret != 0) {
643 612 object = vmu_alloc_object(key, type);
644 613 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
645 614 (mod_hash_val_t)object, (mod_hash_hndl_t)0);
646 615 ASSERT(ret == 0);
647 616 }
648 617 return (object);
649 618 }
650 619
651 620 static int
652 -vmu_find_insert_anon(vmu_entity_t *entity, void *key)
621 +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
653 622 {
654 - vmu_anon_t anon, *ap;
623 + int ret;
624 + caddr_t val;
655 625
656 - anon.vma_addr = (uintptr_t)key;
626 + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
627 + (mod_hash_val_t *)&val);
657 628
658 - if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
629 + if (ret == 0)
659 630 return (0);
660 631
661 - ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
662 - ap->vma_addr = (uintptr_t)key;
632 + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
633 + (mod_hash_val_t)key, (mod_hash_hndl_t)0);
663 634
664 - avl_add(&entity->vme_anon, ap);
635 + ASSERT(ret == 0);
665 636
666 637 return (1);
667 638 }
668 639
669 640 static vmu_entity_t *
670 641 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
671 642 {
672 643 int ret;
673 644 vmu_entity_t *entity;
674 645
675 646 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
676 647 (mod_hash_val_t *)&entity);
677 648 if (ret != 0) {
678 649 entity = vmu_alloc_entity(id, type, zoneid);
679 650 ret = i_mod_hash_insert_nosync(hash,
680 651 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
681 652 (mod_hash_hndl_t)0);
682 653 ASSERT(ret == 0);
683 654 }
684 655 return (entity);
685 656 }
686 657
687 658
688 659
689 660
690 661 /*
691 662 * Returns list of object bounds between start and end. New bounds inserted
692 663 * by this call are given type.
693 664 *
694 665 * Returns the number of pages covered if new bounds are created. Returns 0
695 666 * if region between start/end consists of all existing bounds.
696 667 */
697 668 static pgcnt_t
698 669 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
699 670 end, char type, vmu_bound_t **first, vmu_bound_t **last)
700 671 {
701 672 avl_tree_t *tree = &(ro->vmo_bounds);
702 673 avl_index_t where;
703 674 vmu_bound_t *walker, *tmp;
704 675 pgcnt_t ret = 0;
705 676
706 677 ASSERT(start <= end);
707 678
708 679 *first = *last = NULL;
709 680
710 681 tmp = vmu_alloc_bound();
711 682 tmp->vmb_start = start;
712 683 tmp->vmb_type = type;
713 684
714 685 /* Hopelessly optimistic case. */
715 686 if (walker = avl_find(tree, tmp, &where)) {
716 687 /* We got lucky. */
717 688 vmu_free_bound(tmp);
718 689 *first = walker;
719 690 }
720 691
721 692 if (walker == NULL) {
722 693 /* Is start in the previous node? */
723 694 walker = avl_nearest(tree, where, AVL_BEFORE);
724 695 if (walker != NULL) {
725 696 if (ISWITHIN(walker, start)) {
726 697 /* We found start. */
727 698 vmu_free_bound(tmp);
728 699 *first = walker;
729 700 }
730 701 }
731 702 }
732 703
733 704 /*
734 705 * At this point, if *first is still NULL, then we
735 706 * didn't get a direct hit and start isn't covered
736 707 * by the previous node. We know that the next node
737 708 * must have a greater start value than we require
738 709 * because avl_find tells us where the AVL routines would
739 710 * insert our new node. We have some gap between the
740 711 * start we want and the next node.
741 712 */
742 713 if (*first == NULL) {
743 714 walker = avl_nearest(tree, where, AVL_AFTER);
744 715 if (walker != NULL && walker->vmb_start <= end) {
745 716 /* Fill the gap. */
746 717 tmp->vmb_end = walker->vmb_start - 1;
747 718 *first = tmp;
748 719 } else {
749 720 /* We have a gap over [start, end]. */
750 721 tmp->vmb_end = end;
751 722 *first = *last = tmp;
752 723 }
753 724 ret += tmp->vmb_end - tmp->vmb_start + 1;
754 725 avl_insert(tree, tmp, where);
755 726 }
756 727
757 728 ASSERT(*first != NULL);
758 729
759 730 if (*last != NULL) {
760 731 /* We're done. */
761 732 return (ret);
762 733 }
763 734
764 735 /*
765 736 * If we are here we still need to set *last and
766 737 * that may involve filling in some gaps.
767 738 */
768 739 *last = *first;
769 740 for (;;) {
770 741 if (ISWITHIN(*last, end)) {
771 742 /* We're done. */
772 743 break;
773 744 }
774 745 walker = AVL_NEXT(tree, *last);
775 746 if (walker == NULL || walker->vmb_start > end) {
776 747 /* Bottom or mid tree with gap. */
777 748 tmp = vmu_alloc_bound();
778 749 tmp->vmb_start = (*last)->vmb_end + 1;
779 750 tmp->vmb_end = end;
780 751 tmp->vmb_type = type;
781 752 ret += tmp->vmb_end - tmp->vmb_start + 1;
782 753 avl_insert_here(tree, tmp, *last, AVL_AFTER);
783 754 *last = tmp;
784 755 break;
785 756 } else {
786 757 if ((*last)->vmb_end + 1 != walker->vmb_start) {
787 758 /* Non-contiguous. */
788 759 tmp = vmu_alloc_bound();
789 760 tmp->vmb_start = (*last)->vmb_end + 1;
790 761 tmp->vmb_end = walker->vmb_start - 1;
791 762 tmp->vmb_type = type;
792 763 ret += tmp->vmb_end - tmp->vmb_start + 1;
793 764 avl_insert_here(tree, tmp, *last, AVL_AFTER);
794 765 *last = tmp;
795 766 } else {
796 767 *last = walker;
797 768 }
798 769 }
799 770 }
800 771
801 772 return (ret);
802 773 }
803 774
804 775 /*
805 776 * vmu_update_bounds()
806 777 *
807 778 * tree: avl_tree in which first and last hang.
808 779 *
809 780 * first, last: list of continuous bounds, of which zero or more are of
810 781 * type VMUSAGE_BOUND_UNKNOWN.
811 782 *
812 783 * new_tree: avl_tree in which new_first and new_last hang.
813 784 *
814 785 * new_first, new_last: list of continuous bounds, of which none are of
815 786 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
816 787 * update the types of bounds in (first,last) with
817 788 * type VMUSAGE_BOUND_UNKNOWN.
818 789 *
819 790 * For the list of bounds (first,last), this function updates any bounds
820 791 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
821 792 * the list (new_first, new_last).
822 793 *
823 794 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
824 795 * (new_first, new_last), it will be split into multiple bounds.
825 796 *
826 797 * Return value:
827 798 * The number of pages in the list of bounds (first,last) that were of
828 799 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
829 800 * VMUSAGE_BOUND_INCORE.
830 801 *
831 802 */
832 803 static pgcnt_t
833 804 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
834 805 avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
835 806 {
836 807 vmu_bound_t *next, *new_next, *tmp;
837 808 pgcnt_t rss = 0;
838 809
839 810 next = *first;
840 811 new_next = new_first;
841 812
842 813 /*
843 814 * Verify first and last bound are covered by new bounds if they
844 815 * have unknown type.
845 816 */
846 817 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
847 818 (*first)->vmb_start >= new_first->vmb_start);
848 819 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
849 820 (*last)->vmb_end <= new_last->vmb_end);
850 821 for (;;) {
851 822 /* If bound already has type, proceed to next bound. */
852 823 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
853 824 if (next == *last)
854 825 break;
855 826 next = AVL_NEXT(tree, next);
856 827 continue;
857 828 }
858 829 while (new_next->vmb_end < next->vmb_start)
859 830 new_next = AVL_NEXT(new_tree, new_next);
860 831 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
861 832 next->vmb_type = new_next->vmb_type;
862 833 if (new_next->vmb_end < next->vmb_end) {
863 834 /* need to split bound */
864 835 tmp = vmu_alloc_bound();
865 836 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
866 837 tmp->vmb_start = new_next->vmb_end + 1;
867 838 tmp->vmb_end = next->vmb_end;
868 839 avl_insert_here(tree, tmp, next, AVL_AFTER);
869 840 next->vmb_end = new_next->vmb_end;
870 841 if (*last == next)
871 842 *last = tmp;
872 843 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
873 844 rss += next->vmb_end - next->vmb_start + 1;
874 845 next = tmp;
875 846 } else {
876 847 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
877 848 rss += next->vmb_end - next->vmb_start + 1;
878 849 if (next == *last)
879 850 break;
880 851 next = AVL_NEXT(tree, next);
881 852 }
882 853 }
883 854 return (rss);
884 855 }
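
A brief worked example of the update rule, with made-up page offsets (illustrative
only, not part of the patch): suppose (first, last) is the single bound [10, 19] of
type UNKNOWN, while (new_first, new_last) holds [10, 14] INCORE followed by [15, 19]
NOT_INCORE. The first pass sets the bound's type to INCORE, splits off a new UNKNOWN
bound [15, 19] because the known bound ends early, and counts 5 incore pages; the next
pass resolves the split-off bound to NOT_INCORE and adds nothing, so the function
returns 5.
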
885 856
886 857 /*
887 858 * Merges adjacent bounds with same type between first and last bound.
888 859 * After merge, last pointer may point to a different bound, as (incoming)
889 860 * last bound may have been merged away.
890 861 */
891 862 static void
892 863 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
893 864 {
894 865 vmu_bound_t *current;
895 866 vmu_bound_t *next;
896 867
897 868 ASSERT(tree != NULL);
898 869 ASSERT(*first != NULL);
899 870 ASSERT(*last != NULL);
900 871
901 872 current = *first;
902 873 while (current != *last) {
903 874 next = AVL_NEXT(tree, current);
904 875 if ((current->vmb_end + 1) == next->vmb_start &&
905 876 current->vmb_type == next->vmb_type) {
906 877 current->vmb_end = next->vmb_end;
907 878 avl_remove(tree, next);
908 879 vmu_free_bound(next);
909 880 if (next == *last) {
910 881 *last = current;
911 882 }
912 883 } else {
913 884 current = AVL_NEXT(tree, current);
914 885 }
915 886 }
916 887 }
917 888
918 889 /*
919 890 * Given an amp and a list of bounds, updates each bound's type with
920 891 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
921 892 *
922 893 * If a bound is partially incore, it will be split into two bounds.
923 894 * first and last may be modified, as bounds may be split into multiple
924 895 * bounds if they are partially incore/not-incore.
925 896 *
926 897 * Set incore to non-zero if bounds are already known to be incore.
927 898 *
928 899 */
929 900 static void
930 901 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
931 902 vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
932 903 {
933 904 vmu_bound_t *next;
934 905 vmu_bound_t *tmp;
935 906 pgcnt_t index;
936 907 short bound_type;
937 908 short page_type;
938 909 vnode_t *vn;
939 910 anoff_t off;
940 911 struct anon *ap;
941 912
942 913 next = *first;
943 914 /* Shared anon slots don't change once set. */
944 915 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
945 916 for (;;) {
946 917 if (incore == B_TRUE)
947 918 next->vmb_type = VMUSAGE_BOUND_INCORE;
948 919
949 920 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
950 921 if (next == *last)
951 922 break;
952 923 next = AVL_NEXT(tree, next);
953 924 continue;
954 925 }
955 926
956 927 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
957 928 bound_type = next->vmb_type;
958 929 index = next->vmb_start;
959 930 while (index <= next->vmb_end) {
960 931
961 932 /*
962 933 * These are used to determine how much to increment
963 934 * index when a large page is found.
964 935 */
965 936 page_t *page;
966 937 pgcnt_t pgcnt = 1;
967 938 uint_t pgshft;
968 939 pgcnt_t pgmsk;
969 940
970 941 ap = anon_get_ptr(amp->ahp, index);
971 942 if (ap != NULL)
972 943 swap_xlate(ap, &vn, &off);
973 944
974 945 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
975 946 (page = page_exists(vn, off)) != NULL) {
976 947 if (PP_ISFREE(page))
977 948 page_type = VMUSAGE_BOUND_NOT_INCORE;
978 949 else
979 950 page_type = VMUSAGE_BOUND_INCORE;
980 951 if (page->p_szc > 0) {
981 952 pgcnt = page_get_pagecnt(page->p_szc);
982 953 pgshft = page_get_shift(page->p_szc);
983 954 pgmsk = (0x1 << (pgshft - PAGESHIFT))
984 955 - 1;
985 956 }
986 957 } else {
987 958 page_type = VMUSAGE_BOUND_NOT_INCORE;
988 959 }
989 960
990 961 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
991 962 next->vmb_type = page_type;
992 963 bound_type = page_type;
993 964 } else if (next->vmb_type != page_type) {
994 965 /*
995 966 * If current bound type does not match page
996 967 * type, need to split off new bound.
997 968 */
998 969 tmp = vmu_alloc_bound();
999 970 tmp->vmb_type = page_type;
1000 971 tmp->vmb_start = index;
1001 972 tmp->vmb_end = next->vmb_end;
1002 973 avl_insert_here(tree, tmp, next, AVL_AFTER);
1003 974 next->vmb_end = index - 1;
1004 975 if (*last == next)
1005 976 *last = tmp;
1006 977 next = tmp;
1007 978 }
1008 979 if (pgcnt > 1) {
1009 980 /*
1010 981 * If inside large page, jump to next large
1011 982 * page
1012 983 */
1013 984 index = (index & ~pgmsk) + pgcnt;
1014 985 } else {
1015 986 index++;
1016 987 }
1017 988 }
1018 989 if (next == *last) {
1019 990 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1020 991 break;
1021 992 } else
1022 993 next = AVL_NEXT(tree, next);
1023 994 }
1024 995 ANON_LOCK_EXIT(&->a_rwlock);
1025 996 }
1026 997
1027 998 /*
1028 999 * Same as vmu_amp_update_incore_bounds(), except for tracking
1029 1000 * incore-/not-incore for vnodes.
1030 1001 */
1031 1002 static void
1032 1003 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
1033 1004 vmu_bound_t **first, vmu_bound_t **last)
1034 1005 {
1035 1006 vmu_bound_t *next;
1036 1007 vmu_bound_t *tmp;
1037 1008 pgcnt_t index;
1038 1009 short bound_type;
1039 1010 short page_type;
1040 1011
1041 1012 next = *first;
1042 1013 for (;;) {
1043 1014 if (vnode->v_pages == NULL)
1044 1015 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1045 1016
1046 1017 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1047 1018 if (next == *last)
1048 1019 break;
1049 1020 next = AVL_NEXT(tree, next);
1050 1021 continue;
1051 1022 }
1052 1023
1053 1024 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1054 1025 bound_type = next->vmb_type;
1055 1026 index = next->vmb_start;
1056 1027 while (index <= next->vmb_end) {
1057 1028
1058 1029 /*
1059 1030 * These are used to determine how much to increment
1060 1031 * index when a large page is found.
1061 1032 */
1062 1033 page_t *page;
1063 1034 pgcnt_t pgcnt = 1;
1064 1035 uint_t pgshft;
1065 1036 pgcnt_t pgmsk;
1066 1037
1067 1038 if (vnode->v_pages != NULL &&
1068 1039 (page = page_exists(vnode, ptob(index))) != NULL) {
1069 1040 if (PP_ISFREE(page))
1070 1041 page_type = VMUSAGE_BOUND_NOT_INCORE;
1071 1042 else
1072 1043 page_type = VMUSAGE_BOUND_INCORE;
1073 1044 if (page->p_szc > 0) {
1074 1045 pgcnt = page_get_pagecnt(page->p_szc);
1075 1046 pgshft = page_get_shift(page->p_szc);
1076 1047 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1077 1048 - 1;
1078 1049 }
1079 1050 } else {
1080 1051 page_type = VMUSAGE_BOUND_NOT_INCORE;
1081 1052 }
1082 1053
1083 1054 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1084 1055 next->vmb_type = page_type;
1085 1056 bound_type = page_type;
1086 1057 } else if (next->vmb_type != page_type) {
1087 1058 /*
1088 1059 * If current bound type does not match page
1089 1060 * type, need to split off new bound.
1090 1061 */
1091 1062 tmp = vmu_alloc_bound();
1092 1063 tmp->vmb_type = page_type;
1093 1064 tmp->vmb_start = index;
1094 1065 tmp->vmb_end = next->vmb_end;
1095 1066 avl_insert_here(tree, tmp, next, AVL_AFTER);
1096 1067 next->vmb_end = index - 1;
1097 1068 if (*last == next)
1098 1069 *last = tmp;
1099 1070 next = tmp;
1100 1071 }
1101 1072 if (pgcnt > 1) {
1102 1073 /*
1103 1074 * If inside large page, jump to next large
1104 1075 * page
1105 1076 */
1106 1077 index = (index & ~pgmsk) + pgcnt;
1107 1078 } else {
1108 1079 index++;
1109 1080 }
1110 1081 }
1111 1082 if (next == *last) {
1112 1083 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1113 1084 break;
1114 1085 } else
1115 1086 next = AVL_NEXT(tree, next);
1116 1087 }
1117 1088 }
1118 1089
1119 1090 /*
1120 1091 * Calculate the rss and swap consumed by a segment. vmu_entities is the
1121 1092 * list of entities to visit. For shared segments, the vnode or amp
1122 1093 * is looked up in each entity to see if it has been already counted. Private
1123 1094 * anon pages are checked per entity to ensure that COW pages are not
1124 1095 * double counted.
1125 1096 *
1126 1097 * For private mapped files, first the amp is checked for private pages.
1127 1098 * Bounds not backed by the amp are looked up in the vnode for each entity
1128 1099 * to avoid double counting of private COW vnode pages.
1129 1100 */
1130 1101 static void
1131 1102 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1132 1103 {
1133 1104 struct segvn_data *svd;
1134 1105 struct shm_data *shmd;
1135 1106 struct spt_data *sptd;
1136 1107 vmu_object_t *shared_object = NULL;
1137 1108 vmu_object_t *entity_object = NULL;
1138 1109 vmu_entity_t *entity;
1139 1110 vmusage_t *result;
1140 1111 vmu_bound_t *first = NULL;
1141 1112 vmu_bound_t *last = NULL;
1142 1113 vmu_bound_t *cur = NULL;
1143 1114 vmu_bound_t *e_first = NULL;
1144 1115 vmu_bound_t *e_last = NULL;
1145 1116 vmu_bound_t *tmp;
1146 1117 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1147 1118 struct anon_map *private_amp = NULL;
1148 1119 boolean_t incore = B_FALSE;
1149 1120 boolean_t shared = B_FALSE;
1150 1121 int file = 0;
1151 1122 pgcnt_t swresv = 0;
1152 1123 pgcnt_t panon = 0;
1153 1124
1154 1125 /* Can zero-length segments exist? Not sure, so paranoia. */
1155 1126 if (seg->s_size <= 0)
1156 1127 return;
1157 1128
1158 1129 /*
1159 1130 * Figure out if there is a shared object (such as a named vnode or
1160 1131 * a shared amp), then figure out if there is a private amp, which
1161 1132 * identifies private pages.
1162 1133 */
1163 1134 if (seg->s_ops == &segvn_ops) {
1164 1135 svd = (struct segvn_data *)seg->s_data;
1165 1136 if (svd->type == MAP_SHARED) {
1166 1137 shared = B_TRUE;
1167 1138 } else {
1168 1139 swresv = svd->swresv;
1169 1140
1170 1141 if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1171 1142 RW_READER) != 0) {
1172 1143 /*
1173 1144 * Text replication anon maps can be shared
1174 1145 * across all zones. Space used for text
1175 1146 * replication is typically capped as a small %
1176 1147 * of memory. To keep it simple for now we
1177 1148 * don't account for swap and memory space used
1178 1149 * for text replication.
1179 1150 */
1180 1151 if (svd->tr_state == SEGVN_TR_OFF &&
1181 1152 svd->amp != NULL) {
1182 1153 private_amp = svd->amp;
1183 1154 p_start = svd->anon_index;
1184 1155 p_end = svd->anon_index +
1185 1156 btop(seg->s_size) - 1;
1186 1157 }
1187 1158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1188 1159 }
1189 1160 }
1190 1161 if (svd->vp != NULL) {
1191 1162 file = 1;
1192 1163 shared_object = vmu_find_insert_object(
1193 1164 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1194 1165 VMUSAGE_TYPE_VNODE);
1195 1166 s_start = btop(svd->offset);
1196 1167 s_end = btop(svd->offset + seg->s_size) - 1;
1197 1168 }
1198 1169 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1199 1170 ASSERT(shared_object == NULL);
1200 1171 shared_object = vmu_find_insert_object(
1201 1172 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1202 1173 VMUSAGE_TYPE_AMP);
1203 1174 s_start = svd->anon_index;
1204 1175 s_end = svd->anon_index + btop(seg->s_size) - 1;
1205 1176 /* schedctl mappings are always in core */
1206 1177 if (svd->amp->swresv == 0)
1207 1178 incore = B_TRUE;
1208 1179 }
1209 1180 } else if (seg->s_ops == &segspt_shmops) {
1210 1181 shared = B_TRUE;
1211 1182 shmd = (struct shm_data *)seg->s_data;
1212 1183 shared_object = vmu_find_insert_object(
1213 1184 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1214 1185 VMUSAGE_TYPE_AMP);
1215 1186 s_start = 0;
1216 1187 s_end = btop(seg->s_size) - 1;
1217 1188 sptd = shmd->shm_sptseg->s_data;
1218 1189
1219 1190 /* ism segments are always incore and do not reserve swap */
1220 1191 if (sptd->spt_flags & SHM_SHARE_MMU)
1221 1192 incore = B_TRUE;
1222 1193
1223 1194 } else {
1224 1195 return;
1225 1196 }
1226 1197
1227 1198 /*
1228 1199 * If there is a private amp, count anon pages that exist. If an
1229 1200 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1230 1201 * hash so that it is not double counted.
1231 1202 *
1232 1203 * If there is also a shared object, then figure out the bounds
1233 1204 * which are not mapped by the private amp.
1234 1205 */
1235 1206 if (private_amp != NULL) {
1236 1207
1237 1208 /* Enter as writer to prevent COW anons from being freed */
1238 1209 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1239 1210
1240 1211 p_index = p_start;
1241 1212 s_index = s_start;
1242 1213
1243 1214 while (p_index <= p_end) {
1244 1215
1245 1216 pgcnt_t p_index_next;
1246 1217 pgcnt_t p_bound_size;
1247 1218 int cnt;
1248 1219 anoff_t off;
1249 1220 struct vnode *vn;
1250 1221 struct anon *ap;
1251 1222 page_t *page; /* For handling of large */
1252 1223 pgcnt_t pgcnt = 1; /* pages */
1253 1224 pgcnt_t pgstart;
1254 1225 pgcnt_t pgend;
1255 1226 uint_t pgshft;
1256 1227 pgcnt_t pgmsk;
1257 1228
1258 1229 p_index_next = p_index;
1259 1230 ap = anon_get_next_ptr(private_amp->ahp,
1260 1231 &p_index_next);
1261 1232
1262 1233 /*
1263 1234 * If next anon is past end of mapping, simulate
1264 1235 * end of anon so loop terminates.
1265 1236 */
1266 1237 if (p_index_next > p_end) {
1267 1238 p_index_next = p_end + 1;
1268 1239 ap = NULL;
1269 1240 }
1270 1241 /*
1271 1242 * For COW segments, keep track of bounds not
1272 1243 * backed by private amp so they can be looked
1273 1244 * up in the backing vnode
1274 1245 */
1275 1246 if (p_index_next != p_index) {
1276 1247
1277 1248 /*
1278 1249 * Compute index difference between anon and
1279 1250 * previous anon.
1280 1251 */
1281 1252 p_bound_size = p_index_next - p_index - 1;
1282 1253
1283 1254 if (shared_object != NULL) {
1284 1255 cur = vmu_alloc_bound();
1285 1256 cur->vmb_start = s_index;
1286 1257 cur->vmb_end = s_index + p_bound_size;
1287 1258 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1288 1259 if (first == NULL) {
1289 1260 first = cur;
1290 1261 last = cur;
1291 1262 } else {
1292 1263 last->vmb_next = cur;
1293 1264 last = cur;
1294 1265 }
1295 1266 }
1296 1267 p_index = p_index + p_bound_size + 1;
1297 1268 s_index = s_index + p_bound_size + 1;
1298 1269 }
1299 1270
1300 1271 /* Detect end of anons in amp */
1301 1272 if (ap == NULL)
1302 1273 break;
1303 1274
1304 1275 cnt = ap->an_refcnt;
1305 1276 swap_xlate(ap, &vn, &off);
1306 1277
1307 1278 if (vn == NULL || vn->v_pages == NULL ||
1308 1279 (page = page_exists(vn, off)) == NULL) {
1309 1280 p_index++;
1310 1281 s_index++;
1311 1282 continue;
1312 1283 }
1313 1284
1314 1285 /*
1315 1286 * If large page is found, compute portion of large
1316 1287 * page in mapping, and increment indices to the next
1317 1288 * large page.
1318 1289 */
1319 1290 if (page->p_szc > 0) {
1320 1291
1321 1292 pgcnt = page_get_pagecnt(page->p_szc);
1322 1293 pgshft = page_get_shift(page->p_szc);
1323 1294 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1324 1295
1325 1296 /* First page in large page */
1326 1297 pgstart = p_index & ~pgmsk;
1327 1298 /* Last page in large page */
1328 1299 pgend = pgstart + pgcnt - 1;
1329 1300 /*
1330 1301 * Artificially end page if page extends past
1331 1302 * end of mapping.
1332 1303 */
1333 1304 if (pgend > p_end)
1334 1305 pgend = p_end;
1335 1306
1336 1307 /*
1337 1308 * Compute number of pages from large page
1338 1309 * which are mapped.
1339 1310 */
1340 1311 pgcnt = pgend - p_index + 1;
1341 1312
1342 1313 /*
1343 1314 * Point indices at page after large page,
1344 1315 * or at page after end of mapping.
1345 1316 */
1346 1317 p_index += pgcnt;
1347 1318 s_index += pgcnt;
1348 1319 } else {
1349 1320 p_index++;
1350 1321 s_index++;
1351 1322 }
1352 1323
1353 1324 /*
1354 1325 * Pages on the free list aren't counted for the rss.
1355 1326 */
1356 1327 if (PP_ISFREE(page))
1357 1328 continue;
1358 1329
1359 1330 /*
1360 1331 * Assume anon structs with a refcnt
1361 1332 * of 1 are not COW shared, so there
1362 1333 * is no reason to track them per entity.
1363 1334 */
1364 1335 if (cnt == 1) {
1365 1336 panon += pgcnt;
1366 1337 continue;
1367 1338 }
1368 1339 for (entity = vmu_entities; entity != NULL;
1369 1340 entity = entity->vme_next_calc) {
1370 1341
1371 1342 result = &entity->vme_result;
1372 1343 /*
1373 1344 * Track COW anons per entity so
1374 1345 * they are not double counted.
1375 1346 */
1376 - if (vmu_find_insert_anon(entity, ap) == 0)
1347 + if (vmu_find_insert_anon(entity->vme_anon_hash,
1348 + (caddr_t)ap) == 0)
1377 1349 continue;
1378 1350
1379 1351 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1380 1352 result->vmu_rss_private +=
1381 1353 (pgcnt << PAGESHIFT);
1382 1354 }
1383 1355 }
1384 1356 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1385 1357 }
1386 1358
1387 1359 /* Add up resident anon and swap reserved for private mappings */
1388 1360 if (swresv > 0 || panon > 0) {
1389 1361 for (entity = vmu_entities; entity != NULL;
1390 1362 entity = entity->vme_next_calc) {
1391 1363 result = &entity->vme_result;
1392 1364 result->vmu_swap_all += swresv;
1393 1365 result->vmu_swap_private += swresv;
1394 1366 result->vmu_rss_all += (panon << PAGESHIFT);
1395 1367 result->vmu_rss_private += (panon << PAGESHIFT);
1396 1368 }
1397 1369 }
1398 1370
1399 1371 /* Compute resident pages backing shared amp or named vnode */
1400 1372 if (shared_object != NULL) {
1401 1373 avl_tree_t *tree = &(shared_object->vmo_bounds);
1402 1374
1403 1375 if (first == NULL) {
1404 1376 /*
1405 1377 * No private amp, or private amp has no anon
1406 1378 * structs. This means entire segment is backed by
1407 1379 * the shared object.
1408 1380 */
1409 1381 first = vmu_alloc_bound();
1410 1382 first->vmb_start = s_start;
1411 1383 first->vmb_end = s_end;
1412 1384 first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1413 1385 }
1414 1386 /*
1415 1387 * Iterate bounds not backed by private amp, and compute
1416 1388 * resident pages.
1417 1389 */
1418 1390 cur = first;
1419 1391 while (cur != NULL) {
1420 1392
1421 1393 if (vmu_insert_lookup_object_bounds(shared_object,
1422 1394 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1423 1395 &first, &last) > 0) {
1424 1396 /* new bounds, find incore/not-incore */
1425 1397 if (shared_object->vmo_type ==
1426 1398 VMUSAGE_TYPE_VNODE) {
1427 1399 vmu_vnode_update_incore_bounds(
1428 1400 tree,
1429 1401 (vnode_t *)
1430 1402 shared_object->vmo_key, &first,
1431 1403 &last);
1432 1404 } else {
1433 1405 vmu_amp_update_incore_bounds(
1434 1406 tree,
1435 1407 (struct anon_map *)
1436 1408 shared_object->vmo_key, &first,
1437 1409 &last, incore);
1438 1410 }
1439 1411 vmu_merge_bounds(tree, &first, &last);
1440 1412 }
1441 1413 for (entity = vmu_entities; entity != NULL;
1442 1414 entity = entity->vme_next_calc) {
1443 1415 avl_tree_t *e_tree;
1444 1416
1445 1417 result = &entity->vme_result;
1446 1418
1447 1419 entity_object = vmu_find_insert_object(
1448 1420 shared_object->vmo_type ==
1449 1421 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1450 1422 entity->vme_amp_hash,
1451 1423 shared_object->vmo_key,
1452 1424 shared_object->vmo_type);
1453 1425
1454 1426 virt = vmu_insert_lookup_object_bounds(
1455 1427 entity_object, cur->vmb_start, cur->vmb_end,
1456 1428 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1457 1429
1458 1430 if (virt == 0)
1459 1431 continue;
1460 1432 /*
1461 1433 * Range visited for this entity
1462 1434 */
1463 1435 e_tree = &(entity_object->vmo_bounds);
1464 1436 rss = vmu_update_bounds(e_tree, &e_first,
1465 1437 &e_last, tree, first, last);
1466 1438 result->vmu_rss_all += (rss << PAGESHIFT);
1467 1439 if (shared == B_TRUE && file == B_FALSE) {
1468 1440 /* shared anon mapping */
1469 1441 result->vmu_swap_all +=
1470 1442 (virt << PAGESHIFT);
1471 1443 result->vmu_swap_shared +=
1472 1444 (virt << PAGESHIFT);
1473 1445 result->vmu_rss_shared +=
1474 1446 (rss << PAGESHIFT);
1475 1447 } else if (shared == B_TRUE && file == B_TRUE) {
1476 1448 /* shared file mapping */
1477 1449 result->vmu_rss_shared +=
1478 1450 (rss << PAGESHIFT);
1479 1451 } else if (shared == B_FALSE &&
1480 1452 file == B_TRUE) {
1481 1453 /* private file mapping */
1482 1454 result->vmu_rss_private +=
1483 1455 (rss << PAGESHIFT);
1484 1456 }
1485 1457 vmu_merge_bounds(e_tree, &e_first, &e_last);
1486 1458 }
1487 1459 tmp = cur;
1488 1460 cur = cur->vmb_next;
1489 1461 vmu_free_bound(tmp);
1490 1462 }
1491 1463 }
1492 1464 }
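
To make the shared-object accounting above concrete, consider a made-up example
(illustrative only, not part of the patch): two processes in the same project both map
the same resident 8 KB of a named vnode. For the first process the project entity has
no bounds for that vnode yet, so vmu_insert_lookup_object_bounds() reports new pages,
the incore bounds are resolved by vmu_update_bounds(), and 8 KB is added to the
project's vmu_rss_all and vmu_rss_shared. For the second process the same range is
already covered by the entity's bounds, so virt is 0 and the entity is skipped; the
project is charged only once. A process in a different project uses its own entity
object and is charged separately, exactly as the header comment describes.
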
1493 1465
1494 1466 /*
1495 1467 * Based on the current calculation flags, find the entities that are
1496 1468 * relevant to the process. Then calculate each segment
1497 1469 * in the process's address space for each relevant entity.
1498 1470 */
1499 1471 static void
1500 1472 vmu_calculate_proc(proc_t *p)
1501 1473 {
1502 1474 vmu_entity_t *entities = NULL;
1503 1475 vmu_zone_t *zone;
1504 1476 vmu_entity_t *tmp;
1505 1477 struct as *as;
1506 1478 struct seg *seg;
1507 1479 int ret;
1508 1480
1509 1481 /* Figure out which entities are being computed */
1510 1482 if ((vmu_data.vmu_system) != NULL) {
1511 1483 tmp = vmu_data.vmu_system;
1512 1484 tmp->vme_next_calc = entities;
1513 1485 entities = tmp;
1514 1486 }
1515 1487 if (vmu_data.vmu_calc_flags &
1516 1488 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1517 1489 VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1518 1490 VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1519 1491 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1520 1492 VMUSAGE_ALL_EUSERS)) {
1521 1493 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1522 1494 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1523 1495 (mod_hash_val_t *)&zone);
1524 1496 if (ret != 0) {
1525 1497 zone = vmu_alloc_zone(p->p_zone->zone_id);
1526 1498 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1527 1499 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1528 1500 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1529 1501 ASSERT(ret == 0);
1530 1502 }
1531 1503 if (zone->vmz_zone != NULL) {
1532 1504 tmp = zone->vmz_zone;
1533 1505 tmp->vme_next_calc = entities;
1534 1506 entities = tmp;
1535 1507 }
1536 1508 if (vmu_data.vmu_calc_flags &
1537 1509 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1538 1510 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1539 1511 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1540 1512 zone->vmz_id);
1541 1513 tmp->vme_next_calc = entities;
1542 1514 entities = tmp;
1543 1515 }
1544 1516 if (vmu_data.vmu_calc_flags &
1545 1517 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1546 1518 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1547 1519 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1548 1520 tmp->vme_next_calc = entities;
1549 1521 entities = tmp;
1550 1522 }
1551 1523 if (vmu_data.vmu_calc_flags &
1552 1524 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1553 1525 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1554 1526 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1555 1527 tmp->vme_next_calc = entities;
1556 1528 entities = tmp;
1557 1529 }
1558 1530 if (vmu_data.vmu_calc_flags &
1559 1531 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1560 1532 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1561 1533 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1562 1534 tmp->vme_next_calc = entities;
1563 1535 entities = tmp;
1564 1536 }
1565 1537 }
1566 1538 /* Entities which collapse projects and users for all zones */
1567 1539 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1568 1540 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1569 1541 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1570 1542 tmp->vme_next_calc = entities;
1571 1543 entities = tmp;
1572 1544 }
1573 1545 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1574 1546 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1575 1547 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1576 1548 tmp->vme_next_calc = entities;
1577 1549 entities = tmp;
1578 1550 }
1579 1551 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1580 1552 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1581 1553 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1582 1554 tmp->vme_next_calc = entities;
1583 1555 entities = tmp;
1584 1556 }
1585 1557
1586 1558 ASSERT(entities != NULL);
1587 1559 /* process all segs in process's address space */
1588 1560 as = p->p_as;
1589 1561 AS_LOCK_ENTER(as, RW_READER);
1590 1562 for (seg = AS_SEGFIRST(as); seg != NULL;
1591 1563 seg = AS_SEGNEXT(as, seg)) {
1592 1564 vmu_calculate_seg(entities, seg);
1593 1565 }
1594 1566 AS_LOCK_EXIT(as);
1595 1567 }
1596 1568
1597 1569 /*
1598 1570 * Free data created by previous call to vmu_calculate().
1599 1571 */
1600 1572 static void
1601 1573 vmu_clear_calc()
1602 1574 {
1603 1575 if (vmu_data.vmu_system != NULL)
1604 1576 vmu_free_entity(vmu_data.vmu_system);
1605 1577 vmu_data.vmu_system = NULL;
1606 1578 if (vmu_data.vmu_zones_hash != NULL)
1607 1579 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1608 1580 if (vmu_data.vmu_projects_col_hash != NULL)
1609 1581 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1610 1582 if (vmu_data.vmu_rusers_col_hash != NULL)
1611 1583 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1612 1584 if (vmu_data.vmu_eusers_col_hash != NULL)
1613 1585 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1614 1586
1615 1587 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1616 1588 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1617 1589 }
1618 1590
1619 1591 /*
1620 1592 * Free unused data structures. These can result if the system workload
1621 1593 * decreases between calculations.
1622 1594 */
1623 1595 static void
1624 1596 vmu_free_extra()
1625 1597 {
1626 1598 vmu_bound_t *tb;
1627 1599 vmu_object_t *to;
1628 1600 vmu_entity_t *te;
1629 1601 vmu_zone_t *tz;
1630 1602
1631 1603 while (vmu_data.vmu_free_bounds != NULL) {
1632 1604 tb = vmu_data.vmu_free_bounds;
1633 1605 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1634 1606 kmem_cache_free(vmu_bound_cache, tb);
1635 1607 }
1636 1608 while (vmu_data.vmu_free_objects != NULL) {
1637 1609 to = vmu_data.vmu_free_objects;
1638 1610 vmu_data.vmu_free_objects =
1639 1611 vmu_data.vmu_free_objects->vmo_next;
1640 1612 kmem_cache_free(vmu_object_cache, to);
1641 1613 }
1642 1614 while (vmu_data.vmu_free_entities != NULL) {
1643 1615 te = vmu_data.vmu_free_entities;
1644 1616 vmu_data.vmu_free_entities =
1645 1617 vmu_data.vmu_free_entities->vme_next;
1646 1618 if (te->vme_vnode_hash != NULL)
1647 1619 mod_hash_destroy_hash(te->vme_vnode_hash);
1648 1620 if (te->vme_amp_hash != NULL)
1649 1621 mod_hash_destroy_hash(te->vme_amp_hash);
1650 - VERIFY(avl_first(&te->vme_anon) == NULL);
1622 + if (te->vme_anon_hash != NULL)
1623 + mod_hash_destroy_hash(te->vme_anon_hash);
1651 1624 kmem_free(te, sizeof (vmu_entity_t));
1652 1625 }
1653 1626 while (vmu_data.vmu_free_zones != NULL) {
1654 1627 tz = vmu_data.vmu_free_zones;
1655 1628 vmu_data.vmu_free_zones =
1656 1629 vmu_data.vmu_free_zones->vmz_next;
1657 1630 if (tz->vmz_projects_hash != NULL)
1658 1631 mod_hash_destroy_hash(tz->vmz_projects_hash);
1659 1632 if (tz->vmz_tasks_hash != NULL)
1660 1633 mod_hash_destroy_hash(tz->vmz_tasks_hash);
1661 1634 if (tz->vmz_rusers_hash != NULL)
1662 1635 mod_hash_destroy_hash(tz->vmz_rusers_hash);
1663 1636 if (tz->vmz_eusers_hash != NULL)
1664 1637 mod_hash_destroy_hash(tz->vmz_eusers_hash);
1665 1638 kmem_free(tz, sizeof (vmu_zone_t));
1666 1639 }
1667 1640 }
1668 1641
1669 1642 extern kcondvar_t *pr_pid_cv;
1670 1643
1671 1644 /*
1672 1645 * Determine which entity types are relevant and allocate the hashes to
1673 1646 * track them. Then walk the process table and count rss and swap
1674 1647  * for each process's address space.  Address-space objects such as
1675 1648  * vnodes, amps and anons are tracked per entity, so that they are
1676 1649  * not double-counted in the results.
1677 1650 *
1678 1651 */
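/*
 * Editor's illustrative sketch (not part of this change) of the
 * per-entity deduplication described above.  The helpers seen_lookup(),
 * seen_insert() and obj_incore_bytes() are hypothetical stand-ins, not
 * the vmu_* machinery in this file; the point is only that each entity
 * charges a given object (vnode or amp) range at most once, so a page
 * shared by two processes in the same project is counted once for that
 * project:
 *
 *	for (e = entities; e != NULL; e = e->vme_next_calc) {
 *		if (seen_lookup(e, object, off) != NULL)
 *			continue;
 *		seen_insert(e, object, off);
 *		e->vme_result.vmu_rss_all += obj_incore_bytes(object, off);
 *	}
 *
 * An entity that has already seen this range of the object skips it;
 * otherwise the range is recorded and its resident bytes are added to
 * that entity's total.
 */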
1679 1652 static void
1680 1653 vmu_calculate()
1681 1654 {
1682 1655 int i = 0;
1683 1656 int ret;
1684 1657 proc_t *p;
1685 1658
1686 1659 vmu_clear_calc();
1687 1660
1688 1661 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1689 1662 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1690 1663 ALL_ZONES);
1691 1664
1692 1665 /*
1693 1666 * Walk process table and calculate rss of each proc.
1694 1667 *
1695 1668 * Pidlock and p_lock cannot be held while doing the rss calculation.
1696 1669 * This is because:
1697 1670 * 1. The calculation allocates using KM_SLEEP.
1698 1671 * 2. The calculation grabs a_lock, which cannot be grabbed
1699 1672 * after p_lock.
1700 1673 *
1701 1674 * Since pidlock must be dropped, we cannot simply just walk the
1702 1675 * practive list. Instead, we walk the process table, and sprlock
1703 1676 * each process to ensure that it does not exit during the
1704 1677 * calculation.
1705 1678 */
1706 1679
1707 1680 mutex_enter(&pidlock);
1708 1681 for (i = 0; i < v.v_proc; i++) {
1709 1682 again:
1710 1683 p = pid_entry(i);
1711 1684 if (p == NULL)
1712 1685 continue;
1713 1686
1714 1687 mutex_enter(&p->p_lock);
1715 1688 mutex_exit(&pidlock);
1716 1689
1717 1690 if (panicstr) {
1718 1691 mutex_exit(&p->p_lock);
1719 1692 return;
1720 1693 }
1721 1694
1722 1695 /* Try to set P_PR_LOCK */
1723 1696 ret = sprtrylock_proc(p);
1724 1697 if (ret == -1) {
1725 1698 /* Process in invalid state */
1726 1699 mutex_exit(&p->p_lock);
1727 1700 mutex_enter(&pidlock);
1728 1701 continue;
1729 1702 } else if (ret == 1) {
1730 1703 /*
1731 1704 * P_PR_LOCK is already set. Wait and try again.
1732 1705 * This also drops p_lock.
1733 1706 */
1734 1707 sprwaitlock_proc(p);
1735 1708 mutex_enter(&pidlock);
1736 1709 goto again;
1737 1710 }
1738 1711 mutex_exit(&p->p_lock);
1739 1712
1740 1713 vmu_calculate_proc(p);
1741 1714
1742 1715 mutex_enter(&p->p_lock);
1743 1716 sprunlock(p);
1744 1717 mutex_enter(&pidlock);
1745 1718 }
1746 1719 mutex_exit(&pidlock);
1747 1720
1748 1721 vmu_free_extra();
1749 1722 }
1750 1723
1751 1724 /*
1752 1725 * allocate a new cache for N results satisfying flags
1753 1726 */
1754 1727 vmu_cache_t *
1755 1728 vmu_cache_alloc(size_t nres, uint_t flags)
1756 1729 {
1757 1730 vmu_cache_t *cache;
1758 1731
1759 1732 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1760 1733 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1761 1734 cache->vmc_nresults = nres;
1762 1735 cache->vmc_flags = flags;
1763 1736 cache->vmc_refcnt = 1;
1764 1737 return (cache);
1765 1738 }
1766 1739
1767 1740 /*
1768 1741 * Make sure cached results are not freed
1769 1742 */
1770 1743 static void
1771 1744 vmu_cache_hold(vmu_cache_t *cache)
1772 1745 {
1773 1746 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1774 1747 cache->vmc_refcnt++;
1775 1748 }
1776 1749
1777 1750 /*
1778 1751 * free cache data
1779 1752 */
1780 1753 static void
1781 1754 vmu_cache_rele(vmu_cache_t *cache)
1782 1755 {
1783 1756 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1784 1757 ASSERT(cache->vmc_refcnt > 0);
1785 1758 cache->vmc_refcnt--;
1786 1759 if (cache->vmc_refcnt == 0) {
1787 1760 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1788 1761 cache->vmc_nresults);
1789 1762 kmem_free(cache, sizeof (vmu_cache_t));
1790 1763 }
1791 1764 }
1792 1765
1793 1766 /*
1794 1767 * When new data is calculated, update the phys_mem rctl usage value in the
1795 1768 * zones.
1796 1769 */
1797 1770 static void
1798 1771 vmu_update_zone_rctls(vmu_cache_t *cache)
1799 1772 {
1800 1773 vmusage_t *rp;
1801 1774 size_t i = 0;
1802 1775 zone_t *zp;
1803 1776
1804 1777 for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1805 1778 if (rp->vmu_type == VMUSAGE_ZONE &&
1806 1779 rp->vmu_zoneid != ALL_ZONES) {
1807 1780 if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1808 - zp->zone_phys_mem = rp->vmu_rss_all;
1809 - zone_rele(zp);
1781 + zp->zone_phys_mem = rp->vmu_rss_all;
1782 + zone_rele(zp);
1810 1783 }
1811 1784 }
1812 1785 }
1813 1786 }
1814 1787
1815 1788 /*
1816 1789  * Copy out the cached results to a caller.  Inspect the caller's flags
1817 1790 * and zone to determine which cached results should be copied.
1818 1791 */
1819 1792 static int
1820 1793 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1821 1794 uint_t flags, id_t req_zone_id, int cpflg)
1822 1795 {
1823 1796 vmusage_t *result, *out_result;
1824 1797 vmusage_t dummy;
1825 1798 size_t i, count = 0;
1826 1799 size_t bufsize;
1827 1800 int ret = 0;
1828 1801 uint_t types = 0;
1829 1802
1830 1803 if (nres != NULL) {
1831 1804 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1832 1805 return (set_errno(EFAULT));
1833 1806 } else {
1834 1807 bufsize = 0;
1835 1808 }
1836 1809
1837 1810 /* figure out what results the caller is interested in. */
1838 1811 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1839 1812 types |= VMUSAGE_SYSTEM;
1840 1813 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1841 1814 types |= VMUSAGE_ZONE;
1842 1815 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1843 1816 VMUSAGE_COL_PROJECTS))
1844 1817 types |= VMUSAGE_PROJECTS;
1845 1818 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1846 1819 types |= VMUSAGE_TASKS;
1847 1820 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1848 1821 types |= VMUSAGE_RUSERS;
1849 1822 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1850 1823 types |= VMUSAGE_EUSERS;
1851 1824
1852 1825 /* count results for current zone */
1853 1826 out_result = buf;
1854 1827 for (result = cache->vmc_results, i = 0;
1855 1828 i < cache->vmc_nresults; result++, i++) {
1856 1829
1857 1830 /* Do not return "other-zone" results to non-global zones */
1858 1831 if (curproc->p_zone != global_zone &&
1859 1832 curproc->p_zone->zone_id != result->vmu_zoneid)
1860 1833 continue;
1861 1834
1862 1835 /*
1863 1836 * If non-global zone requests VMUSAGE_SYSTEM, fake
1864 1837 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1865 1838 */
1866 1839 if (curproc->p_zone != global_zone &&
1867 1840 (flags & VMUSAGE_SYSTEM) != 0 &&
1868 1841 result->vmu_type == VMUSAGE_ZONE) {
1869 1842 count++;
1870 1843 if (out_result != NULL) {
1871 1844 if (bufsize < count) {
1872 1845 ret = set_errno(EOVERFLOW);
1873 1846 } else {
1874 1847 dummy = *result;
1875 1848 dummy.vmu_zoneid = ALL_ZONES;
1876 1849 dummy.vmu_id = 0;
1877 1850 dummy.vmu_type = VMUSAGE_SYSTEM;
1878 1851 if (ddi_copyout(&dummy, out_result,
1879 1852 sizeof (vmusage_t), cpflg))
1880 1853 return (set_errno(EFAULT));
1881 1854 out_result++;
1882 1855 }
1883 1856 }
1884 1857 }
1885 1858
1886 1859 /* Skip results that do not match requested type */
1887 1860 if ((result->vmu_type & types) == 0)
1888 1861 continue;
1889 1862
1890 1863 /* Skip collated results if not requested */
1891 1864 if (result->vmu_zoneid == ALL_ZONES) {
1892 1865 if (result->vmu_type == VMUSAGE_PROJECTS &&
1893 1866 (flags & VMUSAGE_COL_PROJECTS) == 0)
1894 1867 continue;
1895 1868 if (result->vmu_type == VMUSAGE_EUSERS &&
1896 1869 (flags & VMUSAGE_COL_EUSERS) == 0)
1897 1870 continue;
1898 1871 if (result->vmu_type == VMUSAGE_RUSERS &&
1899 1872 (flags & VMUSAGE_COL_RUSERS) == 0)
1900 1873 continue;
1901 1874 }
1902 1875
1903 1876 if (result->vmu_type == VMUSAGE_ZONE &&
1904 1877 flags & VMUSAGE_A_ZONE) {
1905 1878 /* Skip non-requested zone results */
1906 1879 if (result->vmu_zoneid != req_zone_id)
1907 1880 continue;
1908 1881 } else {
1909 1882 /* Skip "other zone" results if not requested */
1910 1883 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1911 1884 if (result->vmu_type == VMUSAGE_ZONE &&
1912 1885 (flags & VMUSAGE_ALL_ZONES) == 0)
1913 1886 continue;
1914 1887 if (result->vmu_type == VMUSAGE_PROJECTS &&
1915 1888 (flags & (VMUSAGE_ALL_PROJECTS |
1916 1889 VMUSAGE_COL_PROJECTS)) == 0)
1917 1890 continue;
1918 1891 if (result->vmu_type == VMUSAGE_TASKS &&
1919 1892 (flags & VMUSAGE_ALL_TASKS) == 0)
1920 1893 continue;
1921 1894 if (result->vmu_type == VMUSAGE_RUSERS &&
1922 1895 (flags & (VMUSAGE_ALL_RUSERS |
1923 1896 VMUSAGE_COL_RUSERS)) == 0)
1924 1897 continue;
1925 1898 if (result->vmu_type == VMUSAGE_EUSERS &&
1926 1899 (flags & (VMUSAGE_ALL_EUSERS |
1927 1900 VMUSAGE_COL_EUSERS)) == 0)
1928 1901 continue;
1929 1902 }
1930 1903 }
1931 1904 count++;
1932 1905 if (out_result != NULL) {
1933 1906 if (bufsize < count) {
1934 1907 ret = set_errno(EOVERFLOW);
1935 1908 } else {
1936 1909 if (ddi_copyout(result, out_result,
1937 1910 sizeof (vmusage_t), cpflg))
1938 1911 return (set_errno(EFAULT));
1939 1912 out_result++;
1940 1913 }
1941 1914 }
1942 1915 }
1943 1916 if (nres != NULL)
1944 1917 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1945 1918 return (set_errno(EFAULT));
1946 1919
1947 1920 return (ret);
1948 1921 }
1949 1922
1950 1923 /*
1951 1924 * vm_getusage()
1952 1925 *
1953 1926 * Counts rss and swap by zone, project, task, and/or user. The flags argument
1954 1927  * determines the type of result structures returned.  Flags requesting
1955 1928 * results from more than one zone are "flattened" to the local zone if the
1956 1929 * caller is not the global zone.
1957 1930 *
1958 1931 * args:
1959 1932 * flags: bitmap consisting of one or more of VMUSAGE_*.
1960 1933 * age: maximum allowable age (time since counting was done) in
1961 1934 * seconds of the results. Results from previous callers are
1962 1935  * cached in the kernel.
1963 1936  * buf: pointer to buffer array of vmusage_t.  If NULL, then only nres
1964 1937  * is set on success.
1965 1938 * nres: Set to number of vmusage_t structures pointed to by buf
1966 1939 * before calling vm_getusage().
1967 1940  * On return 0 (success) or EOVERFLOW, it is set to the number of
1968 1941  * result structures returned or attempted to return.
1969 1942 *
1970 1943 * returns 0 on success, -1 on failure:
1971 1944 * EINTR (interrupted)
1972 1945  * EOVERFLOW (nres too small for results, nres set to needed value for success)
1973 1946 * EINVAL (flags invalid)
1974 1947 * EFAULT (bad address for buf or nres)
1975 1948 */
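/*
 * Editor's illustrative sketch (not part of this change): a minimal
 * userland consumer of this interface through the getvmusage(2)
 * wrapper declared in <sys/vm_usage.h>.  The two-call pattern (a NULL
 * buf sizes the buffer, a second call fetches the results), the
 * VMUSAGE_ZONE flag and the 10-second age are example choices only.
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <err.h>
 *
 *	size_t nres = 0, i;
 *	vmusage_t *buf;
 *
 *	if (getvmusage(VMUSAGE_ZONE, 10, NULL, &nres) != 0)
 *		err(1, "getvmusage (sizing)");
 *	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
 *		err(1, "calloc");
 *	if (getvmusage(VMUSAGE_ZONE, 10, buf, &nres) != 0)
 *		err(1, "getvmusage");
 *	for (i = 0; i < nres; i++)
 *		(void) printf("zone %d: rss %llu bytes\n",
 *		    (int)buf[i].vmu_zoneid,
 *		    (unsigned long long)buf[i].vmu_rss_all);
 *
 * The second call can fail with EOVERFLOW if usage grew between the
 * two calls; a robust consumer would retry with the updated nres.
 */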
1976 1949 int
1977 1950 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1978 1951 {
1979 1952 vmu_entity_t *entity;
1980 1953 vmusage_t *result;
1981 1954 int ret = 0;
1982 1955 int cacherecent = 0;
1983 1956 hrtime_t now;
1984 1957 uint_t flags_orig;
1985 1958 id_t req_zone_id;
1986 1959
1987 1960 /*
1988 1961  * Non-global zones cannot request system-wide or collated
1989 1962  * results, the system result, or the usage of another zone, so munge
1990 1963 * the flags accordingly.
1991 1964 */
1992 1965 flags_orig = flags;
1993 1966 if (curproc->p_zone != global_zone) {
1994 1967 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1995 1968 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1996 1969 flags |= VMUSAGE_PROJECTS;
1997 1970 }
1998 1971 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1999 1972 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
2000 1973 flags |= VMUSAGE_RUSERS;
2001 1974 }
2002 1975 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
2003 1976 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
2004 1977 flags |= VMUSAGE_EUSERS;
2005 1978 }
2006 1979 if (flags & VMUSAGE_SYSTEM) {
2007 1980 flags &= ~VMUSAGE_SYSTEM;
2008 1981 flags |= VMUSAGE_ZONE;
2009 1982 }
2010 1983 if (flags & VMUSAGE_A_ZONE) {
2011 1984 flags &= ~VMUSAGE_A_ZONE;
2012 1985 flags |= VMUSAGE_ZONE;
2013 1986 }
2014 1987 }
2015 1988
2016 1989 /* Check for unknown flags */
2017 1990 if ((flags & (~VMUSAGE_MASK)) != 0)
2018 1991 return (set_errno(EINVAL));
2019 1992
2020 1993 /* Check for no flags */
2021 1994 if ((flags & VMUSAGE_MASK) == 0)
2022 1995 return (set_errno(EINVAL));
2023 1996
2024 1997 /* If requesting results for a specific zone, get the zone ID */
2025 1998 if (flags & VMUSAGE_A_ZONE) {
2026 1999 size_t bufsize;
2027 2000 vmusage_t zreq;
2028 2001
2029 2002 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2030 2003 return (set_errno(EFAULT));
2031 2004 /* Requested zone ID is passed in buf, so 0 len not allowed */
2032 2005 if (bufsize == 0)
2033 2006 return (set_errno(EINVAL));
2034 2007 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2035 2008 return (set_errno(EFAULT));
2036 2009 req_zone_id = zreq.vmu_id;
2037 2010 }
2038 2011
2039 2012 mutex_enter(&vmu_data.vmu_lock);
2040 2013 now = gethrtime();
2041 2014
2042 2015 start:
2043 2016 if (vmu_data.vmu_cache != NULL) {
2044 2017
2045 2018 vmu_cache_t *cache;
2046 2019
2047 2020 if ((vmu_data.vmu_cache->vmc_timestamp +
2048 2021 ((hrtime_t)age * NANOSEC)) > now)
2049 2022 cacherecent = 1;
2050 2023
2051 2024 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2052 2025 cacherecent == 1) {
2053 2026 cache = vmu_data.vmu_cache;
2054 2027 vmu_cache_hold(cache);
2055 2028 mutex_exit(&vmu_data.vmu_lock);
2056 2029
2057 2030 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2058 2031 req_zone_id, cpflg);
2059 2032 mutex_enter(&vmu_data.vmu_lock);
2060 2033 vmu_cache_rele(cache);
2061 2034 if (vmu_data.vmu_pending_waiters > 0)
2062 2035 cv_broadcast(&vmu_data.vmu_cv);
2063 2036 mutex_exit(&vmu_data.vmu_lock);
2064 2037 return (ret);
2065 2038 }
2066 2039 /*
2067 2040 * If the cache is recent, it is likely that there are other
2068 2041 * consumers of vm_getusage running, so add their flags to the
2069 2042 * desired flags for the calculation.
2070 2043 */
2071 2044 if (cacherecent == 1)
2072 2045 flags = vmu_data.vmu_cache->vmc_flags | flags;
2073 2046 }
2074 2047 if (vmu_data.vmu_calc_thread == NULL) {
2075 2048
2076 2049 vmu_cache_t *cache;
2077 2050
2078 2051 vmu_data.vmu_calc_thread = curthread;
2079 2052 vmu_data.vmu_calc_flags = flags;
2080 2053 vmu_data.vmu_entities = NULL;
2081 2054 vmu_data.vmu_nentities = 0;
2082 2055 if (vmu_data.vmu_pending_waiters > 0)
2083 2056 vmu_data.vmu_calc_flags |=
2084 2057 vmu_data.vmu_pending_flags;
2085 2058
2086 2059 vmu_data.vmu_pending_flags = 0;
2087 2060 mutex_exit(&vmu_data.vmu_lock);
2088 2061 vmu_calculate();
2089 2062 mutex_enter(&vmu_data.vmu_lock);
2090 2063 /* copy results to cache */
2091 2064 if (vmu_data.vmu_cache != NULL)
2092 2065 vmu_cache_rele(vmu_data.vmu_cache);
2093 2066 cache = vmu_data.vmu_cache =
2094 2067 vmu_cache_alloc(vmu_data.vmu_nentities,
2095 2068 vmu_data.vmu_calc_flags);
2096 2069
2097 2070 result = cache->vmc_results;
2098 2071 for (entity = vmu_data.vmu_entities; entity != NULL;
2099 2072 entity = entity->vme_next) {
2100 2073 *result = entity->vme_result;
2101 2074 result++;
2102 2075 }
2103 2076 cache->vmc_timestamp = gethrtime();
2104 2077 vmu_cache_hold(cache);
2105 2078
2106 2079 vmu_data.vmu_calc_flags = 0;
2107 2080 vmu_data.vmu_calc_thread = NULL;
2108 2081
2109 2082 if (vmu_data.vmu_pending_waiters > 0)
2110 2083 cv_broadcast(&vmu_data.vmu_cv);
2111 2084
2112 2085 mutex_exit(&vmu_data.vmu_lock);
2113 2086
2114 2087 /* update zone's phys. mem. rctl usage */
2115 2088 vmu_update_zone_rctls(cache);
2116 2089 /* copy cache */
2117 2090 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2118 2091 req_zone_id, cpflg);
2119 2092 mutex_enter(&vmu_data.vmu_lock);
2120 2093 vmu_cache_rele(cache);
2121 2094 mutex_exit(&vmu_data.vmu_lock);
2122 2095
2123 2096 return (ret);
2124 2097 }
2125 2098 vmu_data.vmu_pending_flags |= flags;
2126 2099 vmu_data.vmu_pending_waiters++;
2127 2100 while (vmu_data.vmu_calc_thread != NULL) {
2128 2101 if (cv_wait_sig(&vmu_data.vmu_cv,
2129 2102 &vmu_data.vmu_lock) == 0) {
2130 2103 vmu_data.vmu_pending_waiters--;
2131 2104 mutex_exit(&vmu_data.vmu_lock);
2132 2105 return (set_errno(EINTR));
2133 2106 }
2134 2107 }
2135 2108 vmu_data.vmu_pending_waiters--;
2136 2109 goto start;
2137 2110 }
2138 2111
2139 2112 #if defined(__x86)
2140 2113 /*
2141 2114 * Attempt to invalidate all of the pages in the mapping for the given process.
2142 2115 */
2143 2116 static void
2144 2117 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2145 2118 {
2146 2119 page_t *pp;
2147 2120 size_t psize;
2148 2121 u_offset_t off;
2149 2122 caddr_t eaddr;
2150 2123 struct vnode *vp;
2151 2124 struct segvn_data *svd;
2152 2125 struct hat *victim_hat;
2153 2126
2154 2127 ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2155 2128
2156 2129 victim_hat = p->p_as->a_hat;
2157 2130 svd = (struct segvn_data *)seg->s_data;
2158 2131 vp = svd->vp;
2159 2132 psize = page_get_pagesize(seg->s_szc);
2160 2133
2161 2134 off = svd->offset + (uintptr_t)(addr - seg->s_base);
2162 2135
2163 2136 for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2164 2137 pp = page_lookup_nowait(vp, off, SE_SHARED);
2165 2138
2166 2139 if (pp != NULL) {
2167 2140 /* following logic based on pvn_getdirty() */
2168 2141
2169 2142 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2170 2143 page_unlock(pp);
2171 2144 continue;
2172 2145 }
2173 2146
2174 2147 page_io_lock(pp);
2175 2148 hat_page_inval(pp, 0, victim_hat);
2176 2149 page_io_unlock(pp);
2177 2150
2178 2151 /*
2179 2152 * For B_INVALCURONLY-style handling we let
2180 2153 * page_release call VN_DISPOSE if no one else is using
2181 2154 * the page.
2182 2155 *
2183 2156 * A hat_ismod() check would be useless because:
2184 2157  * (1) we are not holding the SE_EXCL lock
2185 2158 * (2) we've not unloaded _all_ translations
2186 2159 *
2187 2160 * Let page_release() do the heavy-lifting.
2188 2161 */
2189 2162 (void) page_release(pp, 1);
2190 2163 }
2191 2164 }
2192 2165 }
2193 2166
2194 2167 /*
2195 2168 * vm_map_inval()
2196 2169 *
2197 2170 * Invalidate as many pages as possible within the given mapping for the given
2198 2171 * process. addr is expected to be the base address of the mapping and size is
2199 2172 * the length of the mapping. In some cases a mapping will encompass an
2200 2173 * entire segment, but at least for anon or stack mappings, these will be
2201 2174 * regions within a single large segment. Thus, the invalidation is oriented
2202 2175 * around a single mapping and not an entire segment.
2203 2176 *
2204 2177 * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
2205 2178 * this code is only applicable to x86.
2206 2179 */
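/*
 * Editor's illustrative sketch (not part of this change): the calling
 * convention implied by the checks below.  "target_pid", "mapbase" and
 * "maplen" are placeholder names; mapbase must be the page-aligned base
 * of a vnode-backed mapping in the target process, and the caller must
 * be root in the global zone.
 *
 *	if (vm_map_inval(target_pid, mapbase, maplen) != 0) {
 *		set_errno() has already been called with EPERM, EINVAL,
 *		ESRCH, ENOMEM, or an error from the segment sync
 *	}
 */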
2207 2180 int
2208 2181 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2209 2182 {
2210 2183 int ret;
2211 2184 int error = 0;
2212 2185 proc_t *p; /* target proc */
2213 2186 struct as *as; /* target proc's address space */
2214 2187 struct seg *seg; /* working segment */
2215 2188
2216 2189 if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2217 2190 return (set_errno(EPERM));
2218 2191
2219 2192 /* If not a valid mapping address, return an error */
2220 2193 if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2221 2194 return (set_errno(EINVAL));
2222 2195
2223 2196 again:
2224 2197 mutex_enter(&pidlock);
2225 2198 p = prfind(pid);
2226 2199 if (p == NULL) {
2227 2200 mutex_exit(&pidlock);
2228 2201 return (set_errno(ESRCH));
2229 2202 }
2230 2203
2231 2204 mutex_enter(&p->p_lock);
2232 2205 mutex_exit(&pidlock);
2233 2206
2234 2207 if (panicstr != NULL) {
2235 2208 mutex_exit(&p->p_lock);
2236 2209 return (0);
2237 2210 }
2238 2211
2239 2212 as = p->p_as;
2240 2213
2241 2214 /*
2242 2215 * Try to set P_PR_LOCK - prevents process "changing shape"
2243 2216 * - blocks fork
2244 2217 * - blocks sigkill
2245 2218 * - cannot be a system proc
2246 2219 * - must be fully created proc
2247 2220 */
2248 2221 ret = sprtrylock_proc(p);
2249 2222 if (ret == -1) {
2250 2223 /* Process in invalid state */
2251 2224 mutex_exit(&p->p_lock);
2252 2225 return (set_errno(ESRCH));
2253 2226 }
2254 2227
2255 2228 if (ret == 1) {
2256 2229 /*
2257 2230 * P_PR_LOCK is already set. Wait and try again. This also
2258 2231 * drops p_lock so p may no longer be valid since the proc may
2259 2232 * have exited.
2260 2233 */
2261 2234 sprwaitlock_proc(p);
2262 2235 goto again;
2263 2236 }
2264 2237
2265 2238 /* P_PR_LOCK is now set */
2266 2239 mutex_exit(&p->p_lock);
2267 2240
2268 2241 AS_LOCK_ENTER(as, RW_READER);
2269 2242 if ((seg = as_segat(as, addr)) == NULL) {
2270 2243 AS_LOCK_EXIT(as);
2271 2244 mutex_enter(&p->p_lock);
2272 2245 sprunlock(p);
2273 2246 return (set_errno(ENOMEM));
2274 2247 }
2275 2248
2276 2249 /*
2277 2250 * The invalidation behavior only makes sense for vnode-backed segments.
2278 2251 */
2279 2252 if (seg->s_ops != &segvn_ops) {
2280 2253 AS_LOCK_EXIT(as);
2281 2254 mutex_enter(&p->p_lock);
2282 2255 sprunlock(p);
2283 2256 return (0);
2284 2257 }
2285 2258
2286 2259 /*
2287 2260  * If the mapping is out of bounds of the segment, return an error.
2288 2261 */
2289 2262 if ((addr + size) > (seg->s_base + seg->s_size)) {
2290 2263 AS_LOCK_EXIT(as);
2291 2264 mutex_enter(&p->p_lock);
2292 2265 sprunlock(p);
2293 2266 return (set_errno(EINVAL));
2294 2267 }
2295 2268
2296 2269 /*
2297 2270 * Don't use MS_INVALCURPROC flag here since that would eventually
2298 2271 * initiate hat invalidation based on curthread. Since we're doing this
2299 2272 * on behalf of a different process, that would erroneously invalidate
2300 2273 * our own process mappings.
2301 2274 */
2302 2275 error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2303 2276 if (error == 0) {
2304 2277 /*
2305 2278 * Since we didn't invalidate during the sync above, we now
2306 2279 * try to invalidate all of the pages in the mapping.
2307 2280 */
2308 2281 map_inval(p, seg, addr, size);
2309 2282 }
2310 2283 AS_LOCK_EXIT(as);
2311 2284
2312 2285 mutex_enter(&p->p_lock);
2313 2286 sprunlock(p);
2314 2287
2315 2288 if (error)
2316 2289 (void) set_errno(error);
2317 2290 return (error);
2318 2291 }
2319 2292 #endif