--- old/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
+++ new/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2012 Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 #pragma ident "%Z%%M% %I% %E% SMI"
28 28
29 29 #include <sys/mman.h>
30 30 #include <sys/param.h>
31 31 #include <sys/stat.h>
32 32 #include <sys/types.h>
33 33 #include <assert.h>
34 34 #include <errno.h>
35 35 #include <fcntl.h>
36 36 #include <libproc.h>
37 37 #include <limits.h>
38 38 #include <procfs.h>
39 39 #include <stdio.h>
40 40 #include <stdlib.h>
41 41 #include <strings.h>
42 42 #include <time.h>
43 43 #include <unistd.h>
44 44 #include "rcapd.h"
45 45 #include "rcapd_rfd.h"
46 46 #include "rcapd_mapping.h"
47 47 #include "utils.h"
48 48
49 49 static int lpc_xmap_update(lprocess_t *);
50 50 #ifdef DEBUG
51 51 extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
52 52 #endif /* DEBUG */
53 53
54 54 /*
55 55 * The number of file descriptors required to grab a process and create an
56 56 * agent in it.
57 57 */
58 58 #define PGRAB_FD_COUNT 10
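/*
 * scan() calls rfd_reserve(PGRAB_FD_COUNT) before each Pgrab() so that the
 * grab cannot fail for want of descriptors; the rfd layer presumably makes
 * room by revoking cached pagedata/xmap descriptors (see revoke_pagedata()
 * and revoke_xmap() below).
 */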
59 59
60 60 /*
61 61 * Record a position in an address space as it corresponds to a prpageheader_t
62 62 * and affiliated structures.
63 63 */
64 64 typedef struct prpageheader_cur {
65 65 int pr_nmap; /* number of mappings in address space */
66 66 int pr_map; /* number of this mapping */
67 67 uint64_t pr_pgoff; /* page offset into mapping */
68 68 uint64_t pr_npage; /* number of pages in mapping */
69 69 uint64_t pr_pagesize; /* page size of mapping */
70 70 uintptr_t pr_addr; /* base of mapping */
71 71 prpageheader_t *pr_prpageheader; /* associated page header */
72 72 void *pr_pdaddr; /* address of page's byte in pagedata */
73 73 prxmap_t *pr_xmap; /* array containing per-segment information */
74 74 int pr_nxmap; /* number of xmaps in array */
75 75 int64_t pr_rss; /* number of resident pages in mapping, */
76 76 /* or -1 if xmap is out of sync */
77 77 int64_t pr_pg_rss; /* number of pageable pages in mapping, or -1 */
78 78 } prpageheader_cur_t;
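/*
 * A pagedata snapshot read from /proc/<pid>/pagedata consists of a
 * prpageheader_t followed, for each of its pr_nmap mappings, by a prasmap_t
 * and then pr_npage one-byte entries holding that page's PG_REFERENCED and
 * PG_MODIFIED bits; each prasmap_t starts on a 64-bit boundary. The cursor
 * above tracks a position within such a snapshot, optionally paired with
 * xmap residency information.
 */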
79 79
80 80 static struct ps_prochandle *scan_pr; /* currently-scanned process's handle */
81 81
82 82 typedef enum {
83 83 STDL_NORMAL,
84 84 STDL_HIGH
85 85 } st_debug_level_t;
86 86
87 87 /*
88 88 * Output a scanning-related debug message.
89 89 */
90 90 /*PRINTFLIKE3*/ /*ARGSUSED*/
91 91 static void
92 92 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
93 93 {
94 94 #ifdef DEBUG_MSG
95 95 va_list alist;
96 96 char *buf;
97 97 size_t len;
98 98
99 99 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
100 100 : RCM_DEBUG))
101 101 return;
102 102
103 103 len = strlen(msg) + LINELEN;
104 104 buf = malloc(len);
105 105 if (buf == NULL)
106 106 return;
107 107 (void) snprintf(buf, len, "%s %s scanner %s",
108 108 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
109 109 lcol->lcol_name, msg);
110 110
111 111 va_start(alist, msg);
112 112 vdprintfe(RCM_DEBUG, buf, alist);
113 113 va_end(alist);
114 114
115 115 free(buf);
116 116 #endif /* DEBUG_MSG */
117 117 }
118 118
119 119 /*
120 120 * Determine the collection's current victim, based on its last. The last
121 121 * victim is returned if still valid; otherwise any other valid (scannable)
122 122 * process in the collection, or NULL if there is none.
123 123 */
124 124 static lprocess_t *
125 125 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
126 126 {
127 127 if (lpc == NULL || !lcollection_member(lcol, lpc))
128 128 lpc = lcol->lcol_lprocess;
129 129
130 130 /*
131 131 * Find the next scannable process, and make it the victim.
132 132 */
133 133 while (lpc != NULL && lpc->lpc_unscannable != 0)
134 134 lpc = lpc->lpc_next;
135 135
136 136 return (lpc);
137 137 }
138 138
139 139 /*
140 140 * Get a process's combined current pagedata (per-page referenced and modified
141 141 * bits) and set the supplied pointer to it. The caller is responsible for
142 142 * freeing the data. If the pagedata is unreadable, a nonzero value is
143 143 * returned, and errno is set. Otherwise, 0 is returned.
144 144 */
145 145 static int
146 146 get_pagedata(prpageheader_t **pghpp, int fd)
147 147 {
148 148 int res;
149 149 struct stat st;
150 150
151 151 redo:
152 152 errno = 0;
153 153 if (fstat(fd, &st) != 0) {
154 154 debug("cannot stat pagedata\n");
155 155 return (-1);
156 156 }
157 157
158 158 errno = 0;
159 159 *pghpp = malloc(st.st_size);
160 160 if (*pghpp == NULL) {
161 161 debug("cannot malloc() %ld bytes for pagedata", st.st_size);
162 162 return (-1);
163 163 }
164 164 (void) bzero(*pghpp, st.st_size);
165 165
166 166 errno = 0;
167 167 if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
168 168 free(*pghpp);
169 169 *pghpp = NULL;
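		/*
		 * A short read (the address space shrank) or E2BIG (it grew
		 * beyond the buffer) means the snapshot raced with address
		 * space changes; discard the buffer and retake the snapshot.
		 */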
170 170 if (res > 0 || errno == E2BIG) {
171 171 debug("pagedata changed size, retrying\n");
172 172 goto redo;
173 173 } else {
174 174 debug("cannot read pagedata");
175 175 return (-1);
176 176 }
177 177 }
178 178
179 179 return (0);
180 180 }
181 181
182 182 /*
183 183 * Return the count of kilobytes of pages represented by the given pagedata
184 184 * which meet the given criteria, having pages which are in all of the states
185 185 * specified by the mask, and in none of the states in the notmask. If the
186 186 * CP_CLEAR flag is set, the pagedata will also be cleared.
187 187 */
188 188 #define CP_CLEAR 1
189 189 static uint64_t
190 190 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
191 191 {
192 192 int map;
193 193 caddr_t cur, end;
194 194 prpageheader_t pgh = *pghp;
195 195 prasmap_t *asmapp;
196 196 uint64_t count = 0;
197 197
198 198 cur = (caddr_t)pghp + sizeof (*pghp);
199 199 for (map = 0; map < pgh.pr_nmap; map++) {
200 200 asmapp = (prasmap_t *)(uintptr_t)cur;
201 201 cur += sizeof (*asmapp);
202 202 end = cur + asmapp->pr_npage;
203 203 while (cur < end) {
204 204 if ((*cur & mask) == mask && (*cur & notmask) == 0)
205 205 count += asmapp->pr_pagesize / 1024;
206 206 if ((flags & CP_CLEAR) != 0)
207 207 *cur = 0;
208 208 cur++;
209 209 }
210 210
211 211 /*
212 212 * Skip to next 64-bit-aligned address to get the next
213 213 * prasmap_t.
214 214 */
215 215 cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
216 216 }
217 217
218 218 return (count);
219 219 }
220 220
221 221 /*
222 222 * Return the amount of memory (in kilobytes) that hasn't been referenced or
223 223 * modified; this is the memory that will be paged out first. Should be
224 224 * written to exclude nonresident pages when sufficient interfaces exist.
225 225 */
226 226 static uint64_t
227 227 unrm_size(lprocess_t *lpc)
228 228 {
229 229 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
230 230 0, PG_MODIFIED | PG_REFERENCED));
231 231 }
232 232
233 233 /*
234 234 * Advance a prpageheader_cur_t to the address space's next mapping, returning
235 235 * its address, or NULL if there is none. Any known nonpageable or nonresident
236 236 * mappings will be skipped over.
237 237 */
238 238 static uintptr_t
239 239 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
240 240 {
241 241 prasmap_t *pap;
242 242 int i;
243 243
244 244 next:
245 245 ASSERT(pcp->pr_map < pcp->pr_nmap);
246 246 if ((pcp->pr_map + 1) == pcp->pr_nmap)
247 247 return (NULL);
248 248 pcp->pr_map++;
249 249 if (pcp->pr_pgoff < pcp->pr_npage) {
250 250 pcp->pr_pdaddr = (caddr_t)(uintptr_t)
251 251 ((uintptr_t)pcp->pr_pdaddr +
252 252 (pcp->pr_npage - pcp->pr_pgoff));
253 253 pcp->pr_pgoff = pcp->pr_npage;
254 254 }
255 255 /*
256 256 * Skip to next 64-bit-aligned address to get the next prasmap_t.
257 257 */
258 258 pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
259 259 pap = (prasmap_t *)pcp->pr_pdaddr;
260 260 pcp->pr_pgoff = 0;
261 261 pcp->pr_npage = pap->pr_npage;
262 262 pcp->pr_pagesize = pap->pr_pagesize;
263 263 pcp->pr_addr = pap->pr_vaddr;
264 264 pcp->pr_pdaddr = pap + 1;
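	/* The per-page R/M bytes for this mapping follow its prasmap_t. */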
265 265
266 266 /*
267 267 * Skip any known nonpageable mappings. Currently, the only one
268 268 * detected is the schedctl page.
269 269 */
270 270 if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
271 271 MA_ANON)) == 0 && pap->pr_npage == 1) {
272 272 debug("identified nonpageable schedctl mapping at %p\n",
273 273 (void *)pcp->pr_addr);
274 274 goto next;
275 275 }
276 276
277 277 /*
278 278 * Skip mappings with no resident pages. If the xmap does not
279 279 * correspond to the pagedata for any reason, it will be ignored.
280 280 */
281 281 pcp->pr_rss = -1;
282 282 pcp->pr_pg_rss = -1;
283 283 for (i = 0; i < pcp->pr_nxmap; i++) {
284 284 prxmap_t *xmap = &pcp->pr_xmap[i];
285 285
286 286 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
287 287 (pcp->pr_npage * pcp->pr_pagesize)) {
288 288 pcp->pr_rss = xmap->pr_rss;
289 289 /*
290 290 * Remove COW pages from the pageable RSS count.
291 291 */
292 292 if ((xmap->pr_mflags & MA_SHARED) == 0)
293 293 pcp->pr_pg_rss = xmap->pr_anon;
294 294 break;
295 295 }
296 296 }
297 297 if (pcp->pr_rss == 0) {
298 298 debug("identified nonresident mapping at 0x%p\n",
299 299 (void *)pcp->pr_addr);
300 300 goto next;
301 301 } else if (pcp->pr_pg_rss == 0) {
302 302 debug("identified unpageable mapping at 0x%p\n",
303 303 (void *)pcp->pr_addr);
304 304 goto next;
305 305 }
306 306
307 307 return (pcp->pr_addr);
308 308 }
309 309
310 310 /*
311 311 * Advance a prpageheader_cur_t to the mapping's next page, returning its
312 312 * address, or NULL if there is none.
313 313 */
314 314 static void *
315 315 advance_prpageheader_cur(prpageheader_cur_t *pcp)
316 316 {
317 317 ASSERT(pcp->pr_pgoff < pcp->pr_npage);
318 318 if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
319 319 return (NULL);
320 320 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
321 321 pcp->pr_pgoff++;
322 322
323 323 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
324 324 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
325 325 }
326 326
327 327 /*
328 328 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
329 329 * of an address space.
330 330 */
331 331 static void *
332 332 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
333 333 prxmap_t *xmap, int nxmap)
334 334 {
335 335 bzero(pcp, sizeof (*pcp));
336 336 pcp->pr_nmap = php->pr_nmap;
337 337 pcp->pr_map = -1;
338 338 pcp->pr_prpageheader = php;
339 339 pcp->pr_xmap = xmap;
340 340 pcp->pr_nxmap = nxmap;
341 341 pcp->pr_pdaddr = (prpageheader_t *)php + 1;
342 342
343 343 return ((void *)advance_prpageheader_cur_nextmapping(pcp));
344 344 }
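/*
 * A typical exhaustive walk pairs the two advance functions, e.g. (sketch):
 *
 *	prpageheader_cur_t cur;
 *	void *addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
 *	while (addr != NULL) {
 *		... examine *(char *)cur.pr_pdaddr ...
 *		if ((addr = advance_prpageheader_cur(&cur)) == NULL)
 *			addr = (void *)
 *			    advance_prpageheader_cur_nextmapping(&cur);
 *	}
 *
 * as unignore_referenced_mappings() below does.
 */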
345 345
346 346 /*
347 347 * Position a prpageheader_cur_t to the mapped address greater or equal to the
348 348 * given value.
349 349 */
350 350 static void *
351 351 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
352 352 prxmap_t *xmap, int nxmap, void *naddr)
353 353 {
354 354 void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
355 355
356 356 while (addr != NULL && addr <= naddr)
357 357 if (naddr < (void *)((caddr_t)pcp->pr_addr +
358 358 pcp->pr_pagesize * pcp->pr_npage)) {
359 359 uint64_t pgdiff = ((uintptr_t)naddr -
360 360 (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
361 361 pcp->pr_pgoff += pgdiff;
362 362 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
363 363 addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
364 364 pcp->pr_pgoff;
365 365 break;
366 366 } else
367 367 addr =
368 368 (void *)advance_prpageheader_cur_nextmapping(pcp);
369 369
370 370 return (addr);
371 371 }
372 372
373 373 static void
374 374 revoke_pagedata(rfd_t *rfd)
375 375 {
376 376 lprocess_t *lpc = rfd->rfd_data;
377 377
378 378 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
379 379 " process %d\n", (int)lpc->lpc_pid);
380 380 ASSERT(lpc->lpc_pgdata_fd != -1);
381 381 lpc->lpc_pgdata_fd = -1;
382 382 }
383 383
384 384 #ifdef DEBUG
385 385 static void
386 386 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
387 387 {
388 388 prpageheader_cur_t cur;
389 389 void *addr;
390 390
391 391 addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
392 392 ASSERT(*lm == NULL);
393 393 while (addr != NULL) {
394 394 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
395 395 cur.pr_pagesize);
396 396 addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
397 397 }
398 398 }
399 399
400 400 static void
401 401 lmapping_dump(lmapping_t *lm)
402 402 {
403 403 debug("lm: %p\n", (void *)lm);
404 404 while (lm != NULL) {
405 405 debug("\t(%p, %llx\n", (void *)lm->lm_addr,
406 406 (unsigned long long)lm->lm_size);
407 407 lm = lm->lm_next;
408 408 }
409 409 }
410 410 #endif /* DEBUG */
411 411
412 412 /*
413 413 * OR two prpageheader_t's which are supposedly snapshots of the same address
414 414 * space. Intersecting mappings with different page sizes are tolerated but
415 415 * not normalized (not accurate). If the mappings of the two snapshots differ
416 416 * in any regard, the supplied mappings_changed flag will be set.
417 417 */
418 418 static void
419 419 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
420 420 {
421 421 prpageheader_cur_t src_cur;
422 422 prpageheader_cur_t dst_cur;
423 423 uintptr_t src_addr;
424 424 uintptr_t dst_addr;
425 425 int mappings_changed = 0;
426 426
427 427 /*
428 428 * OR source pagedata with the destination, for pages of intersecting
429 429 * mappings.
430 430 */
431 431 src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
432 432 dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
433 433 while (src_addr != NULL && dst_addr != NULL) {
434 434 while (src_addr == dst_addr && src_addr != NULL) {
435 435 *(char *)dst_cur.pr_pdaddr |=
436 436 *(char *)src_cur.pr_pdaddr;
437 437 src_addr = (uintptr_t)advance_prpageheader_cur(
438 438 &src_cur);
439 439 dst_addr = (uintptr_t)advance_prpageheader_cur(
440 440 &dst_cur);
441 441 }
442 442 if (src_addr != dst_addr)
443 443 mappings_changed = 1;
444 444 src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
445 445 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
446 446 while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
447 447 NULL) {
448 448 mappings_changed = 1;
449 449 if (src_addr < dst_addr)
450 450 src_addr = advance_prpageheader_cur_nextmapping(
451 451 &src_cur);
452 452 else
453 453 dst_addr = advance_prpageheader_cur_nextmapping(
454 454 &dst_cur);
455 455 }
456 456 }
457 457
458 458 *mappings_changedp = mappings_changed;
459 459 }
460 460
461 461 /*
462 462 * Merge the current pagedata with that on hand. If the pagedata is
463 463 * unretrievable for any reason, such as the process having exited or being a
464 464 * zombie, a nonzero value is returned, the process should be marked
465 465 * unscannable, and future attempts to scan it should be avoided, since the
466 466 * symptom is probably permanent. If the mappings of either pagedata
467 467 * differ in any respect, the supplied callback will be invoked once.
468 468 */
469 469 static int
470 470 merge_current_pagedata(lprocess_t *lpc,
471 471 void(*mappings_changed_cb) (lprocess_t *))
472 472 {
473 473 prpageheader_t *pghp;
474 474 int mappings_changed = 0;
475 475 uint64_t cnt;
476 476
477 477 if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
478 478 0) {
479 479 char pathbuf[PROC_PATH_MAX];
480 480
481 481 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
482 482 (int)lpc->lpc_pid);
483 483 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
484 484 revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
485 485 get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
486 486 return (-1);
487 487 debug("starting/resuming pagedata collection for %d\n",
488 488 (int)lpc->lpc_pid);
489 489 }
490 490
491 491 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
492 492 if (cnt != 0 || lpc->lpc_rss != 0)
493 493 debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
494 494 (int)lpc->lpc_pid, (unsigned long long)cnt,
495 495 (unsigned long long)lpc->lpc_rss);
496 496 if (lpc->lpc_prpageheader != NULL) {
497 497 /*
498 498 * OR the two snapshots.
499 499 */
500 500 #ifdef DEBUG
501 501 lmapping_t *old = NULL;
502 502 lmapping_t *new = NULL;
503 503
504 504 mklmapping(&new, pghp);
505 505 mklmapping(&old, lpc->lpc_prpageheader);
506 506 #endif /* DEBUG */
507 507 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
508 508 #ifdef DEBUG
509 509 if (((mappings_changed != 0) ^
510 510 (lmapping_dump_diff(old, new) != 0))) {
511 511 debug("lmapping_changed inconsistent with lmapping\n");
512 512 debug("old\n");
513 513 lmapping_dump(old);
514 514 debug("new\n");
515 515 lmapping_dump(new);
516 516 debug("ignored\n");
517 517 lmapping_dump(lpc->lpc_ignore);
518 518 ASSERT(0);
519 519 }
520 520 lmapping_free(&new);
521 521 lmapping_free(&old);
522 522 #endif /* DEBUG */
523 523 free(lpc->lpc_prpageheader);
524 524 } else
525 525 mappings_changed = 1;
526 526 lpc->lpc_prpageheader = pghp;
527 527
528 528 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
529 529 if (cnt != 0 || lpc->lpc_rss != 0)
530 530 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
531 531 (int)lpc->lpc_pid, (unsigned long long)cnt,
532 532 (unsigned long long)lpc->lpc_rss);
533 533 if (mappings_changed != 0) {
534 534 debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
535 535 if (mappings_changed_cb != NULL)
536 536 mappings_changed_cb(lpc);
537 537 }
538 538 return (0);
539 539 }
540 540
541 541 /*
542 542 * Attempt to page out a region of the given process's address space. May
543 543 * return nonzero if not all of the pages are pageable, for any reason.
544 544 */
545 545 static int
546 546 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
547 547 {
548 548 int res;
549 549
550 550 if (end <= start)
551 551 return (0);
552 552
553 553 errno = 0;
554 554 res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
555 555 (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
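	/*
	 * MS_ASYNC requests the flush without waiting for its completion;
	 * MS_INVALCURPROC (an illumos-private flag) appears to invalidate
	 * only the calling process's mappings of the range, so processes
	 * sharing the pages are not penalized.
	 */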
556 556 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
557 557
558 558 /*
559 559 * EBUSY indicates none of the pages have backing store allocated, or
560 560 * some pages were locked, which are less interesting than other
561 561 * conditions, which are noted.
562 562 */
563 563 if (res != 0)
564 564 if (errno == EBUSY)
565 565 res = 0;
566 566 else
567 567 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
568 568 (void *)start, (long long)(end - start), errno);
569 569
570 570 return (res);
571 571 }
572 572
573 573 /*
574 574 * Compute the delta of the victim process's RSS since the last call. If the
575 575 * psinfo cannot be obtained, no work is done, and no error is returned; it is
576 576 * up to the caller to detect the process's termination via other means.
577 577 */
578 578 static int64_t
579 579 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
580 580 {
581 581 int64_t d_rss = 0;
582 582
583 583 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
584 584 lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
585 585 d_rss = (int64_t)new_psinfo->pr_rssize -
586 586 (int64_t)old_psinfo->pr_rssize;
587 587 if (d_rss < 0)
588 588 vic->lpc_collection->lcol_stat.lcols_pg_eff +=
589 589 (- d_rss);
590 590 *old_psinfo = *new_psinfo;
591 591 }
592 592
593 593 return (d_rss);
594 594 }
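/*
 * Callers fold the returned delta back into the running excess, e.g.
 * excess += rss_delta(&new_psinfo, &old_psinfo, vic).
 */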
595 595
596 596 static void
597 597 unignore_mappings(lprocess_t *lpc)
598 598 {
599 599 lmapping_free(&lpc->lpc_ignore);
600 600 }
601 601
602 602 static void
603 603 unignore_referenced_mappings(lprocess_t *lpc)
604 604 {
605 605 prpageheader_cur_t cur;
606 606 void *vicaddr;
607 607
608 608 vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
609 609 while (vicaddr != NULL) {
610 610 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
611 611 != 0) {
612 612 if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
613 613 cur.pr_npage * cur.pr_pagesize) == 0)
614 614 debug("removed mapping 0x%p+0t%llukB from"
615 615 " ignored set\n", (void *)cur.pr_addr,
616 616 (unsigned long long)(cur.pr_npage *
617 617 cur.pr_pagesize / 1024));
618 618 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
619 619 &cur);
620 620 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
621 621 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
622 622 &cur);
623 623 }
624 624 }
625 625
626 626 /*
627 627 * Resume scanning, starting with the last victim, if it is still valid, or any
628 628 * other one, otherwise.
629 629 */
630 630 void
631 631 scan(lcollection_t *lcol, int64_t excess)
632 632 {
633 633 lprocess_t *vic, *lpc;
634 634 void *vicaddr, *endaddr, *nvicaddr;
635 635 prpageheader_cur_t cur;
636 636 psinfo_t old_psinfo, new_psinfo;
637 637 hrtime_t scan_start;
638 638 int res, resumed;
639 639 uint64_t col_unrm_size;
640 640
641 641 st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
642 642 (long long)excess);
643 643
644 644 /*
645 645 * Determine the address to start scanning at, depending on whether
646 646 * scanning can be resumed.
647 647 */
648 648 endaddr = NULL;
649 649 if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
650 650 lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
651 651 vicaddr = lcol->lcol_resaddr;
652 652 st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
653 653 (int)vic->lpc_pid);
654 654 resumed = 1;
655 655 } else {
656 656 vicaddr = NULL;
657 657 resumed = 0;
658 658 }
659 659
660 660 scan_start = gethrtime();
661 661 /*
662 662 * Obtain the most current pagedata for the processes that might be
663 663 * scanned, and remove from the ignored set any mappings which have
664 664 * referenced or modified pages (in the hopes that the pageability of
665 665 * the mapping's pages may have changed). Determine if the
666 666 * unreferenced and unmodified portion is impossibly small to suffice
667 667 * to reduce the excess completely. If so, ignore these bits so that
668 668 * even working set will be paged out.
669 669 */
670 670 col_unrm_size = 0;
671 671 lpc = vic;
672 672 while (lpc != NULL && should_run) {
673 673 if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
674 674 st_debug(STDL_NORMAL, lcol, "process %d:"
675 675 " exited/temporarily unscannable",
676 676 (int)lpc->lpc_pid);
677 677 goto next;
678 678 }
679 679 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
680 680 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
681 681 (unsigned long long)lpc->lpc_unrm,
682 682 (unsigned long long)lpc->lpc_size);
683 683
684 684 if ((lcol->lcol_stat.lcols_scan_count %
685 685 RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
686 686 /*
687 687 * Periodically clear the set of ignored mappings.
688 688 * This will allow processes whose ignored segments'
689 689 * pageability have changed (without a corresponding
690 690 * reference or modification to a page) to be
691 691 * recognized.
692 692 */
693 693 if (lcol->lcol_stat.lcols_scan_count > 0)
694 694 unignore_mappings(lpc);
695 695 } else {
696 696 /*
697 697 * Ensure mappings with referenced or modified pages
698 698 * are not in the ignored set. Their usage might mean
699 699 * the condition which made them unpageable is gone.
700 700 */
701 701 unignore_referenced_mappings(lpc);
702 702 }
703 703 next:
704 704 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
705 705 lpc->lpc_next) : NULL;
706 706 }
707 707 if (col_unrm_size < excess) {
708 708 lpc = vic;
709 709 debug("will not reduce excess with only unreferenced pages\n");
710 710 while (lpc != NULL && should_run) {
711 711 if (lpc->lpc_prpageheader != NULL) {
712 712 (void) count_pages(lpc->lpc_prpageheader,
713 713 CP_CLEAR, 0, 0);
714 714 if (lpc->lpc_pgdata_fd >= 0) {
715 715 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
716 716 debug("could not close %d"
717 717 " lpc_pgdata_fd %d",
718 718 (int)lpc->lpc_pid,
719 719 lpc->lpc_pgdata_fd);
720 720 lpc->lpc_pgdata_fd = -1;
721 721 }
722 722 }
723 723 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
724 724 lpc->lpc_next) : NULL;
725 725 }
726 726 }
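	/*
	 * If the unreferenced/unmodified portion could not cover the excess,
	 * each candidate's R/M bits have now been cleared and its pagedata
	 * descriptor closed, so the pass below may page out even recently
	 * used pages.
	 */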
727 727
728 728 /*
729 729 * Examine each process for pages to remove until the excess is
730 730 * reduced.
731 731 */
732 732 while (vic != NULL && excess > 0 && should_run) {
733 733 /*
734 734 * Skip processes whose death was reported when the merging of
735 735 * pagedata was attempted.
736 736 */
737 737 if (vic->lpc_prpageheader == NULL)
738 738 goto nextproc;
739 739
740 740 /*
741 741 * Obtain optional segment residency information.
742 742 */
743 743 if (lpc_xmap_update(vic) != 0)
744 744 st_debug(STDL_NORMAL, lcol, "process %d: xmap"
745 745 " unreadable; ignoring", (int)vic->lpc_pid);
746 746
747 747 #ifdef DEBUG_MSG
748 748 {
749 749 void *ovicaddr = vicaddr;
750 750 #endif /* DEBUG_MSG */
751 751 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
752 752 vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
753 753 #ifdef DEBUG_MSG
754 754 st_debug(STDL_NORMAL, lcol, "trying to resume from"
755 755 " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
756 756 }
757 757 #endif /* DEBUG_MSG */
758 758
759 759 /*
760 760 * Take control of the victim.
761 761 */
762 762 if (get_psinfo(vic->lpc_pid, &old_psinfo,
763 763 vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
764 764 vic, vic) != 0) {
765 765 st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
766 766 (int)vic->lpc_pid);
767 767 goto nextproc;
768 768 }
769 769 (void) rfd_reserve(PGRAB_FD_COUNT);
770 770 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
771 771 st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
772 772 (int)vic->lpc_pid, res);
773 773 goto nextproc;
774 774 }
775 775 if (Pcreate_agent(scan_pr) != 0) {
776 776 st_debug(STDL_NORMAL, lcol, "cannot control %d",
777 777 (int)vic->lpc_pid);
778 778 goto nextproc;
779 779 }
780 780 /*
781 781 * Be very pessimistic about the state of the agent LWP --
782 782 * verify it's actually stopped.
783 783 */
784 784 errno = 0;
785 785 while (Pstate(scan_pr) == PS_RUN)
786 786 (void) Pwait(scan_pr, 0);
787 787 if (Pstate(scan_pr) != PS_STOP) {
788 788 st_debug(STDL_NORMAL, lcol, "agent not in expected"
789 789 " state (%d)", Pstate(scan_pr));
790 790 goto nextproc;
791 791 }
792 792
793 793 /*
794 794 * Within the victim's address space, find contiguous ranges of
795 795 * unreferenced pages to page out.
796 796 */
797 797 st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
798 798 (int)vic->lpc_pid);
799 799 while (excess > 0 && vicaddr != NULL && should_run) {
800 800 /*
801 801 * Skip mappings in the ignored set. Mappings get
802 802 * placed in the ignored set when all their resident
803 803 * pages are unreferenced and unmodified, yet unpageable
804 804 * -- such as when they are locked, or involved in
805 805 * asynchronous I/O. They will be scanned again when
806 806 * some page is referenced or modified.
807 807 */
808 808 if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
809 809 cur.pr_npage * cur.pr_pagesize)) {
810 810 debug("ignored mapping at 0x%p\n",
811 811 (void *)cur.pr_addr);
812 812 /*
813 813 * Update statistics.
814 814 */
815 815 lcol->lcol_stat.lcols_pg_att +=
816 816 cur.pr_npage * cur.pr_pagesize / 1024;
817 817
818 818 vicaddr = (void *)
819 819 advance_prpageheader_cur_nextmapping(&cur);
820 820 continue;
821 821 }
822 822
823 823 /*
824 824 * Determine a range of unreferenced pages to page out,
825 825 * and clear the R/M bits in the preceding referenced
826 826 * range.
827 827 */
828 828 st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
829 829 " npage %llu\n", vicaddr,
830 830 (unsigned long long)cur.pr_npage);
831 831 while (vicaddr != NULL &&
832 832 *(caddr_t)cur.pr_pdaddr != 0) {
833 833 *(caddr_t)cur.pr_pdaddr = 0;
834 834 vicaddr = advance_prpageheader_cur(&cur);
835 835 }
836 836 st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
837 837 " %p\n", vicaddr, cur.pr_pdaddr);
838 838 if (vicaddr == NULL) {
839 839 /*
840 840 * The end of mapping was reached before any
841 841 * unreferenced pages were seen.
842 842 */
843 843 vicaddr = (void *)
844 844 advance_prpageheader_cur_nextmapping(&cur);
845 845 continue;
846 846 }
847 847 do
848 848 endaddr = advance_prpageheader_cur(&cur);
849 849 while (endaddr != NULL &&
850 850 *(caddr_t)cur.pr_pdaddr == 0 &&
851 851 (((intptr_t)endaddr - (intptr_t)vicaddr) /
852 852 1024) < excess);
853 853 st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
854 854 endaddr, *(caddr_t)cur.pr_pdaddr);
855 855
856 856 /*
857 857 * Page out from vicaddr to the end of the mapping, or
858 858 * endaddr if set, then continue scanning after
859 859 * endaddr, or the next mapping, if not set.
860 860 */
861 861 nvicaddr = endaddr;
862 862 if (endaddr == NULL)
863 863 endaddr = (caddr_t)cur.pr_addr +
864 864 cur.pr_pagesize * cur.pr_npage;
865 865 if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
866 866 0) {
867 867 int64_t d_rss, att;
868 868 int willignore = 0;
869 869
870 870 excess += (d_rss = rss_delta(
871 871 &new_psinfo, &old_psinfo, vic));
872 872
873 873 /*
874 874 * If this pageout attempt was unsuccessful
875 875 * (the resident portion was not affected), and
876 876 * was for the whole mapping, put it in the
877 877 * ignored set, so it will not be scanned again
878 878 * until some page is referenced or modified.
879 879 */
880 880 if (d_rss >= 0 && (void *)cur.pr_addr ==
881 881 vicaddr && (cur.pr_pagesize * cur.pr_npage)
882 882 == ((uintptr_t)endaddr -
883 883 (uintptr_t)vicaddr)) {
884 884 if (lmapping_insert(
885 885 &vic->lpc_ignore,
886 886 cur.pr_addr,
887 887 cur.pr_pagesize *
888 888 cur.pr_npage) != 0)
889 889 debug("not enough memory to add"
890 890 " mapping at %p to ignored"
891 891 " set\n",
892 892 (void *)cur.pr_addr);
893 893 willignore = 1;
894 894 }
895 895
896 896 /*
897 897 * Update statistics.
898 898 */
899 899 lcol->lcol_stat.lcols_pg_att += (att =
900 900 ((intptr_t)endaddr - (intptr_t)vicaddr) /
901 901 1024);
902 902 st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
903 903 "+0t(%llu/%llu)kB%s\n", vicaddr,
904 904 (unsigned long long)((d_rss <
905 905 0) ? - d_rss : 0), (unsigned long long)att,
906 906 willignore ? " (will ignore)" : "");
907 907 } else {
908 908 st_debug(STDL_NORMAL, lcol,
909 909 "process %d: exited/unscannable\n",
910 910 (int)vic->lpc_pid);
911 911 vic->lpc_unscannable = 1;
912 912 goto nextproc;
913 913 }
914 914
915 915 /*
916 916 * Update the statistics file, if it's time.
917 917 */
918 918 check_update_statistics();
919 919
920 920 vicaddr = (nvicaddr != NULL) ? nvicaddr :
921 921 (void *)advance_prpageheader_cur_nextmapping(&cur);
922 922 }
923 923 excess += rss_delta(&new_psinfo, &old_psinfo, vic);
924 924 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
925 925 (long long)excess);
926 926 nextproc:
927 927 /*
928 928 * If a process was grabbed, release it, destroying its agent.
929 929 */
930 930 if (scan_pr != NULL) {
931 931 (void) Prelease(scan_pr, 0);
932 932 scan_pr = NULL;
933 933 }
934 934 lcol->lcol_victim = vic;
935 935 /*
936 936 * Scan the collection at most once. Only if scanning was not
937 937 * aborted for any reason, and the end of lprocess has not been
938 938 * reached, determine the next victim and scan it.
939 939 */
940 940 if (vic != NULL) {
941 941 if (vic->lpc_next != NULL) {
942 942 /*
943 943 * Determine the next process to be scanned.
944 944 */
945 945 if (excess > 0) {
946 946 vic = get_valid_victim(lcol,
947 947 vic->lpc_next);
948 948 vicaddr = 0;
949 949 }
950 950 } else {
951 951 /*
952 952 * A complete scan of the collection was made,
953 953 * so tick the scan counter and stop scanning
954 954 * until the next request.
955 955 */
956 956 lcol->lcol_stat.lcols_scan_count++;
957 957 lcol->lcol_stat.lcols_scan_time_complete
958 958 = lcol->lcol_stat.lcols_scan_time;
959 959 /*
960 960 * If an excess still exists, tick the
961 961 * "ineffective scan" counter, signalling that
962 962 * the cap may be unenforceable.
963 963 */
964 964 if (resumed == 0 && excess > 0)
965 965 lcol->lcol_stat
966 966 .lcols_scan_ineffective++;
967 967 /*
968 968 * Scanning should start at the beginning of
969 969 * the process list at the next request.
970 970 */
971 971 if (excess > 0)
972 972 vic = NULL;
973 973 }
974 974 }
975 975 }
976 976 lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
977 977 st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
978 978 (long long)excess);
979 979
980 980 lcol->lcol_resaddr = vicaddr;
981 981 if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
982 982 lcol->lcol_victim = get_valid_victim(lcol,
983 983 lcol->lcol_victim->lpc_next);
984 984 }
985 985 }
986 986
987 987 /*
988 988 * Abort the scan in progress, and destroy the agent LWP of any grabbed
989 989 * processes.
990 990 */
991 991 void
992 992 scan_abort(void)
993 993 {
994 994 if (scan_pr != NULL)
995 995 (void) Prelease(scan_pr, NULL);
996 996 }
997 997
998 998 static void
999 999 revoke_xmap(rfd_t *rfd)
1000 1000 {
1001 1001 lprocess_t *lpc = rfd->rfd_data;
1002 1002
1003 1003 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1004 1004 ASSERT(lpc->lpc_xmap_fd != -1);
1005 1005 lpc->lpc_xmap_fd = -1;
1006 1006 }
1007 1007
1008 1008 /*
1009 1009 * Retrieve the process's current xmap, which is used to determine the size of
1010 1010 * the resident portion of its segments. Return zero if successful.
1011 1011 */
1012 1012 static int
1013 1013 lpc_xmap_update(lprocess_t *lpc)
1014 1014 {
1015 1015 int res;
1016 1016 struct stat st;
1017 1017
1018 1018 free(lpc->lpc_xmap);
1019 1019 lpc->lpc_xmap = NULL;
1020 1020 lpc->lpc_nxmap = -1;
1021 1021
1022 1022 if (lpc->lpc_xmap_fd == -1) {
1023 1023 char pathbuf[PROC_PATH_MAX];
1024 1024
1025 1025 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1026 1026 (int)lpc->lpc_pid);
1027 1027 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1028 1028 revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1029 1029 return (-1);
1030 1030 }
1031 1031
1032 1032 redo:
1033 1033 errno = 0;
1034 1034 if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1035 1035 debug("cannot stat xmap\n");
1036 1036 (void) rfd_close(lpc->lpc_xmap_fd);
1037 1037 lpc->lpc_xmap_fd = -1;
1038 1038 return (-1);
1039 1039 }
1040 1040
1041 1041 if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1042 1042 debug("xmap wrong size\n");
1043 1043 (void) rfd_close(lpc->lpc_xmap_fd);
1044 1044 lpc->lpc_xmap_fd = -1;
1045 1045 return (-1);
1046 1046 }
1047 1047
1048 1048 lpc->lpc_xmap = malloc(st.st_size);
1049 1049 if (lpc->lpc_xmap == NULL) {
1050 1050 debug("cannot malloc() %ld bytes for xmap", st.st_size);
1051 1051 (void) rfd_close(lpc->lpc_xmap_fd);
1052 1052 lpc->lpc_xmap_fd = -1;
1053 1053 return (-1);
1054 1054 }
1055 1055
1056 1056 if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1057 1057 st.st_size) {
1058 1058 free(lpc->lpc_xmap);
1059 1059 lpc->lpc_xmap = NULL;
1060 1060 if (res > 0) {
1061 1061 debug("xmap changed size, retrying\n");
1062 1062 goto redo;
1063 1063 } else {
1064 1064 debug("cannot read xmap");
1065 1065 return (-1);
1066 1066 }
1067 1067 }
1068 1068 lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1069 1069
1070 1070 return (0);
1071 1071 }