OS-881 To work around OS-580, add support to invalidate mappings from only a single process
--- old/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
+++ new/usr/src/cmd/rcap/rcapd/rcapd_scanner.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 + * Copyright 2012 Joyent, Inc. All rights reserved.
24 25 */
25 26
26 27 #pragma ident "%Z%%M% %I% %E% SMI"
27 28
28 29 #include <sys/mman.h>
29 30 #include <sys/param.h>
30 31 #include <sys/stat.h>
31 32 #include <sys/types.h>
32 33 #include <assert.h>
33 34 #include <errno.h>
34 35 #include <fcntl.h>
35 36 #include <libproc.h>
36 37 #include <limits.h>
37 38 #include <procfs.h>
38 39 #include <stdio.h>
39 40 #include <stdlib.h>
40 41 #include <strings.h>
41 42 #include <time.h>
42 43 #include <unistd.h>
43 44 #include "rcapd.h"
44 45 #include "rcapd_rfd.h"
45 46 #include "rcapd_mapping.h"
46 47 #include "utils.h"
47 48
48 49 static int lpc_xmap_update(lprocess_t *);
49 50 #ifdef DEBUG
50 51 extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
51 52 #endif /* DEBUG */
52 53
53 54 /*
54 55 * The number of file descriptors required to grab a process and create an
55 56 * agent in it.
56 57 */
57 58 #define PGRAB_FD_COUNT 10
58 59
59 60 /*
60 61 * Record a position in an address space as it corresponds to a prpageheader_t
61 62 * and affiliated structures.
62 63 */
63 64 typedef struct prpageheader_cur {
64 65 int pr_nmap; /* number of mappings in address space */
65 66 int pr_map; /* number of this mapping */
66 67 uint64_t pr_pgoff; /* page offset into mapping */
67 68 uint64_t pr_npage; /* number of pages in mapping */
68 69 uint64_t pr_pagesize; /* page size of mapping */
69 70 uintptr_t pr_addr; /* base of mapping */
70 71 prpageheader_t *pr_prpageheader; /* associated page header */
71 72 void *pr_pdaddr; /* address of page's byte in pagedata */
72 73 prxmap_t *pr_xmap; /* array containing per-segment information */
73 74 int pr_nxmap; /* number of xmaps in array */
74 75 int64_t pr_rss; /* number of resident pages in mapping, */
75 76 /* or -1 if xmap is out of sync */
76 77 int64_t pr_pg_rss; /* number of pageable pages in mapping, or -1 */
77 78 } prpageheader_cur_t;
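For orientation, a minimal sketch (not part of this change) of the cursor idiom the rest of the file builds on: set_prpageheader_cur() positions the cursor at the first mapping of a pagedata snapshot, advance_prpageheader_cur() steps one page within the current mapping, and advance_prpageheader_cur_nextmapping() moves to the next mapping; all three are defined below. pghp is assumed to be a snapshot obtained from get_pagedata().

    prpageheader_cur_t cur;
    void *addr = set_prpageheader_cur(&cur, pghp, NULL, -1);
    while (addr != NULL) {
            /* *(char *)cur.pr_pdaddr holds this page's R/M bits */
            if ((addr = advance_prpageheader_cur(&cur)) == NULL)
                    addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
    }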
78 79
79 80 static struct ps_prochandle *scan_pr; /* currently-scanned process's handle */
80 81
81 82 typedef enum {
82 83 STDL_NORMAL,
83 84 STDL_HIGH
84 85 } st_debug_level_t;
85 86
86 87 /*
87 88 * Output a scanning-related debug message.
88 89 */
89 90 /*PRINTFLIKE3*/ /*ARGSUSED*/
90 91 static void
91 92 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
92 93 {
93 94 #ifdef DEBUG_MSG
94 95 va_list alist;
95 96 char *buf;
96 97 size_t len;
97 98
98 99 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
99 100 : RCM_DEBUG))
100 101 return;
101 102
102 103 len = strlen(msg) + LINELEN;
103 104 buf = malloc(len);
104 105 if (buf == NULL)
105 106 return;
106 107 (void) snprintf(buf, len, "%s %s scanner %s",
107 108 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
108 109 lcol->lcol_name, msg);
109 110
110 111 va_start(alist, msg);
111 112 vdprintfe(RCM_DEBUG, buf, alist);
112 113 va_end(alist);
113 114
114 115 free(buf);
115 116 #endif /* DEBUG_MSG */
116 117 }
117 118
118 119 /*
119 120 * Determine the collection's current victim, based on its last victim. That
120 121 * process will be returned or, if no longer valid, any other valid process in
121 122 * the collection, if it has any.
122 123 */
123 124 static lprocess_t *
124 125 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
125 126 {
126 127 if (lpc == NULL || !lcollection_member(lcol, lpc))
127 128 lpc = lcol->lcol_lprocess;
128 129
129 130 /*
130 131 * Find the next scannable process, and make it the victim.
131 132 */
132 133 while (lpc != NULL && lpc->lpc_unscannable != 0)
133 134 lpc = lpc->lpc_next;
134 135
135 136 return (lpc);
136 137 }
137 138
138 139 /*
139 140 * Get a process's combined current pagedata (per-page referenced and modified
140 141 * bits) and set the supplied pointer to it. The caller is responsible for
141 142 * freeing the data. If the pagedata is unreadable, a nonzero value is
142 143 * returned, and errno is set. Otherwise, 0 is returned.
143 144 */
144 145 static int
145 146 get_pagedata(prpageheader_t **pghpp, int fd)
146 147 {
147 148 int res;
148 149 struct stat st;
149 150
150 151 redo:
151 152 errno = 0;
152 153 if (fstat(fd, &st) != 0) {
153 154 debug("cannot stat pagedata\n");
154 155 return (-1);
155 156 }
156 157
157 158 errno = 0;
158 159 *pghpp = malloc(st.st_size);
159 160 if (*pghpp == NULL) {
160 161 debug("cannot malloc() %ld bytes for pagedata", st.st_size);
161 162 return (-1);
162 163 }
163 164 (void) bzero(*pghpp, st.st_size);
164 165
165 166 errno = 0;
166 167 if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
167 168 free(*pghpp);
168 169 *pghpp = NULL;
169 170 if (res > 0 || errno == E2BIG) {
170 171 debug("pagedata changed size, retrying\n");
171 172 goto redo;
172 173 } else {
173 174 debug("cannot read pagedata");
174 175 return (-1);
175 176 }
176 177 }
177 178
178 179 return (0);
179 180 }
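A short usage sketch for the routine above; the real caller is merge_current_pagedata() below. pgdata_fd is a hypothetical name for an open /proc/<pid>/pagedata descriptor, and the snapshot must be freed by the caller, per the contract above.

    prpageheader_t *pghp;
    if (get_pagedata(&pghp, pgdata_fd) == 0) {
            /* ... walk the snapshot with set_prpageheader_cur() ... */
            free(pghp);
    }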
180 181
181 182 /*
182 183 * Return the number of kilobytes of pages, represented in the given pagedata,
183 184 * which meet the given criteria: pages in all of the states specified by the
184 185 * mask, and in none of the states in the notmask. If the
185 186 * CP_CLEAR flag is set, the pagedata will also be cleared.
186 187 */
187 188 #define CP_CLEAR 1
188 189 static uint64_t
189 190 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
190 191 {
191 192 int map;
192 193 caddr_t cur, end;
193 194 prpageheader_t pgh = *pghp;
194 195 prasmap_t *asmapp;
195 196 uint64_t count = 0;
196 197
197 198 cur = (caddr_t)pghp + sizeof (*pghp);
198 199 for (map = 0; map < pgh.pr_nmap; map++) {
199 200 asmapp = (prasmap_t *)(uintptr_t)cur;
200 201 cur += sizeof (*asmapp);
201 202 end = cur + asmapp->pr_npage;
202 203 while (cur < end) {
203 204 if ((*cur & mask) == mask && (*cur & notmask) == 0)
204 205 count += asmapp->pr_pagesize / 1024;
205 206 if ((flags & CP_CLEAR) != 0)
206 207 *cur = 0;
207 208 cur++;
208 209 }
209 210
210 211 /*
211 212 * Skip to next 64-bit-aligned address to get the next
212 213 * prasmap_t.
213 214 */
214 215 cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
215 216 }
216 217
217 218 return (count);
218 219 }
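For reference, the two query shapes this helper serves elsewhere in the file (see unrm_size() next and merge_current_pagedata() below); pghp is again assumed to be a get_pagedata() snapshot:

    /* kilobytes of pages both referenced and modified */
    uint64_t rm = count_pages(pghp, 0, PG_REFERENCED | PG_MODIFIED, 0);

    /*
     * kilobytes of pages neither referenced nor modified; CP_CLEAR
     * also zeroes every page's R/M byte in the snapshot
     */
    uint64_t unrm = count_pages(pghp, CP_CLEAR, 0,
        PG_REFERENCED | PG_MODIFIED);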
219 220
220 221 /*
221 222 * Return the amount of memory (in kilobytes) that hasn't been referenced or
222 223 * modified, which is the memory that will be paged out first. Should be written to
223 224 * exclude nonresident pages when sufficient interfaces exist.
224 225 */
225 226 static uint64_t
226 227 unrm_size(lprocess_t *lpc)
227 228 {
228 229 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
229 230 0, PG_MODIFIED | PG_REFERENCED));
230 231 }
231 232
232 233 /*
233 234 * Advance a prpageheader_cur_t to the address space's next mapping, returning
234 235 * its address, or NULL if there is none. Any known nonpageable or nonresident
235 236 * mappings will be skipped over.
236 237 */
237 238 static uintptr_t
238 239 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
239 240 {
240 241 prasmap_t *pap;
241 242 int i;
242 243
243 244 next:
244 245 ASSERT(pcp->pr_map < pcp->pr_nmap);
245 246 if ((pcp->pr_map + 1) == pcp->pr_nmap)
246 247 return (NULL);
247 248 pcp->pr_map++;
248 249 if (pcp->pr_pgoff < pcp->pr_npage) {
249 250 pcp->pr_pdaddr = (caddr_t)(uintptr_t)
250 251 ((uintptr_t)pcp->pr_pdaddr +
251 252 (pcp->pr_npage - pcp->pr_pgoff));
252 253 pcp->pr_pgoff = pcp->pr_npage;
253 254 }
254 255 /*
255 256 * Skip to next 64-bit-aligned address to get the next prasmap_t.
256 257 */
257 258 pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
258 259 pap = (prasmap_t *)pcp->pr_pdaddr;
259 260 pcp->pr_pgoff = 0;
260 261 pcp->pr_npage = pap->pr_npage;
261 262 pcp->pr_pagesize = pap->pr_pagesize;
262 263 pcp->pr_addr = pap->pr_vaddr;
263 264 pcp->pr_pdaddr = pap + 1;
264 265
265 266 /*
266 267 * Skip any known nonpageable mappings. Currently, the only one
267 268 * detected is the schedctl page.
268 269 */
269 270 if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
270 271 MA_ANON)) == 0 && pap->pr_npage == 1) {
271 272 debug("identified nonpageable schedctl mapping at %p\n",
272 273 (void *)pcp->pr_addr);
273 274 goto next;
274 275 }
275 276
276 277 /*
277 278 * Skip mappings with no resident pages. If the xmap does not
278 279 * correspond to the pagedata for any reason, it will be ignored.
279 280 */
280 281 pcp->pr_rss = -1;
281 282 pcp->pr_pg_rss = -1;
282 283 for (i = 0; i < pcp->pr_nxmap; i++) {
283 284 prxmap_t *xmap = &pcp->pr_xmap[i];
284 285
285 286 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
286 287 (pcp->pr_npage * pcp->pr_pagesize)) {
287 288 pcp->pr_rss = xmap->pr_rss;
288 289 /*
289 290 * Remove COW pages from the pageable RSS count.
290 291 */
291 292 if ((xmap->pr_mflags & MA_SHARED) == 0)
292 293 pcp->pr_pg_rss = xmap->pr_anon;
293 294 break;
294 295 }
295 296 }
296 297 if (pcp->pr_rss == 0) {
297 298 debug("identified nonresident mapping at 0x%p\n",
298 299 (void *)pcp->pr_addr);
299 300 goto next;
300 301 } else if (pcp->pr_pg_rss == 0) {
301 302 debug("identified unpageable mapping at 0x%p\n",
302 303 (void *)pcp->pr_addr);
303 304 goto next;
304 305 }
305 306
306 307 return (pcp->pr_addr);
307 308 }
308 309
309 310 /*
310 311 * Advance a prpageheader_cur_t to the mapping's next page, returning its
311 312 * address, or NULL if there is none.
312 313 */
313 314 static void *
314 315 advance_prpageheader_cur(prpageheader_cur_t *pcp)
315 316 {
316 317 ASSERT(pcp->pr_pgoff < pcp->pr_npage);
317 318 if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
318 319 return (NULL);
319 320 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
320 321 pcp->pr_pgoff++;
321 322
322 323 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
323 324 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
324 325 }
325 326
326 327 /*
327 328 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
328 329 * of an address space.
329 330 */
330 331 static void *
331 332 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
332 333 prxmap_t *xmap, int nxmap)
333 334 {
334 335 bzero(pcp, sizeof (*pcp));
335 336 pcp->pr_nmap = php->pr_nmap;
336 337 pcp->pr_map = -1;
337 338 pcp->pr_prpageheader = php;
338 339 pcp->pr_xmap = xmap;
339 340 pcp->pr_nxmap = nxmap;
340 341 pcp->pr_pdaddr = (prpageheader_t *)php + 1;
341 342
342 343 return ((void *)advance_prpageheader_cur_nextmapping(pcp));
343 344 }
344 345
345 346 /*
346 347 * Position a prpageheader_cur_t to the mapped address greater or equal to the
347 348 * given value.
348 349 */
349 350 static void *
350 351 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
351 352 prxmap_t *xmap, int nxmap, void *naddr)
352 353 {
353 354 void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
354 355
355 356 while (addr != NULL && addr <= naddr)
356 357 if (naddr < (void *)((caddr_t)pcp->pr_addr +
357 358 pcp->pr_pagesize * pcp->pr_npage)) {
358 359 uint64_t pgdiff = ((uintptr_t)naddr -
359 360 (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
360 361 pcp->pr_pgoff += pgdiff;
361 362 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
362 363 addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
363 364 pcp->pr_pgoff;
364 365 break;
365 366 } else
366 367 addr =
367 368 (void *)advance_prpageheader_cur_nextmapping(pcp);
368 369
369 370 return (addr);
370 371 }
371 372
372 373 static void
373 374 revoke_pagedata(rfd_t *rfd)
374 375 {
375 376 lprocess_t *lpc = rfd->rfd_data;
376 377
377 378 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
378 379 " process %d\n", (int)lpc->lpc_pid);
379 380 ASSERT(lpc->lpc_pgdata_fd != -1);
380 381 lpc->lpc_pgdata_fd = -1;
381 382 }
382 383
383 384 #ifdef DEBUG
384 385 static void
385 386 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
386 387 {
387 388 prpageheader_cur_t cur;
388 389 void *addr;
389 390
390 391 addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
391 392 ASSERT(*lm == NULL);
392 393 while (addr != NULL) {
393 394 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
394 395 cur.pr_pagesize);
395 396 addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
396 397 }
397 398 }
398 399
399 400 static void
400 401 lmapping_dump(lmapping_t *lm)
401 402 {
402 403 debug("lm: %p\n", (void *)lm);
403 404 while (lm != NULL) {
404 405 debug("\t(%p, %llx)\n", (void *)lm->lm_addr,
405 406 (unsigned long long)lm->lm_size);
406 407 lm = lm->lm_next;
407 408 }
408 409 }
409 410 #endif /* DEBUG */
410 411
411 412 /*
412 413 * OR two prpageheader_t which are supposedly snapshots of the same address
413 414 * space. Intersecting mappings with different page sizes are tolerated but
414 415 * not normalized (not accurate). If the mappings of the two snapshots differ
415 416 * in any regard, the supplied mappings_changed flag will be set.
416 417 */
417 418 static void
418 419 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
419 420 {
420 421 prpageheader_cur_t src_cur;
421 422 prpageheader_cur_t dst_cur;
422 423 uintptr_t src_addr;
423 424 uintptr_t dst_addr;
424 425 int mappings_changed = 0;
425 426
426 427 /*
427 428 * OR source pagedata with the destination, for pages of intersecting
428 429 * mappings.
429 430 */
430 431 src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
431 432 dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
432 433 while (src_addr != NULL && dst_addr != NULL) {
433 434 while (src_addr == dst_addr && src_addr != NULL) {
434 435 *(char *)dst_cur.pr_pdaddr |=
435 436 *(char *)src_cur.pr_pdaddr;
436 437 src_addr = (uintptr_t)advance_prpageheader_cur(
437 438 &src_cur);
438 439 dst_addr = (uintptr_t)advance_prpageheader_cur(
439 440 &dst_cur);
440 441 }
441 442 if (src_addr != dst_addr)
442 443 mappings_changed = 1;
443 444 src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
444 445 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
445 446 while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
446 447 NULL) {
447 448 mappings_changed = 1;
448 449 if (src_addr < dst_addr)
449 450 src_addr = advance_prpageheader_cur_nextmapping(
450 451 &src_cur);
451 452 else
452 453 dst_addr = advance_prpageheader_cur_nextmapping(
453 454 &dst_cur);
454 455 }
455 456 }
456 457
457 458 *mappings_changedp = mappings_changed;
458 459 }
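A one-byte illustration of why the merge above is an OR rather than a copy: a reference observed only in the older snapshot must survive into the merged data until count_pages() is called with CP_CLEAR to sweep it.

    char old_pd = PG_REFERENCED;    /* touched before the prior read */
    char new_pd = 0;                /* idle in the newer snapshot */

    new_pd |= old_pd;               /* merged byte keeps the history */
    assert((new_pd & PG_REFERENCED) != 0);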
459 460
460 461 /*
461 462 * Merge the current pagedata with that on hand. If the pagedata is
462 463 * unretrievable for any reason, such as the process having exited or being a
463 464 * zombie, a nonzero value is returned, the process should be marked
464 465 * unscannable, and future attempts to scan it should be avoided, since the
465 466 * symptom is probably permanent. If the mappings of either pagedata
466 467 * differ in any respect, the supplied callback will be invoked once.
467 468 */
468 469 static int
469 470 merge_current_pagedata(lprocess_t *lpc,
470 471 void(*mappings_changed_cb) (lprocess_t *))
471 472 {
472 473 prpageheader_t *pghp;
473 474 int mappings_changed = 0;
474 475 uint64_t cnt;
475 476
476 477 if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
477 478 0) {
478 479 char pathbuf[PROC_PATH_MAX];
479 480
480 481 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
481 482 (int)lpc->lpc_pid);
482 483 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
483 484 revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
484 485 get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
485 486 return (-1);
486 487 debug("starting/resuming pagedata collection for %d\n",
487 488 (int)lpc->lpc_pid);
488 489 }
489 490
490 491 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
491 492 if (cnt != 0 || lpc->lpc_rss != 0)
492 493 debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
493 494 (int)lpc->lpc_pid, (unsigned long long)cnt,
494 495 (unsigned long long)lpc->lpc_rss);
495 496 if (lpc->lpc_prpageheader != NULL) {
496 497 /*
497 498 * OR the two snapshots.
498 499 */
499 500 #ifdef DEBUG
500 501 lmapping_t *old = NULL;
501 502 lmapping_t *new = NULL;
502 503
503 504 mklmapping(&new, pghp);
504 505 mklmapping(&old, lpc->lpc_prpageheader);
505 506 #endif /* DEBUG */
506 507 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
507 508 #ifdef DEBUG
508 509 if (((mappings_changed != 0) ^
509 510 (lmapping_dump_diff(old, new) != 0))) {
510 511 debug("lmapping_changed inconsistent with lmapping\n");
511 512 debug("old\n");
512 513 lmapping_dump(old);
513 514 debug("new\n");
514 515 lmapping_dump(new);
515 516 debug("ignored\n");
516 517 lmapping_dump(lpc->lpc_ignore);
517 518 ASSERT(0);
518 519 }
519 520 lmapping_free(&new);
520 521 lmapping_free(&old);
521 522 #endif /* DEBUG */
522 523 free(lpc->lpc_prpageheader);
523 524 } else
524 525 mappings_changed = 1;
525 526 lpc->lpc_prpageheader = pghp;
526 527
527 528 cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
528 529 if (cnt != 0 || lpc->lpc_rss != 0)
529 530 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
530 531 (int)lpc->lpc_pid, (unsigned long long)cnt,
531 532 (unsigned long long)lpc->lpc_rss);
532 533 if (mappings_changed != 0) {
533 534 debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
534 535 if (mappings_changed_cb != NULL)
535 536 mappings_changed_cb(lpc);
536 537 }
537 538 return (0);
538 539 }
539 540
540 541 /*
541 542 * Attempt to page out a region of the given process's address space. May
542 543 * return nonzero if not all of the pages are pageable, for any reason.
543 544 */
544 545 static int
545 546 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
546 547 {
547 548 int res;
548 549
549 550 if (end <= start)
550 551 return (0);
551 552
552 553 errno = 0;
553 554 res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
554 - (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
555 + (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
555 556 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
556 557
557 558 /*
558 559 * EBUSY indicates none of the pages have backing store allocated, or
559 560 * some pages were locked, which are less interesting than other
560 561 * conditions, which are noted.
561 562 */
562 563 if (res != 0)
563 564 if (errno == EBUSY)
564 565 res = 0;
565 566 else
566 567 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
567 568 (void *)start, (long long)(end - start), errno);
568 569
569 570 return (res);
570 571 }
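The MS_INVALCURPROC flag above is the substance of this webrev (old line 554). MS_INVALIDATE tears down every process's mappings of the synced pages, which is the OS-580 behavior being worked around; MS_INVALCURPROC is taken here to invalidate only the calling process's mappings on kernels carrying the matching OS-881 kernel change. A minimal standalone sketch, using plain memcntl(2) rather than the agent-LWP pr_memcntl() wrapper, with pageout_self as a hypothetical helper name:

    #include <sys/types.h>
    #include <sys/mman.h>

    /*
     * Sketch: write back and invalidate only the caller's own
     * mappings of [addr, addr + len). MS_INVALCURPROC is assumed to
     * be defined in <sys/mman.h> only where the OS-881 kernel-side
     * change is present.
     */
    static int
    pageout_self(caddr_t addr, size_t len)
    {
            return (memcntl(addr, len, MC_SYNC,
                (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0));
    }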
571 572
572 573 /*
573 574 * Compute the delta of the victim process's RSS since the last call. If the
574 575 * psinfo cannot be obtained, no work is done, and no error is returned; it is
575 576 * up to the caller to detect the process' termination via other means.
576 577 */
577 578 static int64_t
578 579 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
579 580 {
580 581 int64_t d_rss = 0;
581 582
582 583 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
583 584 lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
584 585 d_rss = (int64_t)new_psinfo->pr_rssize -
585 586 (int64_t)old_psinfo->pr_rssize;
586 587 if (d_rss < 0)
587 588 vic->lpc_collection->lcol_stat.lcols_pg_eff +=
588 589 (- d_rss);
589 590 *old_psinfo = *new_psinfo;
590 591 }
591 592
592 593 return (d_rss);
593 594 }
594 595
595 596 static void
596 597 unignore_mappings(lprocess_t *lpc)
597 598 {
598 599 lmapping_free(&lpc->lpc_ignore);
599 600 }
600 601
601 602 static void
602 603 unignore_referenced_mappings(lprocess_t *lpc)
603 604 {
604 605 prpageheader_cur_t cur;
605 606 void *vicaddr;
606 607
607 608 vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
608 609 while (vicaddr != NULL) {
609 610 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
610 611 != 0) {
611 612 if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
612 613 cur.pr_npage * cur.pr_pagesize) == 0)
613 614 debug("removed mapping 0x%p+0t%llukB from"
614 615 " ignored set\n", (void *)cur.pr_addr,
615 616 (unsigned long long)(cur.pr_npage *
616 617 cur.pr_pagesize / 1024));
617 618 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
618 619 &cur);
619 620 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
620 621 vicaddr = (void *)advance_prpageheader_cur_nextmapping(
621 622 &cur);
622 623 }
623 624 }
624 625
625 626 /*
626 627 * Resume scanning, starting with the last victim, if it is still valid, or any
627 628 * other one, otherwise.
628 629 */
629 630 void
630 631 scan(lcollection_t *lcol, int64_t excess)
631 632 {
632 633 lprocess_t *vic, *lpc;
633 634 void *vicaddr, *endaddr, *nvicaddr;
634 635 prpageheader_cur_t cur;
635 636 psinfo_t old_psinfo, new_psinfo;
636 637 hrtime_t scan_start;
637 638 int res, resumed;
638 639 uint64_t col_unrm_size;
639 640
640 641 st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
641 642 (long long)excess);
642 643
643 644 /*
644 645 * Determine the address to start scanning at, depending on whether
645 646 * scanning can be resumed.
646 647 */
647 648 endaddr = NULL;
648 649 if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
649 650 lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
650 651 vicaddr = lcol->lcol_resaddr;
651 652 st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
652 653 (int)vic->lpc_pid);
653 654 resumed = 1;
654 655 } else {
655 656 vicaddr = NULL;
656 657 resumed = 0;
657 658 }
658 659
659 660 scan_start = gethrtime();
660 661 /*
661 662 * Obtain the most current pagedata for the processes that might be
662 663 * scanned, and remove from the ignored set any mappings which have
663 664 * referenced or modified pages (in the hopes that the pageability of
664 665 * the mapping's pages may have changed). Determine if the
665 666 * unreferenced and unmodified portion is too small to reduce the
666 667 * excess completely. If so, ignore these bits so that even the
667 668 * working set will be paged out.
668 669 */
669 670 col_unrm_size = 0;
670 671 lpc = vic;
671 672 while (lpc != NULL && should_run) {
672 673 if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
673 674 st_debug(STDL_NORMAL, lcol, "process %d:"
674 675 " exited/temporarily unscannable",
675 676 (int)lpc->lpc_pid);
676 677 goto next;
677 678 }
678 679 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
679 680 (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
680 681 (unsigned long long)lpc->lpc_size);
681 682 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
682 683
683 684 if ((lcol->lcol_stat.lcols_scan_count %
684 685 RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
685 686 /*
686 687 * Periodically clear the set of ignored mappings.
687 688 * This will allow processes whose ignored segments'
688 689 * pageability have changed (without a corresponding
689 690 * reference or modification to a page) to be
690 691 * recognized.
691 692 */
692 693 if (lcol->lcol_stat.lcols_scan_count > 0)
693 694 unignore_mappings(lpc);
694 695 } else {
695 696 /*
696 697 * Ensure mappings with referenced or modified pages
697 698 * are not in the ignored set. Their usage might mean
698 699 * the condition which made them unpageable is gone.
699 700 */
700 701 unignore_referenced_mappings(lpc);
701 702 }
702 703 next:
703 704 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
704 705 lpc->lpc_next) : NULL;
705 706 }
706 707 if (col_unrm_size < excess) {
707 708 lpc = vic;
708 709 debug("will not reduce excess with only unreferenced pages\n");
709 710 while (lpc != NULL && should_run) {
710 711 if (lpc->lpc_prpageheader != NULL) {
711 712 (void) count_pages(lpc->lpc_prpageheader,
712 713 CP_CLEAR, 0, 0);
713 714 if (lpc->lpc_pgdata_fd >= 0) {
714 715 if (rfd_close(lpc->lpc_pgdata_fd) != 0)
715 716 debug("could not close %d"
716 717 " lpc_pgdata_fd %d",
717 718 (int)lpc->lpc_pid,
718 719 lpc->lpc_pgdata_fd);
719 720 lpc->lpc_pgdata_fd = -1;
720 721 }
721 722 }
722 723 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
723 724 lpc->lpc_next) : NULL;
724 725 }
725 726 }
726 727
727 728 /*
728 729 * Examine each process for pages to remove until the excess is
729 730 * reduced.
730 731 */
731 732 while (vic != NULL && excess > 0 && should_run) {
732 733 /*
733 734 * Skip processes whose death was reported when the merging of
734 735 * pagedata was attempted.
735 736 */
736 737 if (vic->lpc_prpageheader == NULL)
737 738 goto nextproc;
738 739
739 740 /*
740 741 * Obtain optional segment residency information.
741 742 */
742 743 if (lpc_xmap_update(vic) != 0)
743 744 st_debug(STDL_NORMAL, lcol, "process %d: xmap"
744 745 " unreadable; ignoring", (int)vic->lpc_pid);
745 746
746 747 #ifdef DEBUG_MSG
747 748 {
748 749 void *ovicaddr = vicaddr;
749 750 #endif /* DEBUG_MSG */
750 751 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
751 752 vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
752 753 #ifdef DEBUG_MSG
753 754 st_debug(STDL_NORMAL, lcol, "trying to resume from"
754 755 " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
755 756 }
756 757 #endif /* DEBUG_MSG */
757 758
758 759 /*
759 760 * Take control of the victim.
760 761 */
761 762 if (get_psinfo(vic->lpc_pid, &old_psinfo,
762 763 vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
763 764 vic, vic) != 0) {
764 765 st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
765 766 (int)vic->lpc_pid);
766 767 goto nextproc;
767 768 }
768 769 (void) rfd_reserve(PGRAB_FD_COUNT);
769 770 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
770 771 st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
771 772 (int)vic->lpc_pid, res);
772 773 goto nextproc;
773 774 }
774 775 if (Pcreate_agent(scan_pr) != 0) {
775 776 st_debug(STDL_NORMAL, lcol, "cannot control %d",
776 777 (int)vic->lpc_pid);
777 778 goto nextproc;
778 779 }
779 780 /*
780 781 * Be very pessimistic about the state of the agent LWP --
781 782 * verify it's actually stopped.
782 783 */
783 784 errno = 0;
784 785 while (Pstate(scan_pr) == PS_RUN)
785 786 (void) Pwait(scan_pr, 0);
786 787 if (Pstate(scan_pr) != PS_STOP) {
787 788 st_debug(STDL_NORMAL, lcol, "agent not in expected"
788 789 " state (%d)", Pstate(scan_pr));
789 790 goto nextproc;
790 791 }
791 792
792 793 /*
793 794 * Within the victim's address space, find contiguous ranges of
794 795 * unreferenced pages to page out.
795 796 */
796 797 st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
797 798 (int)vic->lpc_pid);
798 799 while (excess > 0 && vicaddr != NULL && should_run) {
799 800 /*
800 801 * Skip mappings in the ignored set. Mappings get
801 802 * placed in the ignored set when all their resident
802 803 * pages are unreferenced and unmodified, yet unpageable
803 804 * -- such as when they are locked, or involved in
804 805 * asynchronous I/O. They will be scanned again when
805 806 * some page is referenced or modified.
806 807 */
807 808 if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
808 809 cur.pr_npage * cur.pr_pagesize)) {
809 810 debug("ignored mapping at 0x%p\n",
810 811 (void *)cur.pr_addr);
811 812 /*
812 813 * Update statistics.
813 814 */
814 815 lcol->lcol_stat.lcols_pg_att +=
815 816 cur.pr_npage * cur.pr_pagesize / 1024;
816 817
817 818 vicaddr = (void *)
818 819 advance_prpageheader_cur_nextmapping(&cur);
819 820 continue;
820 821 }
821 822
822 823 /*
823 824 * Determine a range of unreferenced pages to page out,
824 825 * and clear the R/M bits in the preceding referenced
825 826 * range.
826 827 */
827 828 st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
828 829 " npage %llu\n", vicaddr,
829 830 (unsigned long long)cur.pr_npage);
830 831 while (vicaddr != NULL &&
831 832 *(caddr_t)cur.pr_pdaddr != 0) {
832 833 *(caddr_t)cur.pr_pdaddr = 0;
833 834 vicaddr = advance_prpageheader_cur(&cur);
834 835 }
835 836 st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
836 837 " %p\n", vicaddr, cur.pr_pdaddr);
837 838 if (vicaddr == NULL) {
838 839 /*
839 840 * The end of mapping was reached before any
840 841 * unreferenced pages were seen.
841 842 */
842 843 vicaddr = (void *)
843 844 advance_prpageheader_cur_nextmapping(&cur);
844 845 continue;
845 846 }
846 847 do
847 848 endaddr = advance_prpageheader_cur(&cur);
848 849 while (endaddr != NULL &&
849 850 *(caddr_t)cur.pr_pdaddr == 0 &&
850 851 (((intptr_t)endaddr - (intptr_t)vicaddr) /
851 852 1024) < excess);
852 853 st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
853 854 endaddr, *(caddr_t)cur.pr_pdaddr);
854 855
855 856 /*
856 857 * Page out from vicaddr to the end of the mapping, or
857 858 * endaddr if set, then continue scanning after
858 859 * endaddr, or the next mapping, if not set.
859 860 */
860 861 nvicaddr = endaddr;
861 862 if (endaddr == NULL)
862 863 endaddr = (caddr_t)cur.pr_addr +
863 864 cur.pr_pagesize * cur.pr_npage;
864 865 if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
865 866 0) {
866 867 int64_t d_rss, att;
867 868 int willignore = 0;
868 869
869 870 excess += (d_rss = rss_delta(
870 871 &new_psinfo, &old_psinfo, vic));
871 872
872 873 /*
873 874 * If this pageout attempt was unsuccessful
874 875 * (the resident portion was not affected), and
875 876 * was for the whole mapping, put it in the
876 877 * ignored set, so it will not be scanned again
877 878 * until some page is referenced or modified.
878 879 */
879 880 if (d_rss >= 0 && (void *)cur.pr_addr ==
880 881 vicaddr && (cur.pr_pagesize * cur.pr_npage)
881 882 == ((uintptr_t)endaddr -
882 883 (uintptr_t)vicaddr)) {
883 884 if (lmapping_insert(
884 885 &vic->lpc_ignore,
885 886 cur.pr_addr,
886 887 cur.pr_pagesize *
887 888 cur.pr_npage) != 0)
888 889 debug("not enough memory to add"
889 890 " mapping at %p to ignored"
890 891 " set\n",
891 892 (void *)cur.pr_addr);
892 893 willignore = 1;
893 894 }
894 895
895 896 /*
896 897 * Update statistics.
897 898 */
898 899 lcol->lcol_stat.lcols_pg_att += (att =
899 900 ((intptr_t)endaddr - (intptr_t)vicaddr) /
900 901 1024);
901 902 st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
902 903 "+0t(%llu/%llu)kB%s\n", vicaddr,
903 904 (unsigned long long)((d_rss <
904 905 0) ? - d_rss : 0), (unsigned long long)att,
905 906 willignore ? " (will ignore)" : "");
906 907 } else {
907 908 st_debug(STDL_NORMAL, lcol,
908 909 "process %d: exited/unscannable\n",
909 910 (int)vic->lpc_pid);
910 911 vic->lpc_unscannable = 1;
911 912 goto nextproc;
912 913 }
913 914
914 915 /*
915 916 * Update the statistics file, if it's time.
916 917 */
917 918 check_update_statistics();
918 919
919 920 vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
920 921 *)advance_prpageheader_cur_nextmapping(&cur);
921 922 }
922 923 excess += rss_delta(&new_psinfo, &old_psinfo, vic);
923 924 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
924 925 (long long)excess);
925 926 nextproc:
926 927 /*
927 928 * If a process was grabbed, release it, destroying its agent.
928 929 */
929 930 if (scan_pr != NULL) {
930 931 (void) Prelease(scan_pr, 0);
931 932 scan_pr = NULL;
932 933 }
933 934 lcol->lcol_victim = vic;
934 935 /*
935 936 * Scan the collection at most once. Only if scanning was not
936 937 * aborted for any reason, and the end of lprocess has not been
937 938 * reached, determine the next victim and scan it.
938 939 */
939 940 if (vic != NULL) {
940 941 if (vic->lpc_next != NULL) {
941 942 /*
942 943 * Determine the next process to be scanned.
943 944 */
944 945 if (excess > 0) {
945 946 vic = get_valid_victim(lcol,
946 947 vic->lpc_next);
947 948 vicaddr = 0;
948 949 }
949 950 } else {
950 951 /*
951 952 * A complete scan of the collection was made,
952 953 * so tick the scan counter and stop scanning
953 954 * until the next request.
954 955 */
955 956 lcol->lcol_stat.lcols_scan_count++;
956 957 lcol->lcol_stat.lcols_scan_time_complete
957 958 = lcol->lcol_stat.lcols_scan_time;
958 959 /*
959 960 * If an excess still exists, tick the
960 961 * "ineffective scan" counter, signalling that
961 962 * the cap may be unenforceable.
962 963 */
963 964 if (resumed == 0 && excess > 0)
964 965 lcol->lcol_stat
965 966 .lcols_scan_ineffective++;
966 967 /*
967 968 * Scanning should start at the beginning of
968 969 * the process list at the next request.
969 970 */
970 971 if (excess > 0)
971 972 vic = NULL;
972 973 }
973 974 }
974 975 }
975 976 lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
976 977 st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
977 978 (long long)excess);
978 979
979 980 lcol->lcol_resaddr = vicaddr;
980 981 if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
981 982 lcol->lcol_victim = get_valid_victim(lcol,
982 983 lcol->lcol_victim->lpc_next);
983 984 }
984 985 }
985 986
986 987 /*
987 988 * Abort the scan in progress, and destroy the agent LWP of any grabbed
988 989 * processes.
989 990 */
990 991 void
991 992 scan_abort(void)
992 993 {
993 994 if (scan_pr != NULL)
994 995 (void) Prelease(scan_pr, NULL);
995 996 }
996 997
997 998 static void
998 999 revoke_xmap(rfd_t *rfd)
999 1000 {
1000 1001 lprocess_t *lpc = rfd->rfd_data;
1001 1002
1002 1003 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1003 1004 ASSERT(lpc->lpc_xmap_fd != -1);
1004 1005 lpc->lpc_xmap_fd = -1;
1005 1006 }
1006 1007
1007 1008 /*
1008 1009 * Retrieve the process's current xmap, which is used to determine the size of
1009 1010 * the resident portion of its segments. Return zero if successful.
1010 1011 */
1011 1012 static int
1012 1013 lpc_xmap_update(lprocess_t *lpc)
1013 1014 {
1014 1015 int res;
1015 1016 struct stat st;
1016 1017
1017 1018 free(lpc->lpc_xmap);
1018 1019 lpc->lpc_xmap = NULL;
1019 1020 lpc->lpc_nxmap = -1;
1020 1021
1021 1022 if (lpc->lpc_xmap_fd == -1) {
1022 1023 char pathbuf[PROC_PATH_MAX];
1023 1024
1024 1025 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1025 1026 (int)lpc->lpc_pid);
1026 1027 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1027 1028 revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1028 1029 return (-1);
1029 1030 }
1030 1031
1031 1032 redo:
1032 1033 errno = 0;
1033 1034 if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1034 1035 debug("cannot stat xmap\n");
1035 1036 (void) rfd_close(lpc->lpc_xmap_fd);
1036 1037 lpc->lpc_xmap_fd = -1;
1037 1038 return (-1);
1038 1039 }
1039 1040
1040 1041 if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1041 1042 debug("xmap wrong size\n");
1042 1043 (void) rfd_close(lpc->lpc_xmap_fd);
1043 1044 lpc->lpc_xmap_fd = -1;
1044 1045 return (-1);
1045 1046 }
1046 1047
1047 1048 lpc->lpc_xmap = malloc(st.st_size);
1048 1049 if (lpc->lpc_xmap == NULL) {
1049 1050 debug("cannot malloc() %ld bytes for xmap", st.st_size);
1050 1051 (void) rfd_close(lpc->lpc_xmap_fd);
1051 1052 lpc->lpc_xmap_fd = -1;
1052 1053 return (-1);
1053 1054 }
1054 1055
1055 1056 if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1056 1057 st.st_size) {
1057 1058 free(lpc->lpc_xmap);
1058 1059 lpc->lpc_xmap = NULL;
1059 1060 if (res > 0) {
1060 1061 debug("xmap changed size, retrying\n");
1061 1062 goto redo;
1062 1063 } else {
1063 1064 debug("cannot read xmap");
1064 1065 return (-1);
1065 1066 }
1066 1067 }
1067 1068 lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1068 1069
1069 1070 return (0);
1070 1071 }
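The xmap read here feeds the cursor machinery above: advance_prpageheader_cur_nextmapping() matches each pagedata mapping against these prxmap_t entries to skip nonresident and unpageable segments. A sketch of the pairing as scan() uses it, assuming lpc is a scannable lprocess_t with current pagedata:

    if (lpc_xmap_update(lpc) == 0) {
            prpageheader_cur_t cur;
            void *addr = set_prpageheader_cur(&cur,
                lpc->lpc_prpageheader, lpc->lpc_xmap, lpc->lpc_nxmap);
            /* cur.pr_rss and cur.pr_pg_rss are now valid per mapping */
    }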