1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2012 Joyent, Inc. All rights reserved.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include <sys/mman.h>
30 #include <sys/param.h>
31 #include <sys/stat.h>
32 #include <sys/types.h>
33 #include <assert.h>
34 #include <errno.h>
35 #include <fcntl.h>
36 #include <libproc.h>
37 #include <limits.h>
38 #include <procfs.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <strings.h>
42 #include <time.h>
43 #include <unistd.h>
44 #include "rcapd.h"
45 #include "rcapd_rfd.h"
46 #include "rcapd_mapping.h"
47 #include "utils.h"
48
49 static int lpc_xmap_update(lprocess_t *);
50 #ifdef DEBUG
51 extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
52 #endif /* DEBUG */
53
54 /*
55 * The number of file descriptors required to grab a process and create an
56 * agent in it.
57 */
58 #define PGRAB_FD_COUNT 10
59
60 /*
61 * Record a position in an address space as it corresponds to a prpageheader_t
62 * and affiliated structures.
63 */
typedef struct prpageheader_cur {
	int pr_nmap;		/* number of mappings in address space */
	int pr_map;		/* number of this mapping */
				/* (starts at -1; see set_prpageheader_cur()) */
	uint64_t pr_pgoff;	/* page offset into mapping */
	uint64_t pr_npage;	/* number of pages in mapping */
	uint64_t pr_pagesize;	/* page size of mapping */
	uintptr_t pr_addr;	/* base of mapping */
	prpageheader_t *pr_prpageheader;	/* associated page header */
	void *pr_pdaddr;	/* address of page's byte in pagedata */
	prxmap_t *pr_xmap;	/* array containing per-segment information */
	int pr_nxmap;		/* number of xmaps in array */
	int64_t pr_rss;		/* number of resident pages in mapping, */
				/* or -1 if xmap is out of sync */
	int64_t pr_pg_rss;	/* number of pageable pages in mapping, or -1 */
} prpageheader_cur_t;
79
static struct ps_prochandle *scan_pr; /* currently-scanned process's handle */

/*
 * Verbosity levels accepted by st_debug().
 */
typedef enum {
	STDL_NORMAL,	/* normal scanning-progress messages */
	STDL_HIGH	/* verbose per-range/per-page detail */
} st_debug_level_t;
86
87 /*
88 * Output a scanning-related debug message.
89 */
90 /*PRINTFLIKE3*/ /*ARGSUSED*/
91 static void
92 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
93 {
94 #ifdef DEBUG_MSG
95 va_list alist;
96 char *buf;
97 size_t len;
98
99 if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
100 : RCM_DEBUG))
101 return;
102
103 len = strlen(msg) + LINELEN;
104 buf = malloc(len);
105 if (buf == NULL)
106 return;
107 (void) snprintf(buf, len, "%s %s scanner %s",
108 (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
109 lcol->lcol_name, msg);
110
111 va_start(alist, msg);
112 vdprintfe(RCM_DEBUG, buf, alist);
113 va_end(alist);
114
115 free(buf);
116 #endif /* DEBUG_MSG */
117 }
118
119 /*
120 * Determine the collection's current victim, based on its last. The last will
121 * be returned, or, if invalid, any other valid process, if the collection has
122 * any.
123 */
124 static lprocess_t *
125 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
126 {
127 if (lpc == NULL || !lcollection_member(lcol, lpc))
128 lpc = lcol->lcol_lprocess;
129
130 /*
131 * Find the next scannable process, and make it the victim.
132 */
133 while (lpc != NULL && lpc->lpc_unscannable != 0)
134 lpc = lpc->lpc_next;
135
136 return (lpc);
137 }
138
139 /*
140 * Get a process's combined current pagedata (per-page referenced and modified
141 * bits) and set the supplied pointer to it. The caller is responsible for
142 * freeing the data. If the pagedata is unreadable, a nonzero value is
143 * returned, and errno is set. Otherwise, 0 is returned.
144 */
static int
get_pagedata(prpageheader_t **pghpp, int fd)
{
	int res;
	struct stat st;

redo:
	/* Size the buffer from the current size of the pagedata file. */
	errno = 0;
	if (fstat(fd, &st) != 0) {
		debug("cannot stat pagedata\n");
		return (-1);
	}

	errno = 0;
	*pghpp = malloc(st.st_size);
	if (*pghpp == NULL) {
		debug("cannot malloc() %ld bytes for pagedata", st.st_size);
		return (-1);
	}
	(void) bzero(*pghpp, st.st_size);

	errno = 0;
	if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
		free(*pghpp);
		*pghpp = NULL;
		/*
		 * The address space may have grown between the fstat() and
		 * the read(); a partial read or E2BIG means the snapshot no
		 * longer fits, so re-size the buffer and try again.
		 */
		if (res > 0 || errno == E2BIG) {
			debug("pagedata changed size, retrying\n");
			goto redo;
		} else {
			debug("cannot read pagedata");
			return (-1);
		}
	}

	return (0);
}
181
182 /*
183 * Return the count of kilobytes of pages represented by the given pagedata
184 * which meet the given criteria, having pages which are in all of the states
185 * specified by the mask, and in none of the states in the notmask. If the
186 * CP_CLEAR flag is set, the pagedata will also be cleared.
187 */
188 #define CP_CLEAR 1
189 static uint64_t
190 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
191 {
192 int map;
193 caddr_t cur, end;
194 prpageheader_t pgh = *pghp;
195 prasmap_t *asmapp;
196 uint64_t count = 0;
197
198 cur = (caddr_t)pghp + sizeof (*pghp);
199 for (map = 0; map < pgh.pr_nmap; map++) {
200 asmapp = (prasmap_t *)(uintptr_t)cur;
201 cur += sizeof (*asmapp);
202 end = cur + asmapp->pr_npage;
203 while (cur < end) {
204 if ((*cur & mask) == mask && (*cur & notmask) == 0)
205 count += asmapp->pr_pagesize / 1024;
206 if ((flags & CP_CLEAR) != 0)
207 *cur = 0;
208 cur++;
209 }
210
211 /*
212 * Skip to next 64-bit-aligned address to get the next
213 * prasmap_t.
214 */
215 cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
216 }
217
218 return (count);
219 }
220
221 /*
222 * Return the amount of memory (in kilobytes) that hasn't been referenced or
223 * modified, which memory which will be paged out first. Should be written to
224 * exclude nonresident pages when sufficient interfaces exist.
225 */
226 static uint64_t
227 unrm_size(lprocess_t *lpc)
228 {
229 return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
230 0, PG_MODIFIED | PG_REFERENCED));
231 }
232
233 /*
234 * Advance a prpageheader_cur_t to the address space's next mapping, returning
235 * its address, or NULL if there is none. Any known nonpageable or nonresident
236 * mappings will be skipped over.
237 */
static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
{
	prasmap_t *pap;
	int i;

next:
	ASSERT(pcp->pr_map < pcp->pr_nmap);
	/*
	 * NOTE(review): the function returns uintptr_t but uses NULL (a
	 * pointer constant) as its "no more mappings" sentinel — confirm
	 * this compiles cleanly as 0 on all supported toolchains.
	 */
	if ((pcp->pr_map + 1) == pcp->pr_nmap)
		return (NULL);
	pcp->pr_map++;
	/*
	 * Skip over any pagedata bytes of the current mapping that were not
	 * consumed, so pr_pdaddr lands just past this mapping's byte array.
	 */
	if (pcp->pr_pgoff < pcp->pr_npage) {
		pcp->pr_pdaddr = (caddr_t)(uintptr_t)
		    ((uintptr_t)pcp->pr_pdaddr +
		    (pcp->pr_npage - pcp->pr_pgoff));
		pcp->pr_pgoff = pcp->pr_npage;
	}
	/*
	 * Skip to next 64-bit-aligned address to get the next prasmap_t.
	 */
	pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
	pap = (prasmap_t *)pcp->pr_pdaddr;
	/* Load the cursor with the new mapping's geometry. */
	pcp->pr_pgoff = 0;
	pcp->pr_npage = pap->pr_npage;
	pcp->pr_pagesize = pap->pr_pagesize;
	pcp->pr_addr = pap->pr_vaddr;
	pcp->pr_pdaddr = pap + 1;

	/*
	 * Skip any known nonpageable mappings.  Currently, the only one
	 * detected is the schedctl page.
	 */
	if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
	    MA_ANON)) == 0 && pap->pr_npage == 1) {
		debug("identified nonpageable schedctl mapping at %p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	/*
	 * Skip mappings with no resident pages.  If the xmap does not
	 * correspond to the pagedata for any reason, it will be ignored.
	 */
	pcp->pr_rss = -1;
	pcp->pr_pg_rss = -1;
	for (i = 0; i < pcp->pr_nxmap; i++) {
		prxmap_t *xmap = &pcp->pr_xmap[i];

		/* Match the xmap entry by base address and extent. */
		if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
		    (pcp->pr_npage * pcp->pr_pagesize)) {
			pcp->pr_rss = xmap->pr_rss;
			/*
			 * Remove COW pages from the pageable RSS count.
			 */
			if ((xmap->pr_mflags & MA_SHARED) == 0)
				pcp->pr_pg_rss = xmap->pr_anon;
			break;
		}
	}
	if (pcp->pr_rss == 0) {
		debug("identified nonresident mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	} else if (pcp->pr_pg_rss == 0) {
		debug("identified unpageable mapping at 0x%p\n",
		    (void *)pcp->pr_addr);
		goto next;
	}

	return (pcp->pr_addr);
}
309
310 /*
311 * Advance a prpageheader_cur_t to the mapping's next page, returning its
312 * address, or NULL if there is none.
313 */
314 static void *
315 advance_prpageheader_cur(prpageheader_cur_t *pcp)
316 {
317 ASSERT(pcp->pr_pgoff < pcp->pr_npage);
318 if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
319 return (NULL);
320 pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
321 pcp->pr_pgoff++;
322
323 ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
324 return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
325 }
326
327 /*
328 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
329 * of an address space.
330 */
331 static void *
332 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
333 prxmap_t *xmap, int nxmap)
334 {
335 bzero(pcp, sizeof (*pcp));
336 pcp->pr_nmap = php->pr_nmap;
337 pcp->pr_map = -1;
338 pcp->pr_prpageheader = php;
339 pcp->pr_xmap = xmap;
340 pcp->pr_nxmap = nxmap;
341 pcp->pr_pdaddr = (prpageheader_t *)php + 1;
342
343 return ((void *)advance_prpageheader_cur_nextmapping(pcp));
344 }
345
346 /*
347 * Position a prpageheader_cur_t to the mapped address greater or equal to the
348 * given value.
349 */
static void *
set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap, void *naddr)
{
	void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);

	/*
	 * Walk mappings until one contains naddr (then jump the cursor to
	 * the page holding naddr), or until a mapping beginning beyond naddr
	 * is reached (then return that mapping's first page).
	 */
	while (addr != NULL && addr <= naddr)
		if (naddr < (void *)((caddr_t)pcp->pr_addr +
		    pcp->pr_pagesize * pcp->pr_npage)) {
			/* naddr falls within this mapping; skip to its page */
			uint64_t pgdiff = ((uintptr_t)naddr -
			    (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
			pcp->pr_pgoff += pgdiff;
			pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
			addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
			    pcp->pr_pgoff;
			break;
		} else
			addr =
			    (void *)advance_prpageheader_cur_nextmapping(pcp);

	return (addr);
}
372
373 static void
374 revoke_pagedata(rfd_t *rfd)
375 {
376 lprocess_t *lpc = rfd->rfd_data;
377
378 st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
379 " process %d\n", (int)lpc->lpc_pid);
380 ASSERT(lpc->lpc_pgdata_fd != -1);
381 lpc->lpc_pgdata_fd = -1;
382 }
383
384 #ifdef DEBUG
385 static void
386 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
387 {
388 prpageheader_cur_t cur;
389 void *addr;
390
391 addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
392 ASSERT(*lm == NULL);
393 while (addr != NULL) {
394 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
395 cur.pr_pagesize);
396 addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
397 }
398 }
399
400 static void
401 lmapping_dump(lmapping_t *lm)
402 {
403 debug("lm: %p\n", (void *)lm);
404 while (lm != NULL) {
405 debug("\t(%p, %llx\n", (void *)lm->lm_addr,
406 (unsigned long long)lm->lm_size);
407 lm = lm->lm_next;
408 }
409 }
410 #endif /* DEBUG */
411
412 /*
413 * OR two prpagedata_t which are supposedly snapshots of the same address
414 * space. Intersecting mappings with different page sizes are tolerated but
415 * not normalized (not accurate). If the mappings of the two snapshots differ
416 * in any regard, the supplied mappings_changed flag will be set.
417 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
	prpageheader_cur_t src_cur;
	prpageheader_cur_t dst_cur;
	uintptr_t src_addr;
	uintptr_t dst_addr;
	int mappings_changed = 0;

	/*
	 * OR source pagedata with the destination, for pages of intersecting
	 * mappings.
	 */
	src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
	dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
	while (src_addr != NULL && dst_addr != NULL) {
		/* Advance both cursors in lockstep while pages coincide. */
		while (src_addr == dst_addr && src_addr != NULL) {
			*(char *)dst_cur.pr_pdaddr |=
			    *(char *)src_cur.pr_pdaddr;
			src_addr = (uintptr_t)advance_prpageheader_cur(
			    &src_cur);
			dst_addr = (uintptr_t)advance_prpageheader_cur(
			    &dst_cur);
		}
		/* A mid-mapping divergence means the mappings differ. */
		if (src_addr != dst_addr)
			mappings_changed = 1;
		src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
		dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
		/*
		 * Resynchronize: step the cursor that is behind until both
		 * snapshots agree on a mapping address again.
		 */
		while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
		    NULL) {
			mappings_changed = 1;
			if (src_addr < dst_addr)
				src_addr = advance_prpageheader_cur_nextmapping(
				    &src_cur);
			else
				dst_addr = advance_prpageheader_cur_nextmapping(
				    &dst_cur);
		}
	}

	*mappings_changedp = mappings_changed;
}
460
461 /*
462 * Merge the current pagedata with that on hand. If the pagedata is
463 * unretrievable for any reason, such as the process having exited or being a
464 * zombie, a nonzero value is returned, the process should be marked
465 * unscannable, and future attempts to scan it should be avoided, since the
466 * symptom is probably permament. If the mappings of either pagedata
467 * differ in any respect, the supplied callback will be invoked once.
468 */
static int
merge_current_pagedata(lprocess_t *lpc,
    void(*mappings_changed_cb) (lprocess_t *))
{
	prpageheader_t *pghp;
	int mappings_changed = 0;
	uint64_t cnt;

	/*
	 * (Re)open the pagedata file if there is no cached descriptor, or if
	 * reading through the cached one failed (e.g. it was revoked).
	 */
	if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
	    0) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
		    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
		    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
			return (-1);
		debug("starting/resuming pagedata collection for %d\n",
		    (int)lpc->lpc_pid);
	}

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (lpc->lpc_prpageheader != NULL) {
		/*
		 * OR the two snapshots.
		 */
#ifdef DEBUG
		lmapping_t *old = NULL;
		lmapping_t *new = NULL;

		mklmapping(&new, pghp);
		mklmapping(&old, lpc->lpc_prpageheader);
#endif /* DEBUG */
		OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
#ifdef DEBUG
		/*
		 * Cross-check OR_pagedata's mappings-changed result against
		 * an independent diff of the two mapping lists.
		 */
		if (((mappings_changed != 0) ^
		    (lmapping_dump_diff(old, new) != 0))) {
			debug("lmapping_changed inconsistent with lmapping\n");
			debug("old\n");
			lmapping_dump(old);
			debug("new\n");
			lmapping_dump(new);
			debug("ignored\n");
			lmapping_dump(lpc->lpc_ignore);
			ASSERT(0);
		}
		lmapping_free(&new);
		lmapping_free(&old);
#endif /* DEBUG */
		free(lpc->lpc_prpageheader);
	} else
		mappings_changed = 1;
	/* The merged snapshot becomes the process's pagedata of record. */
	lpc->lpc_prpageheader = pghp;

	cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
	if (cnt != 0 || lpc->lpc_rss != 0)
		debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
		    (int)lpc->lpc_pid, (unsigned long long)cnt,
		    (unsigned long long)lpc->lpc_rss);
	if (mappings_changed != 0) {
		debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
		if (mappings_changed_cb != NULL)
			mappings_changed_cb(lpc);
	}
	return (0);
}
540
541 /*
542 * Attempt to page out a region of the given process's address space. May
543 * return nonzero if not all of the pages may are pageable, for any reason.
544 */
545 static int
546 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
547 {
548 int res;
549
550 if (end <= start)
551 return (0);
552
553 errno = 0;
554 res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
555 (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
556 debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
557
558 /*
559 * EBUSY indicates none of the pages have backing store allocated, or
560 * some pages were locked, which are less interesting than other
561 * conditions, which are noted.
562 */
563 if (res != 0)
564 if (errno == EBUSY)
565 res = 0;
566 else
567 debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
568 (void *)start, (long long)(end - start), errno);
569
570 return (res);
571 }
572
573 /*
574 * Compute the delta of the victim process's RSS since the last call. If the
575 * psinfo cannot be obtained, no work is done, and no error is returned; it is
576 * up to the caller to detect the process' termination via other means.
577 */
578 static int64_t
579 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
580 {
581 int64_t d_rss = 0;
582
583 if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
584 lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
585 d_rss = (int64_t)new_psinfo->pr_rssize -
586 (int64_t)old_psinfo->pr_rssize;
587 if (d_rss < 0)
588 vic->lpc_collection->lcol_stat.lcols_pg_eff +=
589 (- d_rss);
590 *old_psinfo = *new_psinfo;
591 }
592
593 return (d_rss);
594 }
595
596 static void
597 unignore_mappings(lprocess_t *lpc)
598 {
599 lmapping_free(&lpc->lpc_ignore);
600 }
601
static void
unignore_referenced_mappings(lprocess_t *lpc)
{
	prpageheader_cur_t cur;
	void *vicaddr;

	vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
	/*
	 * Walk every page of every mapping; as soon as a mapping shows a
	 * referenced or modified page, drop it from the ignored set and
	 * skip directly to the next mapping.
	 */
	while (vicaddr != NULL) {
		if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
		    != 0) {
			if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize) == 0)
				debug("removed mapping 0x%p+0t%llukB from"
				    " ignored set\n", (void *)cur.pr_addr,
				    (unsigned long long)(cur.pr_npage *
				    cur.pr_pagesize / 1024));
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
		} else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
			/* End of this mapping's pages; move to the next. */
			vicaddr = (void *)advance_prpageheader_cur_nextmapping(
			    &cur);
	}
}
625
626 /*
627 * Resume scanning, starting with the last victim, if it is still valid, or any
628 * other one, otherwise.
629 */
void
scan(lcollection_t *lcol, int64_t excess)
{
	lprocess_t *vic, *lpc;
	void *vicaddr, *endaddr, *nvicaddr;
	prpageheader_cur_t cur;
	psinfo_t old_psinfo, new_psinfo;
	hrtime_t scan_start;
	int res, resumed;
	uint64_t col_unrm_size;

	st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
	    (long long)excess);

	/*
	 * Determine the address to start scanning at, depending on whether
	 * scanning can be resumed.
	 */
	endaddr = NULL;
	if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
	    lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
		vicaddr = lcol->lcol_resaddr;
		st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
		    (int)vic->lpc_pid);
		resumed = 1;
	} else {
		vicaddr = NULL;
		resumed = 0;
	}

	scan_start = gethrtime();
	/*
	 * Obtain the most current pagedata for the processes that might be
	 * scanned, and remove from the ignored set any mappings which have
	 * referenced or modified pages (in the hopes that the pageability of
	 * the mapping's pages may have changed).  Determine if the
	 * unreferenced and unmodified portion is impossibly small to suffice
	 * to reduce the excess completely.  If so, ignore these bits so that
	 * even working set will be paged out.
	 */
	col_unrm_size = 0;
	lpc = vic;
	while (lpc != NULL && should_run) {
		if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
			st_debug(STDL_NORMAL, lcol, "process %d:"
			    " exited/temporarily unscannable",
			    (int)lpc->lpc_pid);
			goto next;
		}
		/*
		 * NOTE(review): when DEBUG_MSG is defined, unrm_size() is
		 * evaluated both inside the debug() call and again below;
		 * the first call clears the R/M bits (CP_CLEAR), so the
		 * second returns a different count — confirm intended.
		 */
		debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
		    (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
		    (unsigned long long)lpc->lpc_size);
		col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);

		if ((lcol->lcol_stat.lcols_scan_count %
		    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
			/*
			 * Periodically clear the set of ignored mappings.
			 * This will allow processes whose ignored segments'
			 * pageability have changed (without a corresponding
			 * reference or modification to a page) to be
			 * recognized.
			 */
			if (lcol->lcol_stat.lcols_scan_count > 0)
				unignore_mappings(lpc);
		} else {
			/*
			 * Ensure mappings with referenced or modified pages
			 * are not in the ignored set.  Their usage might mean
			 * the condition which made them unpageable is gone.
			 */
			unignore_referenced_mappings(lpc);
		}
next:
		lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
		    lpc->lpc_next) : NULL;
	}
	if (col_unrm_size < excess) {
		/*
		 * Not enough unreferenced memory to cover the excess; clear
		 * all R/M bits so that even working set becomes a candidate,
		 * and restart pagedata collection from scratch.
		 */
		lpc = vic;
		debug("will not reduce excess with only unreferenced pages\n");
		while (lpc != NULL && should_run) {
			if (lpc->lpc_prpageheader != NULL) {
				(void) count_pages(lpc->lpc_prpageheader,
				    CP_CLEAR, 0, 0);
				if (lpc->lpc_pgdata_fd >= 0) {
					if (rfd_close(lpc->lpc_pgdata_fd) != 0)
						debug("coud not close %d"
						    " lpc_pgdata_fd %d",
						    (int)lpc->lpc_pid,
						    lpc->lpc_pgdata_fd);
					lpc->lpc_pgdata_fd = -1;
				}
			}
			lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
			    lpc->lpc_next) : NULL;
		}
	}

	/*
	 * Examine each process for pages to remove until the excess is
	 * reduced.
	 */
	while (vic != NULL && excess > 0 && should_run) {
		/*
		 * Skip processes whose death was reported when the merging of
		 * pagedata was attempted.
		 */
		if (vic->lpc_prpageheader == NULL)
			goto nextproc;

		/*
		 * Obtain optional segment residency information.
		 */
		if (lpc_xmap_update(vic) != 0)
			st_debug(STDL_NORMAL, lcol, "process %d: xmap"
			    " unreadable; ignoring", (int)vic->lpc_pid);

#ifdef DEBUG_MSG
		{
			void *ovicaddr = vicaddr;
#endif /* DEBUG_MSG */
		vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
		    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
#ifdef DEBUG_MSG
			st_debug(STDL_NORMAL, lcol, "trying to resume from"
			    " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
		}
#endif /* DEBUG_MSG */

		/*
		 * Take control of the victim.
		 */
		if (get_psinfo(vic->lpc_pid, &old_psinfo,
		    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
		    vic, vic) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		(void) rfd_reserve(PGRAB_FD_COUNT);
		if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
			st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
			    (int)vic->lpc_pid, res);
			goto nextproc;
		}
		if (Pcreate_agent(scan_pr) != 0) {
			st_debug(STDL_NORMAL, lcol, "cannot control %d",
			    (int)vic->lpc_pid);
			goto nextproc;
		}
		/*
		 * Be very pessimistic about the state of the agent LWP --
		 * verify it's actually stopped.
		 */
		errno = 0;
		while (Pstate(scan_pr) == PS_RUN)
			(void) Pwait(scan_pr, 0);
		if (Pstate(scan_pr) != PS_STOP) {
			st_debug(STDL_NORMAL, lcol, "agent not in expected"
			    " state (%d)", Pstate(scan_pr));
			goto nextproc;
		}

		/*
		 * Within the victim's address space, find contiguous ranges of
		 * unreferenced pages to page out.
		 */
		st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
		    (int)vic->lpc_pid);
		while (excess > 0 && vicaddr != NULL && should_run) {
			/*
			 * Skip mappings in the ignored set.  Mappings get
			 * placed in the ignored set when all their resident
			 * pages are unreferenced and unmodified, yet
			 * unpageable -- such as when they are locked, or
			 * involved in asynchronous I/O.  They will be scanned
			 * again when some page is referenced or modified.
			 */
			if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
			    cur.pr_npage * cur.pr_pagesize)) {
				debug("ignored mapping at 0x%p\n",
				    (void *)cur.pr_addr);
				/*
				 * Update statistics.
				 */
				lcol->lcol_stat.lcols_pg_att +=
				    cur.pr_npage * cur.pr_pagesize / 1024;

				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}

			/*
			 * Determine a range of unreferenced pages to page out,
			 * and clear the R/M bits in the preceding referenced
			 * range.
			 */
			st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
			    " npage %llu\n", vicaddr,
			    (unsigned long long)cur.pr_npage);
			while (vicaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr != 0) {
				*(caddr_t)cur.pr_pdaddr = 0;
				vicaddr = advance_prpageheader_cur(&cur);
			}
			st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
			    " %p\n", vicaddr, cur.pr_pdaddr);
			if (vicaddr == NULL) {
				/*
				 * The end of mapping was reached before any
				 * unreferenced pages were seen.
				 */
				vicaddr = (void *)
				    advance_prpageheader_cur_nextmapping(&cur);
				continue;
			}
			/*
			 * Extend the range over consecutive unreferenced
			 * pages, bounded by the remaining excess.
			 */
			do
				endaddr = advance_prpageheader_cur(&cur);
			while (endaddr != NULL &&
			    *(caddr_t)cur.pr_pdaddr == 0 &&
			    (((intptr_t)endaddr - (intptr_t)vicaddr) /
			    1024) < excess);
			st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
			    endaddr, *(caddr_t)cur.pr_pdaddr);

			/*
			 * Page out from vicaddr to the end of the mapping, or
			 * endaddr if set, then continue scanning after
			 * endaddr, or the next mapping, if not set.
			 */
			nvicaddr = endaddr;
			if (endaddr == NULL)
				endaddr = (caddr_t)cur.pr_addr +
				    cur.pr_pagesize * cur.pr_npage;
			if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
			    0) {
				int64_t d_rss, att;
				int willignore = 0;

				excess += (d_rss = rss_delta(
				    &new_psinfo, &old_psinfo, vic));

				/*
				 * If this pageout attempt was unsuccessful
				 * (the resident portion was not affected), and
				 * was for the whole mapping, put it in the
				 * ignored set, so it will not be scanned again
				 * until some page is referenced or modified.
				 */
				if (d_rss >= 0 && (void *)cur.pr_addr ==
				    vicaddr && (cur.pr_pagesize * cur.pr_npage)
				    == ((uintptr_t)endaddr -
				    (uintptr_t)vicaddr)) {
					if (lmapping_insert(
					    &vic->lpc_ignore,
					    cur.pr_addr,
					    cur.pr_pagesize *
					    cur.pr_npage) != 0)
						debug("not enough memory to add"
						    " mapping at %p to ignored"
						    " set\n",
						    (void *)cur.pr_addr);
					willignore = 1;
				}

				/*
				 * Update statistics.
				 */
				lcol->lcol_stat.lcols_pg_att += (att =
				    ((intptr_t)endaddr - (intptr_t)vicaddr) /
				    1024);
				st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
				    "+0t(%llu/%llu)kB%s\n", vicaddr,
				    (unsigned long long)((d_rss <
				    0) ? - d_rss : 0), (unsigned long long)att,
				    willignore ? " (will ignore)" : "");
			} else {
				st_debug(STDL_NORMAL, lcol,
				    "process %d: exited/unscannable\n",
				    (int)vic->lpc_pid);
				vic->lpc_unscannable = 1;
				goto nextproc;
			}

			/*
			 * Update the statistics file, if it's time.
			 */
			check_update_statistics();

			vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
			    *)advance_prpageheader_cur_nextmapping(&cur);
		}
		excess += rss_delta(&new_psinfo, &old_psinfo, vic);
		st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
		    (long long)excess);
nextproc:
		/*
		 * If a process was grabbed, release it, destroying its agent.
		 */
		if (scan_pr != NULL) {
			(void) Prelease(scan_pr, 0);
			scan_pr = NULL;
		}
		lcol->lcol_victim = vic;
		/*
		 * Scan the collection at most once.  Only if scanning was not
		 * aborted for any reason, and the end of lprocess has not been
		 * reached, determine the next victim and scan it.
		 */
		if (vic != NULL) {
			if (vic->lpc_next != NULL) {
				/*
				 * Determine the next process to be scanned.
				 */
				if (excess > 0) {
					vic = get_valid_victim(lcol,
					    vic->lpc_next);
					vicaddr = 0;
				}
			} else {
				/*
				 * A complete scan of the collection was made,
				 * so tick the scan counter and stop scanning
				 * until the next request.
				 */
				lcol->lcol_stat.lcols_scan_count++;
				lcol->lcol_stat.lcols_scan_time_complete
				    = lcol->lcol_stat.lcols_scan_time;
				/*
				 * If an excess still exists, tick the
				 * "ineffective scan" counter, signalling that
				 * the cap may be unenforceable.
				 */
				if (resumed == 0 && excess > 0)
					lcol->lcol_stat
					    .lcols_scan_ineffective++;
				/*
				 * Scanning should start at the beginning of
				 * the process list at the next request.
				 */
				if (excess > 0)
					vic = NULL;
			}
		}
	}
	lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
	st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
	    (long long)excess);

	/* Remember where to resume if the scan stopped mid-process. */
	lcol->lcol_resaddr = vicaddr;
	if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
		lcol->lcol_victim = get_valid_victim(lcol,
		    lcol->lcol_victim->lpc_next);
	}
}
986
987 /*
988 * Abort the scan in progress, and destroy the agent LWP of any grabbed
989 * processes.
990 */
991 void
992 scan_abort(void)
993 {
994 if (scan_pr != NULL)
995 (void) Prelease(scan_pr, NULL);
996 }
997
998 static void
999 revoke_xmap(rfd_t *rfd)
1000 {
1001 lprocess_t *lpc = rfd->rfd_data;
1002
1003 debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1004 ASSERT(lpc->lpc_xmap_fd != -1);
1005 lpc->lpc_xmap_fd = -1;
1006 }
1007
1008 /*
1009 * Retrieve the process's current xmap , which is used to determine the size of
1010 * the resident portion of its segments. Return zero if successful.
1011 */
static int
lpc_xmap_update(lprocess_t *lpc)
{
	int res;
	struct stat st;

	/* Discard any previous snapshot before reading a fresh one. */
	free(lpc->lpc_xmap);
	lpc->lpc_xmap = NULL;
	lpc->lpc_nxmap = -1;

	if (lpc->lpc_xmap_fd == -1) {
		char pathbuf[PROC_PATH_MAX];

		(void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
		    (int)lpc->lpc_pid);
		if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
		    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
			return (-1);
	}

redo:
	errno = 0;
	if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
		debug("cannot stat xmap\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	/* The file must be a whole number of prxmap_t records. */
	if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
		debug("xmap wrong size\n");
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	lpc->lpc_xmap = malloc(st.st_size);
	if (lpc->lpc_xmap == NULL) {
		debug("cannot malloc() %ld bytes for xmap", st.st_size);
		(void) rfd_close(lpc->lpc_xmap_fd);
		lpc->lpc_xmap_fd = -1;
		return (-1);
	}

	/*
	 * A short read means the address space changed between the fstat()
	 * and the pread(); re-size the buffer and try again.
	 */
	if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
	    st.st_size) {
		free(lpc->lpc_xmap);
		lpc->lpc_xmap = NULL;
		if (res > 0) {
			debug("xmap changed size, retrying\n");
			goto redo;
		} else {
			debug("cannot read xmap");
			return (-1);
		}
	}
	lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);

	return (0);
}