1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 #include <sys/mman.h>
  29 #include <sys/param.h>
  30 #include <sys/stat.h>
  31 #include <sys/types.h>
  32 #include <assert.h>
  33 #include <errno.h>
  34 #include <fcntl.h>
  35 #include <libproc.h>
  36 #include <limits.h>
  37 #include <procfs.h>
  38 #include <stdio.h>
  39 #include <stdlib.h>
  40 #include <strings.h>
  41 #include <time.h>
  42 #include <unistd.h>
  43 #include "rcapd.h"
  44 #include "rcapd_rfd.h"
  45 #include "rcapd_mapping.h"
  46 #include "utils.h"
  47 
/*
 * Forward declaration.  scan() calls this to obtain a victim's optional
 * segment residency information before walking its address space; a nonzero
 * return is treated there as "xmap unreadable" and is not fatal.
 */
static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
/*
 * Only needed in DEBUG builds, where merge_current_pagedata() compares its
 * nonzero-ness against the mappings_changed flag computed by OR_pagedata().
 */
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */
  52 
/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.  scan() passes this count to rfd_reserve() immediately before
 * calling Pgrab() on the victim.
 */
#define PGRAB_FD_COUNT          10
  58 
/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.  A cursor is initialized with
 * set_prpageheader_cur() and then stepped page-by-page with
 * advance_prpageheader_cur() or mapping-by-mapping with
 * advance_prpageheader_cur_nextmapping().
 */
typedef struct prpageheader_cur {
        int pr_nmap;            /* number of mappings in address space */
        int pr_map;             /* number of this mapping; -1 before first */
        uint64_t pr_pgoff;      /* page offset into mapping */
        uint64_t pr_npage;      /* number of pages in mapping */
        uint64_t pr_pagesize;   /* page size of mapping */
        uintptr_t pr_addr;      /* base of mapping */
        prpageheader_t *pr_prpageheader;        /* associated page header */
        void *pr_pdaddr;        /* address of page's byte in pagedata */
        prxmap_t *pr_xmap;      /* array containing per-segment information */
        int pr_nxmap;           /* number of xmaps in array */
        int64_t pr_rss;         /* resident size taken from the xmap entry */
                                /* matching this mapping (same base and */
                                /* size), or -1 if no entry matches */
        int64_t pr_pg_rss;      /* pageable resident size: the matching xmap */
                                /* entry's pr_anon when the mapping is not */
                                /* MA_SHARED; otherwise left at -1 */
} prpageheader_cur_t;
  78 
/*
 * Handle of the process currently being scanned; scan() assigns it the result
 * of Pgrab() on the current victim.
 */
static struct ps_prochandle *scan_pr;   /* currently-scanned process's handle */
  80 
/*
 * Verbosity classes accepted by st_debug().  A STDL_HIGH message is emitted
 * only when the message priority is at least RCM_DEBUG_HIGH; a STDL_NORMAL
 * message requires only RCM_DEBUG.
 */
typedef enum {
        STDL_NORMAL,
        STDL_HIGH
} st_debug_level_t;
  85 
  86 /*
  87  * Output a scanning-related debug message.
  88  */
  89 /*PRINTFLIKE3*/ /*ARGSUSED*/
  90 static void
  91 st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
  92 {
  93 #ifdef DEBUG_MSG
  94         va_list alist;
  95         char *buf;
  96         size_t len;
  97 
  98         if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
  99             : RCM_DEBUG))
 100                 return;
 101 
 102         len = strlen(msg) + LINELEN;
 103         buf = malloc(len);
 104         if (buf == NULL)
 105                 return;
 106         (void) snprintf(buf, len, "%s %s scanner %s",
 107             (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
 108             lcol->lcol_name, msg);
 109 
 110         va_start(alist, msg);
 111         vdprintfe(RCM_DEBUG, buf, alist);
 112         va_end(alist);
 113 
 114         free(buf);
 115 #endif /* DEBUG_MSG */
 116 }
 117 
 118 /*
 119  * Determine the collection's current victim, based on its last.  The last will
 120  * be returned, or, if invalid, any other valid process, if the collection has
 121  * any.
 122  */
 123 static lprocess_t *
 124 get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
 125 {
 126         if (lpc == NULL || !lcollection_member(lcol, lpc))
 127                 lpc = lcol->lcol_lprocess;
 128 
 129         /*
 130          * Find the next scannable process, and make it the victim.
 131          */
 132         while (lpc != NULL && lpc->lpc_unscannable != 0)
 133                 lpc = lpc->lpc_next;
 134 
 135         return (lpc);
 136 }
 137 
 138 /*
 139  * Get a process's combined current pagedata (per-page referenced and modified
 140  * bits) and set the supplied pointer to it.  The caller is responsible for
 141  * freeing the data.  If the pagedata is unreadable, a nonzero value is
 142  * returned, and errno is set.  Otherwise, 0 is returned.
 143  */
 144 static int
 145 get_pagedata(prpageheader_t **pghpp, int fd)
 146 {
 147         int res;
 148         struct stat st;
 149 
 150 redo:
 151         errno = 0;
 152         if (fstat(fd, &st) != 0) {
 153                 debug("cannot stat pagedata\n");
 154                 return (-1);
 155         }
 156 
 157         errno = 0;
 158         *pghpp = malloc(st.st_size);
 159         if (*pghpp == NULL) {
 160                 debug("cannot malloc() %ld bytes for pagedata", st.st_size);
 161                 return (-1);
 162         }
 163         (void) bzero(*pghpp, st.st_size);
 164 
 165         errno = 0;
 166         if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
 167                 free(*pghpp);
 168                 *pghpp = NULL;
 169                 if (res > 0 || errno == E2BIG) {
 170                         debug("pagedata changed size, retrying\n");
 171                         goto redo;
 172                 } else {
 173                         debug("cannot read pagedata");
 174                         return (-1);
 175                 }
 176         }
 177 
 178         return (0);
 179 }
 180 
 181 /*
 182  * Return the count of kilobytes of pages represented by the given pagedata
 183  * which meet the given criteria, having pages which are in all of the states
 184  * specified by the mask, and in none of the states in the notmask.  If the
 185  * CP_CLEAR flag is set, the pagedata will also be cleared.
 186  */
 187 #define CP_CLEAR        1
 188 static uint64_t
 189 count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
 190 {
 191         int map;
 192         caddr_t cur, end;
 193         prpageheader_t pgh = *pghp;
 194         prasmap_t *asmapp;
 195         uint64_t count = 0;
 196 
 197         cur = (caddr_t)pghp + sizeof (*pghp);
 198         for (map = 0; map < pgh.pr_nmap; map++) {
 199                 asmapp = (prasmap_t *)(uintptr_t)cur;
 200                 cur += sizeof (*asmapp);
 201                 end = cur + asmapp->pr_npage;
 202                 while (cur < end) {
 203                         if ((*cur & mask) == mask && (*cur & notmask) == 0)
 204                                 count += asmapp->pr_pagesize / 1024;
 205                         if ((flags & CP_CLEAR) != 0)
 206                                 *cur = 0;
 207                         cur++;
 208                 }
 209 
 210                 /*
 211                  * Skip to next 64-bit-aligned address to get the next
 212                  * prasmap_t.
 213                  */
 214                 cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
 215         }
 216 
 217         return (count);
 218 }
 219 
 220 /*
 221  * Return the amount of memory (in kilobytes) that hasn't been referenced or
 222  * modified, which memory which will be paged out first.  Should be written to
 223  * exclude nonresident pages when sufficient interfaces exist.
 224  */
 225 static uint64_t
 226 unrm_size(lprocess_t *lpc)
 227 {
 228         return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
 229             0, PG_MODIFIED | PG_REFERENCED));
 230 }
 231 
 232 /*
 233  * Advance a prpageheader_cur_t to the address space's next mapping, returning
 234  * its address, or NULL if there is none.  Any known nonpageable or nonresident
 235  * mappings will be skipped over.
 236  */
 237 static uintptr_t
 238 advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
 239 {
 240         prasmap_t *pap;
 241         int i;
 242 
 243 next:
 244         ASSERT(pcp->pr_map < pcp->pr_nmap);
 245         if ((pcp->pr_map + 1) == pcp->pr_nmap)
 246                 return (NULL);
 247         pcp->pr_map++;
 248         if (pcp->pr_pgoff < pcp->pr_npage) {
 249                 pcp->pr_pdaddr = (caddr_t)(uintptr_t)
 250                     ((uintptr_t)pcp->pr_pdaddr +
 251                     (pcp->pr_npage - pcp->pr_pgoff));
 252                 pcp->pr_pgoff = pcp->pr_npage;
 253         }
 254         /*
 255          * Skip to next 64-bit-aligned address to get the next prasmap_t.
 256          */
 257         pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
 258         pap = (prasmap_t *)pcp->pr_pdaddr;
 259         pcp->pr_pgoff = 0;
 260         pcp->pr_npage = pap->pr_npage;
 261         pcp->pr_pagesize = pap->pr_pagesize;
 262         pcp->pr_addr = pap->pr_vaddr;
 263         pcp->pr_pdaddr = pap + 1;
 264 
 265         /*
 266          * Skip any known nonpageable mappings.  Currently, the only one
 267          * detected is the schedctl page.
 268          */
 269         if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
 270             MA_ANON)) == 0 && pap->pr_npage == 1) {
 271                 debug("identified nonpageable schedctl mapping at %p\n",
 272                     (void *)pcp->pr_addr);
 273                 goto next;
 274         }
 275 
 276         /*
 277          * Skip mappings with no resident pages.  If the xmap does not
 278          * correspond to the pagedata for any reason, it will be ignored.
 279          */
 280         pcp->pr_rss = -1;
 281         pcp->pr_pg_rss = -1;
 282         for (i = 0; i < pcp->pr_nxmap; i++) {
 283                 prxmap_t *xmap = &pcp->pr_xmap[i];
 284 
 285                 if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
 286                     (pcp->pr_npage * pcp->pr_pagesize)) {
 287                         pcp->pr_rss = xmap->pr_rss;
 288                         /*
 289                          * Remove COW pages from the pageable RSS count.
 290                          */
 291                         if ((xmap->pr_mflags & MA_SHARED) == 0)
 292                                 pcp->pr_pg_rss = xmap->pr_anon;
 293                         break;
 294                 }
 295         }
 296         if (pcp->pr_rss == 0) {
 297                 debug("identified nonresident mapping at 0x%p\n",
 298                     (void *)pcp->pr_addr);
 299                 goto next;
 300         } else if (pcp->pr_pg_rss == 0) {
 301                 debug("identified unpageable mapping at 0x%p\n",
 302                     (void *)pcp->pr_addr);
 303                 goto next;
 304         }
 305 
 306         return (pcp->pr_addr);
 307 }
 308 
 309 /*
 310  * Advance a prpageheader_cur_t to the mapping's next page, returning its
 311  * address, or NULL if there is none.
 312  */
 313 static void *
 314 advance_prpageheader_cur(prpageheader_cur_t *pcp)
 315 {
 316         ASSERT(pcp->pr_pgoff < pcp->pr_npage);
 317         if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
 318                 return (NULL);
 319         pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
 320         pcp->pr_pgoff++;
 321 
 322         ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
 323         return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
 324 }
 325 
 326 /*
 327  * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
 328  * of an address space.
 329  */
 330 static void *
 331 set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
 332     prxmap_t *xmap, int nxmap)
 333 {
 334         bzero(pcp, sizeof (*pcp));
 335         pcp->pr_nmap = php->pr_nmap;
 336         pcp->pr_map = -1;
 337         pcp->pr_prpageheader = php;
 338         pcp->pr_xmap = xmap;
 339         pcp->pr_nxmap = nxmap;
 340         pcp->pr_pdaddr = (prpageheader_t *)php + 1;
 341 
 342         return ((void *)advance_prpageheader_cur_nextmapping(pcp));
 343 }
 344 
 345 /*
 346  * Position a prpageheader_cur_t to the mapped address greater or equal to the
 347  * given value.
 348  */
 349 static void *
 350 set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
 351     prxmap_t *xmap, int nxmap, void *naddr)
 352 {
 353         void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);
 354 
 355         while (addr != NULL && addr <= naddr)
 356                 if (naddr < (void *)((caddr_t)pcp->pr_addr +
 357                     pcp->pr_pagesize * pcp->pr_npage)) {
 358                         uint64_t pgdiff = ((uintptr_t)naddr -
 359                             (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
 360                         pcp->pr_pgoff += pgdiff;
 361                         pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
 362                         addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
 363                             pcp->pr_pgoff;
 364                         break;
 365                 } else
 366                         addr =
 367                             (void *)advance_prpageheader_cur_nextmapping(pcp);
 368 
 369         return (addr);
 370 }
 371 
 372 static void
 373 revoke_pagedata(rfd_t *rfd)
 374 {
 375         lprocess_t *lpc = rfd->rfd_data;
 376 
 377         st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
 378             " process %d\n", (int)lpc->lpc_pid);
 379         ASSERT(lpc->lpc_pgdata_fd != -1);
 380         lpc->lpc_pgdata_fd = -1;
 381 }
 382 
 383 #ifdef DEBUG
 384 static void
 385 mklmapping(lmapping_t **lm, prpageheader_t *pgh)
 386 {
 387         prpageheader_cur_t cur;
 388         void *addr;
 389 
 390         addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
 391         ASSERT(*lm == NULL);
 392         while (addr != NULL) {
 393                 (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
 394                     cur.pr_pagesize);
 395                 addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
 396         }
 397 }
 398 
 399 static void
 400 lmapping_dump(lmapping_t *lm)
 401 {
 402         debug("lm: %p\n", (void *)lm);
 403         while (lm != NULL) {
 404                 debug("\t(%p, %llx\n", (void *)lm->lm_addr,
 405                     (unsigned long long)lm->lm_size);
 406                 lm = lm->lm_next;
 407         }
 408 }
 409 #endif /* DEBUG */
 410 
 411 /*
 412  * OR two prpagedata_t which are supposedly snapshots of the same address
 413  * space.  Intersecting mappings with different page sizes are tolerated but
 414  * not normalized (not accurate).  If the mappings of the two snapshots differ
 415  * in any regard, the supplied mappings_changed flag will be set.
 416  */
 417 static void
 418 OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
 419 {
 420         prpageheader_cur_t src_cur;
 421         prpageheader_cur_t dst_cur;
 422         uintptr_t src_addr;
 423         uintptr_t dst_addr;
 424         int mappings_changed = 0;
 425 
 426         /*
 427          * OR source pagedata with the destination, for pages of intersecting
 428          * mappings.
 429          */
 430         src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
 431         dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
 432         while (src_addr != NULL && dst_addr != NULL) {
 433                 while (src_addr == dst_addr && src_addr != NULL) {
 434                         *(char *)dst_cur.pr_pdaddr |=
 435                             *(char *)src_cur.pr_pdaddr;
 436                         src_addr = (uintptr_t)advance_prpageheader_cur(
 437                             &src_cur);
 438                         dst_addr = (uintptr_t)advance_prpageheader_cur(
 439                             &dst_cur);
 440                 }
 441                 if (src_addr != dst_addr)
 442                         mappings_changed = 1;
 443                 src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
 444                 dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
 445                 while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
 446                     NULL) {
 447                         mappings_changed = 1;
 448                         if (src_addr < dst_addr)
 449                                 src_addr = advance_prpageheader_cur_nextmapping(
 450                                     &src_cur);
 451                         else
 452                                 dst_addr = advance_prpageheader_cur_nextmapping(
 453                                     &dst_cur);
 454                 }
 455         }
 456 
 457         *mappings_changedp = mappings_changed;
 458 }
 459 
 460 /*
 461  * Merge the current pagedata with that on hand.  If the pagedata is
 462  * unretrievable for any reason, such as the process having exited or being a
 463  * zombie, a nonzero value is returned, the process should be marked
 464  * unscannable, and future attempts to scan it should be avoided, since the
 465  * symptom is probably permament.  If the mappings of either pagedata
 466  * differ in any respect, the supplied callback will be invoked once.
 467  */
 468 static int
 469 merge_current_pagedata(lprocess_t *lpc,
 470     void(*mappings_changed_cb) (lprocess_t *))
 471 {
 472         prpageheader_t *pghp;
 473         int mappings_changed = 0;
 474         uint64_t cnt;
 475 
 476         if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
 477             0) {
 478                 char pathbuf[PROC_PATH_MAX];
 479 
 480                 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
 481                     (int)lpc->lpc_pid);
 482                 if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
 483                     revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
 484                     get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
 485                         return (-1);
 486                 debug("starting/resuming pagedata collection for %d\n",
 487                     (int)lpc->lpc_pid);
 488         }
 489 
 490         cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
 491         if (cnt != 0 || lpc->lpc_rss != 0)
 492                 debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
 493                     (int)lpc->lpc_pid, (unsigned long long)cnt,
 494                     (unsigned long long)lpc->lpc_rss);
 495         if (lpc->lpc_prpageheader != NULL) {
 496                 /*
 497                  * OR the two snapshots.
 498                  */
 499 #ifdef DEBUG
 500                 lmapping_t *old = NULL;
 501                 lmapping_t *new = NULL;
 502 
 503                 mklmapping(&new, pghp);
 504                 mklmapping(&old, lpc->lpc_prpageheader);
 505 #endif /* DEBUG */
 506                 OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
 507 #ifdef DEBUG
 508                 if (((mappings_changed != 0) ^
 509                     (lmapping_dump_diff(old, new) != 0))) {
 510                         debug("lmapping_changed inconsistent with lmapping\n");
 511                         debug("old\n");
 512                         lmapping_dump(old);
 513                         debug("new\n");
 514                         lmapping_dump(new);
 515                         debug("ignored\n");
 516                         lmapping_dump(lpc->lpc_ignore);
 517                         ASSERT(0);
 518                 }
 519                 lmapping_free(&new);
 520                 lmapping_free(&old);
 521 #endif /* DEBUG */
 522                 free(lpc->lpc_prpageheader);
 523         } else
 524                 mappings_changed = 1;
 525         lpc->lpc_prpageheader = pghp;
 526 
 527         cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
 528         if (cnt != 0 || lpc->lpc_rss != 0)
 529                 debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
 530                     (int)lpc->lpc_pid, (unsigned long long)cnt,
 531                     (unsigned long long)lpc->lpc_rss);
 532         if (mappings_changed != 0) {
 533                 debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
 534                 if (mappings_changed_cb != NULL)
 535                         mappings_changed_cb(lpc);
 536         }
 537         return (0);
 538 }
 539 
 540 /*
 541  * Attempt to page out a region of the given process's address space.  May
 542  * return nonzero if not all of the pages may are pageable, for any reason.
 543  */
 544 static int
 545 pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
 546 {
 547         int res;
 548 
 549         if (end <= start)
 550                 return (0);
 551 
 552         errno = 0;
 553         res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
 554             (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0);
 555         debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);
 556 
 557         /*
 558          * EBUSY indicates none of the pages have backing store allocated, or
 559          * some pages were locked, which are less interesting than other
 560          * conditions, which are noted.
 561          */
 562         if (res != 0)
 563                 if (errno == EBUSY)
 564                         res = 0;
 565                 else
 566                         debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
 567                             (void *)start, (long long)(end - start), errno);
 568 
 569         return (res);
 570 }
 571 
 572 /*
 573  * Compute the delta of the victim process's RSS since the last call.  If the
 574  * psinfo cannot be obtained, no work is done, and no error is returned; it is
 575  * up to the caller to detect the process' termination via other means.
 576  */
 577 static int64_t
 578 rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
 579 {
 580         int64_t d_rss = 0;
 581 
 582         if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
 583             lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
 584                 d_rss = (int64_t)new_psinfo->pr_rssize -
 585                     (int64_t)old_psinfo->pr_rssize;
 586                 if (d_rss < 0)
 587                         vic->lpc_collection->lcol_stat.lcols_pg_eff +=
 588                             (- d_rss);
 589                 *old_psinfo = *new_psinfo;
 590         }
 591 
 592         return (d_rss);
 593 }
 594 
 595 static void
 596 unignore_mappings(lprocess_t *lpc)
 597 {
 598         lmapping_free(&lpc->lpc_ignore);
 599 }
 600 
 601 static void
 602 unignore_referenced_mappings(lprocess_t *lpc)
 603 {
 604         prpageheader_cur_t cur;
 605         void *vicaddr;
 606 
 607         vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
 608         while (vicaddr != NULL) {
 609                 if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
 610                     != 0) {
 611                         if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
 612                             cur.pr_npage * cur.pr_pagesize) == 0)
 613                                 debug("removed mapping 0x%p+0t%llukB from"
 614                                     " ignored set\n", (void *)cur.pr_addr,
 615                                     (unsigned long long)(cur.pr_npage *
 616                                     cur.pr_pagesize / 1024));
 617                         vicaddr = (void *)advance_prpageheader_cur_nextmapping(
 618                             &cur);
 619                 } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
 620                         vicaddr = (void *)advance_prpageheader_cur_nextmapping(
 621                             &cur);
 622         }
 623 }
 624 
 625 /*
 626  * Resume scanning, starting with the last victim, if it is still valid, or any
 627  * other one, otherwise.
 628  */
 629 void
 630 scan(lcollection_t *lcol, int64_t excess)
 631 {
 632         lprocess_t *vic, *lpc;
 633         void *vicaddr, *endaddr, *nvicaddr;
 634         prpageheader_cur_t cur;
 635         psinfo_t old_psinfo, new_psinfo;
 636         hrtime_t scan_start;
 637         int res, resumed;
 638         uint64_t col_unrm_size;
 639 
 640         st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
 641             (long long)excess);
 642 
 643         /*
 644          * Determine the address to start scanning at, depending on whether
 645          * scanning can be resumed.
 646          */
 647         endaddr = NULL;
 648         if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
 649             lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
 650                 vicaddr = lcol->lcol_resaddr;
 651                 st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
 652                     (int)vic->lpc_pid);
 653                 resumed = 1;
 654         } else {
 655                 vicaddr = NULL;
 656                 resumed = 0;
 657         }
 658 
 659         scan_start = gethrtime();
 660         /*
 661          * Obtain the most current pagedata for the processes that might be
 662          * scanned, and remove from the ignored set any mappings which have
 663          * referenced or modified pages (in the hopes that the pageability of
 664          * the mapping's pages may have changed).  Determine if the
 665          * unreferenced and unmodified portion is impossibly small to suffice
 666          * to reduce the excess completely.  If so, ignore these bits so that
 667          * even working set will be paged out.
 668          */
 669         col_unrm_size = 0;
 670         lpc = vic;
 671         while (lpc != NULL && should_run) {
 672                 if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
 673                         st_debug(STDL_NORMAL, lcol, "process %d:"
 674                             " exited/temporarily unscannable",
 675                             (int)lpc->lpc_pid);
 676                         goto next;
 677                 }
 678                 debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
 679                     (unsigned long long)(lpc->lpc_unrm = unrm_size(lpc)),
 680                     (unsigned long long)lpc->lpc_size);
 681                 col_unrm_size += lpc->lpc_unrm = unrm_size(lpc);
 682 
 683                 if ((lcol->lcol_stat.lcols_scan_count %
 684                     RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
 685                         /*
 686                          * Periodically clear the set of ignored mappings.
 687                          * This will allow processes whose ignored segments'
 688                          * pageability have changed (without a corresponding
 689                          * reference or modification to a page) to be
 690                          * recognized.
 691                          */
 692                         if (lcol->lcol_stat.lcols_scan_count > 0)
 693                                 unignore_mappings(lpc);
 694                 } else {
 695                         /*
 696                          * Ensure mappings with referenced or modified pages
 697                          * are not in the ignored set.  Their usage might mean
 698                          * the condition which made them unpageable is gone.
 699                          */
 700                         unignore_referenced_mappings(lpc);
 701                 }
 702 next:
 703                 lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
 704                     lpc->lpc_next) : NULL;
 705         }
 706         if (col_unrm_size < excess) {
 707                 lpc = vic;
 708                 debug("will not reduce excess with only unreferenced pages\n");
 709                 while (lpc != NULL && should_run) {
 710                         if (lpc->lpc_prpageheader != NULL) {
 711                                 (void) count_pages(lpc->lpc_prpageheader,
 712                                     CP_CLEAR, 0, 0);
 713                                 if (lpc->lpc_pgdata_fd >= 0) {
 714                                         if (rfd_close(lpc->lpc_pgdata_fd) != 0)
 715                                                 debug("coud not close %d"
 716                                                     " lpc_pgdata_fd %d",
 717                                                     (int)lpc->lpc_pid,
 718                                                     lpc->lpc_pgdata_fd);
 719                                         lpc->lpc_pgdata_fd = -1;
 720                                 }
 721                         }
 722                         lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
 723                             lpc->lpc_next) : NULL;
 724                 }
 725         }
 726 
 727         /*
 728          * Examine each process for pages to remove until the excess is
 729          * reduced.
 730          */
 731         while (vic != NULL && excess > 0 && should_run) {
 732                 /*
 733                  * Skip processes whose death was reported when the merging of
 734                  * pagedata was attempted.
 735                  */
 736                 if (vic->lpc_prpageheader == NULL)
 737                         goto nextproc;
 738 
 739                 /*
 740                  * Obtain optional segment residency information.
 741                  */
 742                 if (lpc_xmap_update(vic) != 0)
 743                         st_debug(STDL_NORMAL, lcol, "process %d: xmap"
 744                             " unreadable; ignoring", (int)vic->lpc_pid);
 745 
 746 #ifdef DEBUG_MSG
 747                 {
 748                         void *ovicaddr = vicaddr;
 749 #endif /* DEBUG_MSG */
 750                 vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
 751                     vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
 752 #ifdef DEBUG_MSG
 753                         st_debug(STDL_NORMAL, lcol, "trying to resume from"
 754                             " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
 755                 }
 756 #endif /* DEBUG_MSG */
 757 
 758                 /*
 759                  * Take control of the victim.
 760                  */
 761                 if (get_psinfo(vic->lpc_pid, &old_psinfo,
 762                     vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
 763                     vic, vic) != 0) {
 764                         st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
 765                             (int)vic->lpc_pid);
 766                         goto nextproc;
 767                 }
 768                 (void) rfd_reserve(PGRAB_FD_COUNT);
 769                 if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
 770                         st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
 771                             (int)vic->lpc_pid, res);
 772                         goto nextproc;
 773                 }
 774                 if (Pcreate_agent(scan_pr) != 0) {
 775                         st_debug(STDL_NORMAL, lcol, "cannot control %d",
 776                             (int)vic->lpc_pid);
 777                         goto nextproc;
 778                 }
 779                 /*
 780                  * Be very pessimistic about the state of the agent LWP --
 781                  * verify it's actually stopped.
 782                  */
 783                 errno = 0;
 784                 while (Pstate(scan_pr) == PS_RUN)
 785                         (void) Pwait(scan_pr, 0);
 786                 if (Pstate(scan_pr) != PS_STOP) {
 787                         st_debug(STDL_NORMAL, lcol, "agent not in expected"
 788                             " state (%d)", Pstate(scan_pr));
 789                         goto nextproc;
 790                 }
 791 
 792                 /*
 793                  * Within the victim's address space, find contiguous ranges of
 794                  * unreferenced pages to page out.
 795                  */
 796                 st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
 797                     (int)vic->lpc_pid);
 798                 while (excess > 0 && vicaddr != NULL && should_run) {
 799                         /*
 800                          * Skip mappings in the ignored set.  Mappings get
 801                          * placed in the ignored set when all their resident
 802                          * pages are unreferenced and unmodified, yet unpageable
 803                          * -- such as when they are locked, or involved in
 804                          * asynchronous I/O.  They will be scanned again when
 805                          * some page is referenced or modified.
 806                          */
 807                         if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
 808                             cur.pr_npage * cur.pr_pagesize)) {
 809                                 debug("ignored mapping at 0x%p\n",
 810                                     (void *)cur.pr_addr);
 811                                 /*
 812                                  * Update statistics.
 813                                  */
 814                                 lcol->lcol_stat.lcols_pg_att +=
 815                                     cur.pr_npage * cur.pr_pagesize / 1024;
 816 
 817                                 vicaddr = (void *)
 818                                     advance_prpageheader_cur_nextmapping(&cur);
 819                                 continue;
 820                         }
 821 
 822                         /*
 823                          * Determine a range of unreferenced pages to page out,
 824                          * and clear the R/M bits in the preceding referenced
 825                          * range.
 826                          */
 827                         st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
 828                             " npage %llu\n", vicaddr,
 829                             (unsigned long long)cur.pr_npage);
 830                         while (vicaddr != NULL &&
 831                             *(caddr_t)cur.pr_pdaddr != 0) {
 832                                 *(caddr_t)cur.pr_pdaddr = 0;
 833                                 vicaddr = advance_prpageheader_cur(&cur);
 834                         }
 835                         st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
 836                             " %p\n", vicaddr, cur.pr_pdaddr);
 837                         if (vicaddr == NULL) {
 838                                 /*
 839                                  * The end of mapping was reached before any
 840                                  * unreferenced pages were seen.
 841                                  */
 842                                 vicaddr = (void *)
 843                                     advance_prpageheader_cur_nextmapping(&cur);
 844                                 continue;
 845                         }
 846                         do
 847                                 endaddr = advance_prpageheader_cur(&cur);
 848                         while (endaddr != NULL &&
 849                             *(caddr_t)cur.pr_pdaddr == 0 &&
 850                             (((intptr_t)endaddr - (intptr_t)vicaddr) /
 851                                 1024) < excess);
 852                         st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
 853                             endaddr, *(caddr_t)cur.pr_pdaddr);
 854 
 855                         /*
 856                          * Page out from vicaddr to the end of the mapping, or
 857                          * endaddr if set, then continue scanning after
 858                          * endaddr, or the next mapping, if not set.
 859                          */
 860                         nvicaddr = endaddr;
 861                         if (endaddr == NULL)
 862                                 endaddr = (caddr_t)cur.pr_addr +
 863                                     cur.pr_pagesize * cur.pr_npage;
 864                         if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
 865                             0) {
 866                                 int64_t d_rss, att;
 867                                 int willignore = 0;
 868 
 869                                 excess += (d_rss = rss_delta(
 870                                     &new_psinfo, &old_psinfo, vic));
 871 
 872                                 /*
 873                                  * If this pageout attempt was unsuccessful
 874                                  * (the resident portion was not affected), and
 875                                  * was for the whole mapping, put it in the
 876                                  * ignored set, so it will not be scanned again
 877                                  * until some page is referenced or modified.
 878                                  */
 879                                 if (d_rss >= 0 && (void *)cur.pr_addr ==
 880                                     vicaddr && (cur.pr_pagesize * cur.pr_npage)
 881                                     == ((uintptr_t)endaddr -
 882                                     (uintptr_t)vicaddr)) {
 883                                         if (lmapping_insert(
 884                                             &vic->lpc_ignore,
 885                                             cur.pr_addr,
 886                                             cur.pr_pagesize *
 887                                             cur.pr_npage) != 0)
 888                                                 debug("not enough memory to add"
 889                                                     " mapping at %p to ignored"
 890                                                     " set\n",
 891                                                     (void *)cur.pr_addr);
 892                                         willignore = 1;
 893                                 }
 894 
 895                                 /*
 896                                  * Update statistics.
 897                                  */
 898                                 lcol->lcol_stat.lcols_pg_att += (att =
 899                                     ((intptr_t)endaddr - (intptr_t)vicaddr) /
 900                                     1024);
 901                                 st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
 902                                     "+0t(%llu/%llu)kB%s\n", vicaddr,
 903                                     (unsigned long long)((d_rss <
 904                                     0) ? - d_rss : 0), (unsigned long long)att,
 905                                     willignore ? " (will ignore)" : "");
 906                         } else {
 907                                 st_debug(STDL_NORMAL, lcol,
 908                                     "process %d: exited/unscannable\n",
 909                                     (int)vic->lpc_pid);
 910                                 vic->lpc_unscannable = 1;
 911                                 goto nextproc;
 912                         }
 913 
 914                         /*
 915                          * Update the statistics file, if it's time.
 916                          */
 917                         check_update_statistics();
 918 
 919                         vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
 920                             *)advance_prpageheader_cur_nextmapping(&cur);
 921                 }
 922                 excess += rss_delta(&new_psinfo, &old_psinfo, vic);
 923                 st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
 924                     (long long)excess);
 925 nextproc:
 926                 /*
 927                  * If a process was grabbed, release it, destroying its agent.
 928                  */
 929                 if (scan_pr != NULL) {
 930                         (void) Prelease(scan_pr, 0);
 931                         scan_pr = NULL;
 932                 }
 933                 lcol->lcol_victim = vic;
 934                 /*
 935                  * Scan the collection at most once.  Only if scanning was not
 936                  * aborted for any reason, and the end of lprocess has not been
 937                  * reached, determine the next victim and scan it.
 938                  */
 939                 if (vic != NULL) {
 940                         if (vic->lpc_next != NULL) {
 941                                 /*
 942                                  * Determine the next process to be scanned.
 943                                  */
 944                                 if (excess > 0) {
 945                                         vic = get_valid_victim(lcol,
 946                                             vic->lpc_next);
 947                                         vicaddr = 0;
 948                                 }
 949                         } else {
 950                                 /*
 951                                  * A complete scan of the collection was made,
 952                                  * so tick the scan counter and stop scanning
 953                                  * until the next request.
 954                                  */
 955                                 lcol->lcol_stat.lcols_scan_count++;
 956                                 lcol->lcol_stat.lcols_scan_time_complete
 957                                     = lcol->lcol_stat.lcols_scan_time;
 958                                 /*
 959                                  * If an excess still exists, tick the
 960                                  * "ineffective scan" counter, signalling that
 961                                  * the cap may be unenforceable.
 962                                  */
 963                                 if (resumed == 0 && excess > 0)
 964                                         lcol->lcol_stat
 965                                             .lcols_scan_ineffective++;
 966                                 /*
 967                                  * Scanning should start at the beginning of
 968                                  * the process list at the next request.
 969                                  */
 970                                 if (excess > 0)
 971                                         vic = NULL;
 972                         }
 973                 }
 974         }
 975         lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
 976         st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
 977             (long long)excess);
 978 
 979         lcol->lcol_resaddr = vicaddr;
 980         if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
 981                 lcol->lcol_victim = get_valid_victim(lcol,
 982                     lcol->lcol_victim->lpc_next);
 983         }
 984 }
 985 
 986 /*
 987  * Abort the scan in progress, and destroy the agent LWP of any grabbed
 988  * processes.
 989  */
 990 void
 991 scan_abort(void)
 992 {
 993         if (scan_pr != NULL)
 994                 (void) Prelease(scan_pr, NULL);
 995 }
 996 
 997 static void
 998 revoke_xmap(rfd_t *rfd)
 999 {
1000         lprocess_t *lpc = rfd->rfd_data;
1001 
1002         debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
1003         ASSERT(lpc->lpc_xmap_fd != -1);
1004         lpc->lpc_xmap_fd = -1;
1005 }
1006 
1007 /*
1008  * Retrieve the process's current xmap , which is used to determine the size of
1009  * the resident portion of its segments.  Return zero if successful.
1010  */
1011 static int
1012 lpc_xmap_update(lprocess_t *lpc)
1013 {
1014         int res;
1015         struct stat st;
1016 
1017         free(lpc->lpc_xmap);
1018         lpc->lpc_xmap = NULL;
1019         lpc->lpc_nxmap = -1;
1020 
1021         if (lpc->lpc_xmap_fd == -1) {
1022                 char pathbuf[PROC_PATH_MAX];
1023 
1024                 (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
1025                     (int)lpc->lpc_pid);
1026                 if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
1027                     revoke_xmap, lpc, O_RDONLY, 0)) < 0)
1028                         return (-1);
1029         }
1030 
1031 redo:
1032         errno = 0;
1033         if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
1034                 debug("cannot stat xmap\n");
1035                 (void) rfd_close(lpc->lpc_xmap_fd);
1036                 lpc->lpc_xmap_fd = -1;
1037                 return (-1);
1038         }
1039 
1040         if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
1041                 debug("xmap wrong size\n");
1042                 (void) rfd_close(lpc->lpc_xmap_fd);
1043                 lpc->lpc_xmap_fd = -1;
1044                 return (-1);
1045         }
1046 
1047         lpc->lpc_xmap = malloc(st.st_size);
1048         if (lpc->lpc_xmap == NULL) {
1049                 debug("cannot malloc() %ld bytes for xmap", st.st_size);
1050                 (void) rfd_close(lpc->lpc_xmap_fd);
1051                 lpc->lpc_xmap_fd = -1;
1052                 return (-1);
1053         }
1054 
1055         if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
1056             st.st_size) {
1057                 free(lpc->lpc_xmap);
1058                 lpc->lpc_xmap = NULL;
1059                 if (res > 0) {
1060                         debug("xmap changed size, retrying\n");
1061                         goto redo;
1062                 } else {
1063                         debug("cannot read xmap");
1064                         return (-1);
1065                 }
1066         }
1067         lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);
1068 
1069         return (0);
1070 }