/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Joyent, Inc.  All rights reserved.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/mman.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libproc.h>
#include <limits.h>
#include <procfs.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>
#include "rcapd.h"
#include "rcapd_rfd.h"
#include "rcapd_mapping.h"
#include "utils.h"

static int lpc_xmap_update(lprocess_t *);
#ifdef DEBUG
extern int lmapping_dump_diff(lmapping_t *lm1, lmapping_t *lm2);
#endif /* DEBUG */

/*
 * The number of file descriptors required to grab a process and create an
 * agent in it.
 */
#define PGRAB_FD_COUNT          10

/*
 * Record a position in an address space as it corresponds to a prpageheader_t
 * and affiliated structures.
 */
typedef struct prpageheader_cur {
        int pr_nmap;            /* number of mappings in address space */
        int pr_map;             /* number of this mapping */
        uint64_t pr_pgoff;      /* page offset into mapping */
        uint64_t pr_npage;      /* number of pages in mapping */
        uint64_t pr_pagesize;   /* page size of mapping */
        uintptr_t pr_addr;      /* base of mapping */
        prpageheader_t *pr_prpageheader;        /* associated page header */
        void *pr_pdaddr;        /* address of page's byte in pagedata */
        prxmap_t *pr_xmap;      /* array containing per-segment information */
        int pr_nxmap;           /* number of xmaps in array */
        int64_t pr_rss;         /* number of resident pages in mapping, */
                                /* or -1 if xmap is out of sync */
        int64_t pr_pg_rss;      /* number of pageable pages in mapping, or -1 */
} prpageheader_cur_t;

static struct ps_prochandle *scan_pr;   /* currently-scanned process's handle */

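/*
 * Verbosity levels for scanner debug messages; STDL_HIGH messages are only
 * emitted at the higher debugging priority.
 */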
typedef enum {
        STDL_NORMAL,
        STDL_HIGH
} st_debug_level_t;

/*
 * Output a scanning-related debug message.
 */
/*PRINTFLIKE3*/ /*ARGSUSED*/
static void
st_debug(st_debug_level_t level, lcollection_t *lcol, char *msg, ...)
{
#ifdef DEBUG_MSG
        va_list alist;
        char *buf;
        size_t len;

        if (get_message_priority() < ((level == STDL_HIGH) ? RCM_DEBUG_HIGH
            : RCM_DEBUG))
                return;

        len = strlen(msg) + LINELEN;
        buf = malloc(len);
        if (buf == NULL)
                return;
        (void) snprintf(buf, len, "%s %s scanner %s",
            (lcol->lcol_id.rcid_type == RCIDT_PROJECT ? "project" : "zone"),
            lcol->lcol_name, msg);

        va_start(alist, msg);
        vdprintfe(RCM_DEBUG, buf, alist);
        va_end(alist);

        free(buf);
#endif /* DEBUG_MSG */
}

/*
 * Determine the collection's current victim, based on its last victim.  The
 * last victim is returned if it is still a valid member of the collection;
 * otherwise, any other valid (scannable) process is returned, if the
 * collection has any.
 */
static lprocess_t *
get_valid_victim(lcollection_t *lcol, lprocess_t *lpc)
{
        if (lpc == NULL || !lcollection_member(lcol, lpc))
                lpc = lcol->lcol_lprocess;

        /*
         * Find the next scannable process, and make it the victim.
         */
        while (lpc != NULL && lpc->lpc_unscannable != 0)
                lpc = lpc->lpc_next;

        return (lpc);
}

/*
 * Get a process's combined current pagedata (per-page referenced and modified
 * bits) and set the supplied pointer to it.  The caller is responsible for
 * freeing the data.  If the pagedata is unreadable, a nonzero value is
 * returned, and errno is set.  Otherwise, 0 is returned.
 */
static int
get_pagedata(prpageheader_t **pghpp, int fd)
{
        int res;
        struct stat st;

redo:
        errno = 0;
        if (fstat(fd, &st) != 0) {
                debug("cannot stat pagedata\n");
                return (-1);
        }

        errno = 0;
        *pghpp = malloc(st.st_size);
        if (*pghpp == NULL) {
                debug("cannot malloc() %ld bytes for pagedata", st.st_size);
                return (-1);
        }
        (void) bzero(*pghpp, st.st_size);

        errno = 0;
        if ((res = read(fd, *pghpp, st.st_size)) != st.st_size) {
                free(*pghpp);
                *pghpp = NULL;
                if (res > 0 || errno == E2BIG) {
                        debug("pagedata changed size, retrying\n");
                        goto redo;
                } else {
                        debug("cannot read pagedata");
                        return (-1);
                }
        }

        return (0);
}

/*
 * Return the number of kilobytes represented by those pages in the given
 * pagedata which meet the given criteria:  pages which are in all of the
 * states specified by mask, and in none of the states in notmask.  If the
 * CP_CLEAR flag is set, the pagedata will also be cleared.
 */
#define CP_CLEAR        1
static uint64_t
count_pages(prpageheader_t *pghp, int flags, int mask, int notmask)
{
        int map;
        caddr_t cur, end;
        prpageheader_t pgh = *pghp;
        prasmap_t *asmapp;
        uint64_t count = 0;

        cur = (caddr_t)pghp + sizeof (*pghp);
        for (map = 0; map < pgh.pr_nmap; map++) {
                asmapp = (prasmap_t *)(uintptr_t)cur;
                cur += sizeof (*asmapp);
                end = cur + asmapp->pr_npage;
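                /*
                 * Each page in the mapping is described by one byte of
                 * pagedata holding its referenced/modified flags.
                 */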
                while (cur < end) {
                        if ((*cur & mask) == mask && (*cur & notmask) == 0)
                                count += asmapp->pr_pagesize / 1024;
                        if ((flags & CP_CLEAR) != 0)
                                *cur = 0;
                        cur++;
                }

                /*
                 * Skip to next 64-bit-aligned address to get the next
                 * prasmap_t.
                 */
                cur = (caddr_t)((intptr_t)(cur + 7) & ~7);
        }

        return (count);
}

/*
 * Return the amount of memory (in kilobytes) that hasn't been referenced or
 * modified, which is the memory that will be paged out first.  Should be
 * written to exclude nonresident pages when sufficient interfaces exist.
 */
static uint64_t
unrm_size(lprocess_t *lpc)
{
        return (count_pages(lpc->lpc_prpageheader, CP_CLEAR,
            0, PG_MODIFIED | PG_REFERENCED));
}

/*
 * Advance a prpageheader_cur_t to the address space's next mapping, returning
 * its address, or NULL if there is none.  Any known nonpageable or nonresident
 * mappings will be skipped over.
 */
static uintptr_t
advance_prpageheader_cur_nextmapping(prpageheader_cur_t *pcp)
{
        prasmap_t *pap;
        int i;

next:
        ASSERT(pcp->pr_map < pcp->pr_nmap);
        if ((pcp->pr_map + 1) == pcp->pr_nmap)
                return (NULL);
        pcp->pr_map++;
        if (pcp->pr_pgoff < pcp->pr_npage) {
                pcp->pr_pdaddr = (caddr_t)(uintptr_t)
                    ((uintptr_t)pcp->pr_pdaddr +
                    (pcp->pr_npage - pcp->pr_pgoff));
                pcp->pr_pgoff = pcp->pr_npage;
        }
        /*
         * Skip to next 64-bit-aligned address to get the next prasmap_t.
         */
        pcp->pr_pdaddr = (caddr_t)(((uintptr_t)pcp->pr_pdaddr + 7) & ~7);
        pap = (prasmap_t *)pcp->pr_pdaddr;
        pcp->pr_pgoff = 0;
        pcp->pr_npage = pap->pr_npage;
        pcp->pr_pagesize = pap->pr_pagesize;
        pcp->pr_addr = pap->pr_vaddr;
        pcp->pr_pdaddr = pap + 1;

        /*
         * Skip any known nonpageable mappings.  Currently, the only one
         * detected is the schedctl page.
         */
        if ((pap->pr_mflags ^ (MA_SHARED | MA_READ | MA_WRITE | MA_EXEC |
            MA_ANON)) == 0 && pap->pr_npage == 1) {
                debug("identified nonpageable schedctl mapping at %p\n",
                    (void *)pcp->pr_addr);
                goto next;
        }

        /*
         * Skip mappings with no resident pages.  If the xmap does not
         * correspond to the pagedata for any reason, it will be ignored.
         */
        pcp->pr_rss = -1;
        pcp->pr_pg_rss = -1;
        for (i = 0; i < pcp->pr_nxmap; i++) {
                prxmap_t *xmap = &pcp->pr_xmap[i];

                if (pcp->pr_addr == xmap->pr_vaddr && xmap->pr_size ==
                    (pcp->pr_npage * pcp->pr_pagesize)) {
                        pcp->pr_rss = xmap->pr_rss;
                        /*
                         * Remove COW pages from the pageable RSS count.
                         */
                        if ((xmap->pr_mflags & MA_SHARED) == 0)
                                pcp->pr_pg_rss = xmap->pr_anon;
                        break;
                }
        }
        if (pcp->pr_rss == 0) {
                debug("identified nonresident mapping at 0x%p\n",
                    (void *)pcp->pr_addr);
                goto next;
        } else if (pcp->pr_pg_rss == 0) {
                debug("identified unpageable mapping at 0x%p\n",
                    (void *)pcp->pr_addr);
                goto next;
        }

        return (pcp->pr_addr);
}

/*
 * Advance a prpageheader_cur_t to the mapping's next page, returning its
 * address, or NULL if there is none.
 */
static void *
advance_prpageheader_cur(prpageheader_cur_t *pcp)
{
        ASSERT(pcp->pr_pgoff < pcp->pr_npage);
        if ((pcp->pr_pgoff + 1) == pcp->pr_npage)
                return (NULL);
        pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + 1;
        pcp->pr_pgoff++;

        ASSERT((*(char *)pcp->pr_pdaddr & ~(PG_MODIFIED | PG_REFERENCED)) == 0);
        return ((caddr_t)pcp->pr_addr + pcp->pr_pgoff * pcp->pr_pagesize);
}

/*
 * Initialize a prpageheader_cur_t, positioned at the first page of the mapping
 * of an address space.
 */
static void *
set_prpageheader_cur(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap)
{
        bzero(pcp, sizeof (*pcp));
        pcp->pr_nmap = php->pr_nmap;
        pcp->pr_map = -1;
        pcp->pr_prpageheader = php;
        pcp->pr_xmap = xmap;
        pcp->pr_nxmap = nxmap;
        pcp->pr_pdaddr = (prpageheader_t *)php + 1;

        return ((void *)advance_prpageheader_cur_nextmapping(pcp));
}

/*
 * Position a prpageheader_cur_t to the mapped address greater or equal to the
 * given value.
 */
static void *
set_prpageheader_cur_addr(prpageheader_cur_t *pcp, prpageheader_t *php,
    prxmap_t *xmap, int nxmap, void *naddr)
{
        void *addr = set_prpageheader_cur(pcp, php, xmap, nxmap);

        while (addr != NULL && addr <= naddr)
                if (naddr < (void *)((caddr_t)pcp->pr_addr +
                    pcp->pr_pagesize * pcp->pr_npage)) {
                        uint64_t pgdiff = ((uintptr_t)naddr -
                            (uintptr_t)pcp->pr_addr) / pcp->pr_pagesize;
                        pcp->pr_pgoff += pgdiff;
                        pcp->pr_pdaddr = (caddr_t)pcp->pr_pdaddr + pgdiff;
                        addr = (caddr_t)pcp->pr_addr + pcp->pr_pagesize *
                            pcp->pr_pgoff;
                        break;
                } else
                        addr =
                            (void *)advance_prpageheader_cur_nextmapping(pcp);

        return (addr);
}

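/*
 * Revocation callback for a process's pagedata file descriptor:  forget the
 * cached descriptor so that merge_current_pagedata() will reopen it on the
 * next scan.
 */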
static void
revoke_pagedata(rfd_t *rfd)
{
        lprocess_t *lpc = rfd->rfd_data;

        st_debug(STDL_NORMAL, lpc->lpc_collection, "revoking pagedata for"
            " process %d\n", (int)lpc->lpc_pid);
        ASSERT(lpc->lpc_pgdata_fd != -1);
        lpc->lpc_pgdata_fd = -1;
}

#ifdef DEBUG
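/*
 * Build an lmapping_t list describing each mapping in the given pagedata
 * snapshot (DEBUG only).
 */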
static void
mklmapping(lmapping_t **lm, prpageheader_t *pgh)
{
        prpageheader_cur_t cur;
        void *addr;

        addr = set_prpageheader_cur(&cur, pgh, NULL, -1);
        ASSERT(*lm == NULL);
        while (addr != NULL) {
                (void) lmapping_insert(lm, cur.pr_addr, cur.pr_npage *
                    cur.pr_pagesize);
                addr = (void *)advance_prpageheader_cur_nextmapping(&cur);
        }
}

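/*
 * Dump an lmapping_t list with debug() (DEBUG only).
 */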
static void
lmapping_dump(lmapping_t *lm)
{
        debug("lm: %p\n", (void *)lm);
        while (lm != NULL) {
                debug("\t(%p, %llx)\n", (void *)lm->lm_addr,
                    (unsigned long long)lm->lm_size);
                lm = lm->lm_next;
        }
}
#endif /* DEBUG */

/*
 * OR two sets of pagedata (prpageheader_t) which are supposedly snapshots of
 * the same address space.  Intersecting mappings with different page sizes
 * are tolerated but not normalized (not accurate).  If the mappings of the
 * two snapshots differ in any regard, the supplied mappings_changed flag will
 * be set.
 */
static void
OR_pagedata(prpageheader_t *src, prpageheader_t *dst, int *mappings_changedp)
{
        prpageheader_cur_t src_cur;
        prpageheader_cur_t dst_cur;
        uintptr_t src_addr;
        uintptr_t dst_addr;
        int mappings_changed = 0;

        /*
         * OR source pagedata with the destination, for pages of intersecting
         * mappings.
         */
        src_addr = (uintptr_t)set_prpageheader_cur(&src_cur, src, NULL, -1);
        dst_addr = (uintptr_t)set_prpageheader_cur(&dst_cur, dst, NULL, -1);
        while (src_addr != NULL && dst_addr != NULL) {
                while (src_addr == dst_addr && src_addr != NULL) {
                        *(char *)dst_cur.pr_pdaddr |=
                            *(char *)src_cur.pr_pdaddr;
                        src_addr = (uintptr_t)advance_prpageheader_cur(
                            &src_cur);
                        dst_addr = (uintptr_t)advance_prpageheader_cur(
                            &dst_cur);
                }
                if (src_addr != dst_addr)
                        mappings_changed = 1;
                src_addr = advance_prpageheader_cur_nextmapping(&src_cur);
                dst_addr = advance_prpageheader_cur_nextmapping(&dst_cur);
                while (src_addr != dst_addr && src_addr != NULL && dst_addr !=
                    NULL) {
                        mappings_changed = 1;
                        if (src_addr < dst_addr)
                                src_addr = advance_prpageheader_cur_nextmapping(
                                    &src_cur);
                        else
                                dst_addr = advance_prpageheader_cur_nextmapping(
                                    &dst_cur);
                }
        }

        *mappings_changedp = mappings_changed;
}

/*
 * Merge the current pagedata with that on hand.  If the pagedata is
 * unretrievable for any reason, such as the process having exited or being a
 * zombie, a nonzero value is returned, the process should be marked
 * unscannable, and future attempts to scan it should be avoided, since the
 * symptom is probably permanent.  If the mappings of the two sets of pagedata
 * differ in any respect, the supplied callback will be invoked once.
 */
static int
merge_current_pagedata(lprocess_t *lpc,
    void(*mappings_changed_cb) (lprocess_t *))
{
        prpageheader_t *pghp;
        int mappings_changed = 0;
        uint64_t cnt;

        if (lpc->lpc_pgdata_fd < 0 || get_pagedata(&pghp, lpc->lpc_pgdata_fd) !=
            0) {
                char pathbuf[PROC_PATH_MAX];

                (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/pagedata",
                    (int)lpc->lpc_pid);
                if ((lpc->lpc_pgdata_fd = rfd_open(pathbuf, 1, RFD_PAGEDATA,
                    revoke_pagedata, lpc, O_RDONLY, 0)) < 0 ||
                    get_pagedata(&pghp, lpc->lpc_pgdata_fd) != 0)
                        return (-1);
                debug("starting/resuming pagedata collection for %d\n",
                    (int)lpc->lpc_pid);
        }

        cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
        if (cnt != 0 || lpc->lpc_rss != 0)
                debug("process %d: %llu/%llukB rfd/mdfd since last read\n",
                    (int)lpc->lpc_pid, (unsigned long long)cnt,
                    (unsigned long long)lpc->lpc_rss);
        if (lpc->lpc_prpageheader != NULL) {
                /*
                 * OR the two snapshots.
                 */
#ifdef DEBUG
                lmapping_t *old = NULL;
                lmapping_t *new = NULL;

                mklmapping(&new, pghp);
                mklmapping(&old, lpc->lpc_prpageheader);
#endif /* DEBUG */
                OR_pagedata(lpc->lpc_prpageheader, pghp, &mappings_changed);
#ifdef DEBUG
                if (((mappings_changed != 0) ^
                    (lmapping_dump_diff(old, new) != 0))) {
                        debug("lmapping_changed inconsistent with lmapping\n");
                        debug("old\n");
                        lmapping_dump(old);
                        debug("new\n");
                        lmapping_dump(new);
                        debug("ignored\n");
                        lmapping_dump(lpc->lpc_ignore);
                        ASSERT(0);
                }
                lmapping_free(&new);
                lmapping_free(&old);
#endif /* DEBUG */
                free(lpc->lpc_prpageheader);
        } else
                mappings_changed = 1;
        lpc->lpc_prpageheader = pghp;

        cnt = count_pages(pghp, 0, PG_MODIFIED | PG_REFERENCED, 0);
        if (cnt != 0 || lpc->lpc_rss != 0)
                debug("process %d: %llu/%llukB rfd/mdfd since hand swept\n",
                    (int)lpc->lpc_pid, (unsigned long long)cnt,
                    (unsigned long long)lpc->lpc_rss);
        if (mappings_changed != 0) {
                debug("process %d: mappings changed\n", (int)lpc->lpc_pid);
                if (mappings_changed_cb != NULL)
                        mappings_changed_cb(lpc);
        }
        return (0);
}

/*
 * Attempt to page out a region of the given process's address space.  May
 * return nonzero if not all of the pages are pageable, for any reason.
 */
static int
pageout(pid_t pid, struct ps_prochandle *Pr, caddr_t start, caddr_t end)
{
        int res;

        if (end <= start)
                return (0);

        errno = 0;
        res = pr_memcntl(Pr, start, (end - start), MC_SYNC,
            (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0);
        debug_high("pr_memcntl [%p-%p): %d", (void *)start, (void *)end, res);

        /*
         * EBUSY indicates none of the pages have backing store allocated, or
         * some pages were locked, which are less interesting than other
         * conditions, which are noted.
         */
        if (res != 0)
                if (errno == EBUSY)
                        res = 0;
                else
                        debug("%d: can't pageout %p+%llx (errno %d)", (int)pid,
                            (void *)start, (long long)(end - start), errno);

        return (res);
}

/*
 * Compute the delta of the victim process's RSS since the last call.  If the
 * psinfo cannot be obtained, no work is done, and no error is returned; it is
 * up to the caller to detect the process' termination via other means.
 */
static int64_t
rss_delta(psinfo_t *new_psinfo, psinfo_t *old_psinfo, lprocess_t *vic)
{
        int64_t d_rss = 0;

        if (get_psinfo(vic->lpc_pid, new_psinfo, vic->lpc_psinfo_fd,
            lprocess_update_psinfo_fd_cb, vic, vic) == 0) {
                d_rss = (int64_t)new_psinfo->pr_rssize -
                    (int64_t)old_psinfo->pr_rssize;
                if (d_rss < 0)
                        vic->lpc_collection->lcol_stat.lcols_pg_eff +=
                            (- d_rss);
                *old_psinfo = *new_psinfo;
        }

        return (d_rss);
}

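/*
 * Discard the process's entire set of ignored mappings, so that they will be
 * considered again on the next scan.
 */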
static void
unignore_mappings(lprocess_t *lpc)
{
        lmapping_free(&lpc->lpc_ignore);
}

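/*
 * Remove from the ignored set any mappings with referenced or modified pages,
 * since the condition which made them unpageable may no longer hold.
 */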
static void
unignore_referenced_mappings(lprocess_t *lpc)
{
        prpageheader_cur_t cur;
        void *vicaddr;

        vicaddr = set_prpageheader_cur(&cur, lpc->lpc_prpageheader, NULL, -1);
        while (vicaddr != NULL) {
                if (((*(char *)cur.pr_pdaddr) & (PG_REFERENCED | PG_MODIFIED))
                    != 0) {
                        if (lmapping_remove(&lpc->lpc_ignore, cur.pr_addr,
                            cur.pr_npage * cur.pr_pagesize) == 0)
                                debug("removed mapping 0x%p+0t%llukB from"
                                    " ignored set\n", (void *)cur.pr_addr,
                                    (unsigned long long)(cur.pr_npage *
                                    cur.pr_pagesize / 1024));
                        vicaddr = (void *)advance_prpageheader_cur_nextmapping(
                            &cur);
                } else if ((vicaddr = advance_prpageheader_cur(&cur)) == NULL)
                        vicaddr = (void *)advance_prpageheader_cur_nextmapping(
                            &cur);
        }
}

/*
 * Resume scanning, starting with the last victim, if it is still valid, or any
 * other one, otherwise.
 */
void
scan(lcollection_t *lcol, int64_t excess)
{
        lprocess_t *vic, *lpc;
        void *vicaddr, *endaddr, *nvicaddr;
        prpageheader_cur_t cur;
        psinfo_t old_psinfo, new_psinfo;
        hrtime_t scan_start;
        int res, resumed;
        uint64_t col_unrm_size;

        st_debug(STDL_NORMAL, lcol, "starting to scan, excess %lldk\n",
            (long long)excess);

        /*
         * Determine the address to start scanning at, depending on whether
         * scanning can be resumed.
         */
        endaddr = NULL;
        if ((vic = get_valid_victim(lcol, lcol->lcol_victim)) ==
            lcol->lcol_victim && lcol->lcol_resaddr != NULL) {
                vicaddr = lcol->lcol_resaddr;
                st_debug(STDL_NORMAL, lcol, "resuming process %d\n",
                    (int)vic->lpc_pid);
                resumed = 1;
        } else {
                vicaddr = NULL;
                resumed = 0;
        }

        scan_start = gethrtime();
        /*
         * Obtain the most current pagedata for the processes that might be
         * scanned, and remove from the ignored set any mappings which have
         * referenced or modified pages (in the hopes that the pageability of
         * the mapping's pages may have changed).  Determine if the
         * unreferenced and unmodified portion is too small to reduce the
         * excess completely.  If so, ignore these bits so that even the
         * working set will be paged out.
         */
        col_unrm_size = 0;
        lpc = vic;
        while (lpc != NULL && should_run) {
                if (merge_current_pagedata(lpc, unignore_mappings) != 0) {
                        st_debug(STDL_NORMAL, lcol, "process %d:"
                            " exited/temporarily unscannable",
                            (int)lpc->lpc_pid);
                        goto next;
                }
                lpc->lpc_unrm = unrm_size(lpc);
                col_unrm_size += lpc->lpc_unrm;
                debug("process %d: %llu/%llukB scannable\n", (int)lpc->lpc_pid,
                    (unsigned long long)lpc->lpc_unrm,
                    (unsigned long long)lpc->lpc_size);

                if ((lcol->lcol_stat.lcols_scan_count %
                    RCAPD_IGNORED_SET_FLUSH_IVAL) == 0) {
                        /*
                         * Periodically clear the set of ignored mappings.
                         * This will allow processes whose ignored segments'
                         * pageability have changed (without a corresponding
                         * reference or modification to a page) to be
                         * recognized.
                         */
                        if (lcol->lcol_stat.lcols_scan_count > 0)
                                unignore_mappings(lpc);
                } else {
                        /*
                         * Ensure mappings with referenced or modified pages
                         * are not in the ignored set.  Their usage might mean
                         * the condition which made them unpageable is gone.
                         */
                        unignore_referenced_mappings(lpc);
                }
next:
                lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
                    lpc->lpc_next) : NULL;
        }
        if (col_unrm_size < excess) {
                lpc = vic;
                debug("will not reduce excess with only unreferenced pages\n");
                while (lpc != NULL && should_run) {
                        if (lpc->lpc_prpageheader != NULL) {
                                (void) count_pages(lpc->lpc_prpageheader,
                                    CP_CLEAR, 0, 0);
                                if (lpc->lpc_pgdata_fd >= 0) {
                                        if (rfd_close(lpc->lpc_pgdata_fd) != 0)
                                                debug("could not close %d"
                                                    " lpc_pgdata_fd %d",
                                                    (int)lpc->lpc_pid,
                                                    lpc->lpc_pgdata_fd);
                                        lpc->lpc_pgdata_fd = -1;
                                }
                        }
                        lpc = lpc->lpc_next != NULL ? get_valid_victim(lcol,
                            lpc->lpc_next) : NULL;
                }
        }

        /*
         * Examine each process for pages to remove until the excess is
         * reduced.
         */
        while (vic != NULL && excess > 0 && should_run) {
                /*
                 * Skip processes whose death was reported when the merging of
                 * pagedata was attempted.
                 */
                if (vic->lpc_prpageheader == NULL)
                        goto nextproc;

                /*
                 * Obtain optional segment residency information.
                 */
                if (lpc_xmap_update(vic) != 0)
                        st_debug(STDL_NORMAL, lcol, "process %d: xmap"
                            " unreadable; ignoring", (int)vic->lpc_pid);

#ifdef DEBUG_MSG
                {
                        void *ovicaddr = vicaddr;
#endif /* DEBUG_MSG */
                vicaddr = set_prpageheader_cur_addr(&cur, vic->lpc_prpageheader,
                    vic->lpc_xmap, vic->lpc_nxmap, vicaddr);
#ifdef DEBUG_MSG
                        st_debug(STDL_NORMAL, lcol, "trying to resume from"
                            " 0x%p, next 0x%p\n", ovicaddr, vicaddr);
                }
#endif /* DEBUG_MSG */

                /*
                 * Take control of the victim.
                 */
                if (get_psinfo(vic->lpc_pid, &old_psinfo,
                    vic->lpc_psinfo_fd, lprocess_update_psinfo_fd_cb,
                    vic, vic) != 0) {
                        st_debug(STDL_NORMAL, lcol, "cannot get %d psinfo",
                            (int)vic->lpc_pid);
                        goto nextproc;
                }
                (void) rfd_reserve(PGRAB_FD_COUNT);
                if ((scan_pr = Pgrab(vic->lpc_pid, 0, &res)) == NULL) {
                        st_debug(STDL_NORMAL, lcol, "cannot grab %d (%d)",
                            (int)vic->lpc_pid, res);
                        goto nextproc;
                }
                if (Pcreate_agent(scan_pr) != 0) {
                        st_debug(STDL_NORMAL, lcol, "cannot control %d",
                            (int)vic->lpc_pid);
                        goto nextproc;
                }
                /*
                 * Be very pessimistic about the state of the agent LWP --
                 * verify it's actually stopped.
                 */
                errno = 0;
                while (Pstate(scan_pr) == PS_RUN)
                        (void) Pwait(scan_pr, 0);
                if (Pstate(scan_pr) != PS_STOP) {
                        st_debug(STDL_NORMAL, lcol, "agent not in expected"
                            " state (%d)", Pstate(scan_pr));
                        goto nextproc;
                }

                /*
                 * Within the victim's address space, find contiguous ranges of
                 * unreferenced pages to page out.
                 */
                st_debug(STDL_NORMAL, lcol, "paging out process %d\n",
                    (int)vic->lpc_pid);
                while (excess > 0 && vicaddr != NULL && should_run) {
                        /*
                         * Skip mappings in the ignored set.  Mappings get
                         * placed in the ignored set when all their resident
                         * pages are unreferenced and unmodified, yet unpageable
                         * -- such as when they are locked, or involved in
                         * asynchronous I/O.  They will be scanned again when
                         * some page is referenced or modified.
                         */
                        if (lmapping_contains(vic->lpc_ignore, cur.pr_addr,
                            cur.pr_npage * cur.pr_pagesize)) {
                                debug("ignored mapping at 0x%p\n",
                                    (void *)cur.pr_addr);
                                /*
                                 * Update statistics.
                                 */
                                lcol->lcol_stat.lcols_pg_att +=
                                    cur.pr_npage * cur.pr_pagesize / 1024;

                                vicaddr = (void *)
                                    advance_prpageheader_cur_nextmapping(&cur);
                                continue;
                        }

                        /*
                         * Determine a range of unreferenced pages to page out,
                         * and clear the R/M bits in the preceding referenced
                         * range.
                         */
                        st_debug(STDL_HIGH, lcol, "start from mapping at 0x%p,"
                            " npage %llu\n", vicaddr,
                            (unsigned long long)cur.pr_npage);
                        while (vicaddr != NULL &&
                            *(caddr_t)cur.pr_pdaddr != 0) {
                                *(caddr_t)cur.pr_pdaddr = 0;
                                vicaddr = advance_prpageheader_cur(&cur);
                        }
                        st_debug(STDL_HIGH, lcol, "advance, vicaddr %p, pdaddr"
                            " %p\n", vicaddr, cur.pr_pdaddr);
                        if (vicaddr == NULL) {
                                /*
                                 * The end of mapping was reached before any
                                 * unreferenced pages were seen.
                                 */
                                vicaddr = (void *)
                                    advance_prpageheader_cur_nextmapping(&cur);
                                continue;
                        }
                        do
                                endaddr = advance_prpageheader_cur(&cur);
                        while (endaddr != NULL &&
                            *(caddr_t)cur.pr_pdaddr == 0 &&
                            (((intptr_t)endaddr - (intptr_t)vicaddr) /
                                1024) < excess);
                        st_debug(STDL_HIGH, lcol, "endaddr %p, *cur %d\n",
                            endaddr, *(caddr_t)cur.pr_pdaddr);

                        /*
                         * Page out from vicaddr to the end of the mapping, or
                         * endaddr if set, then continue scanning after
                         * endaddr, or the next mapping, if not set.
                         */
                        nvicaddr = endaddr;
                        if (endaddr == NULL)
                                endaddr = (caddr_t)cur.pr_addr +
                                    cur.pr_pagesize * cur.pr_npage;
                        if (pageout(vic->lpc_pid, scan_pr, vicaddr, endaddr) ==
                            0) {
                                int64_t d_rss, att;
                                int willignore = 0;

                                excess += (d_rss = rss_delta(
                                    &new_psinfo, &old_psinfo, vic));

                                /*
                                 * If this pageout attempt was unsuccessful
                                 * (the resident portion was not affected), and
                                 * was for the whole mapping, put it in the
                                 * ignored set, so it will not be scanned again
                                 * until some page is referenced or modified.
                                 */
                                if (d_rss >= 0 && (void *)cur.pr_addr ==
                                    vicaddr && (cur.pr_pagesize * cur.pr_npage)
                                    == ((uintptr_t)endaddr -
                                    (uintptr_t)vicaddr)) {
                                        if (lmapping_insert(
                                            &vic->lpc_ignore,
                                            cur.pr_addr,
                                            cur.pr_pagesize *
                                            cur.pr_npage) != 0)
                                                debug("not enough memory to add"
                                                    " mapping at %p to ignored"
                                                    " set\n",
                                                    (void *)cur.pr_addr);
                                        willignore = 1;
                                }

                                /*
                                 * Update statistics.
                                 */
                                lcol->lcol_stat.lcols_pg_att += (att =
                                    ((intptr_t)endaddr - (intptr_t)vicaddr) /
                                    1024);
                                st_debug(STDL_NORMAL, lcol, "paged out 0x%p"
                                    "+0t(%llu/%llu)kB%s\n", vicaddr,
                                    (unsigned long long)((d_rss <
                                    0) ? - d_rss : 0), (unsigned long long)att,
                                    willignore ? " (will ignore)" : "");
                        } else {
                                st_debug(STDL_NORMAL, lcol,
                                    "process %d: exited/unscannable\n",
                                    (int)vic->lpc_pid);
                                vic->lpc_unscannable = 1;
                                goto nextproc;
                        }

                        /*
                         * Update the statistics file, if it's time.
                         */
                        check_update_statistics();

                        vicaddr = (nvicaddr != NULL) ? nvicaddr : (void
                            *)advance_prpageheader_cur_nextmapping(&cur);
                }
                excess += rss_delta(&new_psinfo, &old_psinfo, vic);
                st_debug(STDL_NORMAL, lcol, "done, excess %lld\n",
                    (long long)excess);
nextproc:
                /*
                 * If a process was grabbed, release it, destroying its agent.
                 */
                if (scan_pr != NULL) {
                        (void) Prelease(scan_pr, 0);
                        scan_pr = NULL;
                }
                lcol->lcol_victim = vic;
                /*
                 * Scan the collection at most once.  Only if scanning was not
                 * aborted for any reason, and the end of lprocess has not been
                 * reached, determine the next victim and scan it.
                 */
                if (vic != NULL) {
                        if (vic->lpc_next != NULL) {
                                /*
                                 * Determine the next process to be scanned.
                                 */
                                if (excess > 0) {
                                        vic = get_valid_victim(lcol,
                                            vic->lpc_next);
                                        vicaddr = 0;
                                }
                        } else {
                                /*
                                 * A complete scan of the collection was made,
                                 * so tick the scan counter and stop scanning
                                 * until the next request.
                                 */
                                lcol->lcol_stat.lcols_scan_count++;
                                lcol->lcol_stat.lcols_scan_time_complete
                                    = lcol->lcol_stat.lcols_scan_time;
                                /*
                                 * If an excess still exists, tick the
                                 * "ineffective scan" counter, signalling that
                                 * the cap may be unenforceable.
                                 */
                                if (resumed == 0 && excess > 0)
                                        lcol->lcol_stat
                                            .lcols_scan_ineffective++;
                                /*
                                 * Scanning should start at the beginning of
                                 * the process list at the next request.
                                 */
                                if (excess > 0)
                                        vic = NULL;
                        }
                }
        }
        lcol->lcol_stat.lcols_scan_time += (gethrtime() - scan_start);
        st_debug(STDL_HIGH, lcol, "done scanning; excess %lld\n",
            (long long)excess);

        lcol->lcol_resaddr = vicaddr;
        if (lcol->lcol_resaddr == NULL && lcol->lcol_victim != NULL) {
                lcol->lcol_victim = get_valid_victim(lcol,
                    lcol->lcol_victim->lpc_next);
        }
}

/*
 * Abort the scan in progress, and destroy the agent LWP of any grabbed
 * processes.
 */
void
scan_abort(void)
{
        if (scan_pr != NULL)
                (void) Prelease(scan_pr, NULL);
}

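/*
 * Revocation callback for a process's xmap file descriptor:  forget the
 * cached descriptor so that lpc_xmap_update() will reopen it.
 */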
static void
revoke_xmap(rfd_t *rfd)
{
        lprocess_t *lpc = rfd->rfd_data;

        debug("revoking xmap for process %d\n", (int)lpc->lpc_pid);
        ASSERT(lpc->lpc_xmap_fd != -1);
        lpc->lpc_xmap_fd = -1;
}

/*
 * Retrieve the process's current xmap, which is used to determine the size of
 * the resident portion of its segments.  Return zero if successful.
 */
static int
lpc_xmap_update(lprocess_t *lpc)
{
        int res;
        struct stat st;

        free(lpc->lpc_xmap);
        lpc->lpc_xmap = NULL;
        lpc->lpc_nxmap = -1;

        if (lpc->lpc_xmap_fd == -1) {
                char pathbuf[PROC_PATH_MAX];

                (void) snprintf(pathbuf, sizeof (pathbuf), "/proc/%d/xmap",
                    (int)lpc->lpc_pid);
                if ((lpc->lpc_xmap_fd = rfd_open(pathbuf, 1, RFD_XMAP,
                    revoke_xmap, lpc, O_RDONLY, 0)) < 0)
                        return (-1);
        }

redo:
        errno = 0;
        if (fstat(lpc->lpc_xmap_fd, &st) != 0) {
                debug("cannot stat xmap\n");
                (void) rfd_close(lpc->lpc_xmap_fd);
                lpc->lpc_xmap_fd = -1;
                return (-1);
        }

        if ((st.st_size % sizeof (*lpc->lpc_xmap)) != 0) {
                debug("xmap wrong size\n");
                (void) rfd_close(lpc->lpc_xmap_fd);
                lpc->lpc_xmap_fd = -1;
                return (-1);
        }

        lpc->lpc_xmap = malloc(st.st_size);
        if (lpc->lpc_xmap == NULL) {
                debug("cannot malloc() %ld bytes for xmap", st.st_size);
                (void) rfd_close(lpc->lpc_xmap_fd);
                lpc->lpc_xmap_fd = -1;
                return (-1);
        }

        if ((res = pread(lpc->lpc_xmap_fd, lpc->lpc_xmap, st.st_size, 0)) !=
            st.st_size) {
                free(lpc->lpc_xmap);
                lpc->lpc_xmap = NULL;
                if (res > 0) {
                        debug("xmap changed size, retrying\n");
                        goto redo;
                } else {
                        debug("cannot read xmap");
                        return (-1);
                }
        }
        lpc->lpc_nxmap = st.st_size / sizeof (*lpc->lpc_xmap);

        return (0);
}