1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
  25  * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T        */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40 
  41 /*
  42  * VM - physical page management.
  43  */
  44 
  45 #include <sys/types.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/errno.h>
  50 #include <sys/time.h>
  51 #include <sys/vnode.h>
  52 #include <sys/vm.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/swap.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/tuneable.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/cpuvar.h>
  59 #include <sys/callb.h>
  60 #include <sys/debug.h>
  61 #include <sys/tnf_probe.h>
  62 #include <sys/condvar_impl.h>
  63 #include <sys/mem_config.h>
  64 #include <sys/mem_cage.h>
  65 #include <sys/kmem.h>
  66 #include <sys/atomic.h>
  67 #include <sys/strlog.h>
  68 #include <sys/mman.h>
  69 #include <sys/ontrap.h>
  70 #include <sys/lgrp.h>
  71 #include <sys/vfs.h>
  72 
  73 #include <vm/hat.h>
  74 #include <vm/anon.h>
  75 #include <vm/page.h>
  76 #include <vm/seg.h>
  77 #include <vm/pvn.h>
  78 #include <vm/seg_kmem.h>
  79 #include <vm/vm_dep.h>
  80 #include <sys/vm_usage.h>
  81 #include <fs/fs_subr.h>
  82 #include <sys/ddi.h>
  83 #include <sys/modctl.h>
  84 
  85 static pgcnt_t max_page_get;    /* max page_get request size in pages */
  86 pgcnt_t total_pages = 0;        /* total number of pages (used by /proc) */
  87 
  88 /*
   89  * freemem_lock protects all freemem variables, i.e. availrmem.
   90  * This lock also protects the globals which track the availrmem
   91  * changes for accurate kernel footprint calculation.
  92  * See below for an explanation of these
  93  * globals.
  94  */
  95 kmutex_t freemem_lock;
  96 pgcnt_t availrmem;
  97 pgcnt_t availrmem_initial;
  98 
  99 /*
 100  * These globals track availrmem changes to get a more accurate
  101  * estimate of the kernel size. Historically pp_kernel is used for
  102  * kernel size and is based on availrmem. But availrmem is adjusted for
  103  * locked pages in the system, not just for kernel locked pages.
 104  * These new counters will track the pages locked through segvn and
 105  * by explicit user locking.
 106  *
 107  * pages_locked : How many pages are locked because of user specified
 108  * locking through mlock or plock.
 109  *
  110  * pages_useclaim, pages_claimed : These two variables track the
 111  * claim adjustments because of the protection changes on a segvn segment.
 112  *
 113  * All these globals are protected by the same lock which protects availrmem.
 114  */
 115 pgcnt_t pages_locked = 0;
 116 pgcnt_t pages_useclaim = 0;
 117 pgcnt_t pages_claimed = 0;
 118 
 119 
 120 /*
 121  * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 122  */
 123 static kmutex_t new_freemem_lock;
 124 static uint_t   freemem_wait;   /* someone waiting for freemem */
 125 static kcondvar_t freemem_cv;
 126 
 127 /*
 128  * The logical page free list is maintained as two lists, the 'free'
 129  * and the 'cache' lists.
 130  * The free list contains those pages that should be reused first.
 131  *
 132  * The implementation of the lists is machine dependent.
 133  * page_get_freelist(), page_get_cachelist(),
 134  * page_list_sub(), and page_list_add()
 135  * form the interface to the machine dependent implementation.
 136  *
 137  * Pages with p_free set are on the cache list.
  138  * Pages with p_free and p_age set are on the free list.
 139  *
 140  * A page may be locked while on either list.
 141  */
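
      /*
       * A minimal sketch (illustrative only) of how these two flags are
       * read, using the PP_ISFREE()/PP_ISAGED() predicates used throughout
       * this file:
       *
       *	if (PP_ISFREE(pp)) {
       *		if (PP_ISAGED(pp))
       *			-> pp is on the free list (identity discarded)
       *		else
       *			-> pp is on the cache list (retains [vp, off])
       *	}
       */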
 142 
 143 /*
 144  * free list accounting stuff.
 145  *
 146  *
  147  * Spread out the value for the number of pages on the
  148  * page free and page cache lists.  If there were just one
  149  * value, it would have to sit under just one lock, and the
  150  * lock contention and cache traffic would be a real bother.
 151  *
 152  * When we acquire and then drop a single pcf lock
 153  * we can start in the middle of the array of pcf structures.
 154  * If we acquire more than one pcf lock at a time, we need to
 155  * start at the front to avoid deadlocking.
 156  *
 157  * pcf_count holds the number of pages in each pool.
 158  *
 159  * pcf_block is set when page_create_get_something() has asked the
 160  * PSM page freelist and page cachelist routines without specifying
 161  * a color and nothing came back.  This is used to block anything
 162  * else from moving pages from one list to the other while the
  163  * lists are searched again.  If a page is freed while pcf_block is
  164  * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
  165  * of clearing pcf_block, doing the wakeups, etc.
 166  */
 167 
 168 #define MAX_PCF_FANOUT NCPU
 169 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
 170 static uint_t pcf_fanout_mask = 0;
 171 
 172 struct pcf {
 173         kmutex_t        pcf_lock;       /* protects the structure */
 174         uint_t          pcf_count;      /* page count */
 175         uint_t          pcf_wait;       /* number of waiters */
 176         uint_t          pcf_block;      /* pcgs flag to page_free() */
 177         uint_t          pcf_reserve;    /* pages freed after pcf_block set */
 178         uint_t          pcf_fill[10];   /* to line up on the caches */
 179 };
 180 
 181 /*
  182  * The PCF_INDEX hash needs to be dynamic (every so often the hash changes
  183  * which bucket a given cpu maps to).  This prevents a drain condition in
  184  * which the pcf_count decrement always occurs on cpu A while the increment
  185  * of pcf_count always occurs on cpu B.  An example of this shows up with
  186  * device interrupts: the dma buffer is allocated by the cpu requesting the
  187  * I/O, so the pcf_count is decremented based on that cpu.  When the memory
  188  * is returned by the interrupt thread, the pcf_count will be incremented
  189  * based on the cpu servicing the interrupt.
 190  */
 191 static struct pcf pcf[MAX_PCF_FANOUT];
 192 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
 193         (randtick() >> 24)) & (pcf_fanout_mask))
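
      /*
       * Illustrative use of PCF_INDEX() (a sketch, not lifted from the real
       * allocation paths, which also handle empty buckets and waiters --
       * see pcf_decrement_bucket() below):
       *
       *	struct pcf *p = &pcf[PCF_INDEX()];
       *
       *	mutex_enter(&p->pcf_lock);
       *	if (p->pcf_count > 0)
       *		p->pcf_count--;		(one page accounted for)
       *	mutex_exit(&p->pcf_lock);
       */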
 194 
 195 static int pcf_decrement_bucket(pgcnt_t);
 196 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
 197 
 198 kmutex_t        pcgs_lock;              /* serializes page_create_get_ */
 199 kmutex_t        pcgs_cagelock;          /* serializes NOSLEEP cage allocs */
 200 kmutex_t        pcgs_wait_lock;         /* used for delay in pcgs */
 201 static kcondvar_t       pcgs_cv;        /* cv for delay in pcgs */
 202 
 203 #ifdef VM_STATS
 204 
 205 /*
 206  * No locks, but so what, they are only statistics.
 207  */
 208 
 209 static struct page_tcnt {
 210         int     pc_free_cache;          /* free's into cache list */
 211         int     pc_free_dontneed;       /* free's with dontneed */
 212         int     pc_free_pageout;        /* free's from pageout */
 213         int     pc_free_free;           /* free's into free list */
 214         int     pc_free_pages;          /* free's into large page free list */
 215         int     pc_destroy_pages;       /* large page destroy's */
 216         int     pc_get_cache;           /* get's from cache list */
 217         int     pc_get_free;            /* get's from free list */
 218         int     pc_reclaim;             /* reclaim's */
 219         int     pc_abortfree;           /* abort's of free pages */
 220         int     pc_find_hit;            /* find's that find page */
 221         int     pc_find_miss;           /* find's that don't find page */
 222         int     pc_destroy_free;        /* # of free pages destroyed */
 223 #define PC_HASH_CNT     (4*PAGE_HASHAVELEN)
 224         int     pc_find_hashlen[PC_HASH_CNT+1];
 225         int     pc_addclaim_pages;
 226         int     pc_subclaim_pages;
 227         int     pc_free_replacement_page[2];
 228         int     pc_try_demote_pages[6];
 229         int     pc_demote_pages[2];
 230 } pagecnt;
 231 
 232 uint_t  hashin_count;
 233 uint_t  hashin_not_held;
 234 uint_t  hashin_already;
 235 
 236 uint_t  hashout_count;
 237 uint_t  hashout_not_held;
 238 
 239 uint_t  page_create_count;
 240 uint_t  page_create_not_enough;
 241 uint_t  page_create_not_enough_again;
 242 uint_t  page_create_zero;
 243 uint_t  page_create_hashout;
 244 uint_t  page_create_page_lock_failed;
 245 uint_t  page_create_trylock_failed;
 246 uint_t  page_create_found_one;
 247 uint_t  page_create_hashin_failed;
 248 uint_t  page_create_dropped_phm;
 249 
 250 uint_t  page_create_new;
 251 uint_t  page_create_exists;
 252 uint_t  page_create_putbacks;
 253 uint_t  page_create_overshoot;
 254 
 255 uint_t  page_reclaim_zero;
 256 uint_t  page_reclaim_zero_locked;
 257 
 258 uint_t  page_rename_exists;
 259 uint_t  page_rename_count;
 260 
 261 uint_t  page_lookup_cnt[20];
 262 uint_t  page_lookup_nowait_cnt[10];
 263 uint_t  page_find_cnt;
 264 uint_t  page_exists_cnt;
 265 uint_t  page_exists_forreal_cnt;
 266 uint_t  page_lookup_dev_cnt;
 267 uint_t  get_cachelist_cnt;
 268 uint_t  page_create_cnt[10];
 269 uint_t  alloc_pages[9];
 270 uint_t  page_exphcontg[19];
 271 uint_t  page_create_large_cnt[10];
 272 
 273 #endif
 274 
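      /*
       * Walk the page hash chain at `index' looking for a page with the
       * identity [vnode, off].  Returns the page_t, or NULL if no such page
       * is hashed there.  Callers that need a stable answer must hold the
       * corresponding PAGE_HASH_MUTEX across the search; lock-free callers
       * (e.g. page_lookup_create()) re-verify the identity afterwards.
       */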
 275 static inline page_t *
 276 page_hash_search(ulong_t index, vnode_t *vnode, u_offset_t off)
 277 {
 278         uint_t mylen = 0;
 279         page_t *page;
 280 
 281         for (page = page_hash[index]; page; page = page->p_hash, mylen++)
 282                 if (page->p_vnode == vnode && page->p_offset == off)
 283                         break;
 284 
 285 #ifdef  VM_STATS
 286         if (page != NULL)
 287                 pagecnt.pc_find_hit++;
 288         else
 289                 pagecnt.pc_find_miss++;
 290 
 291         pagecnt.pc_find_hashlen[MIN(mylen, PC_HASH_CNT)]++;
 292 #endif
 293 
 294         return (page);
 295 }
 296 
 297 
 298 #ifdef DEBUG
 299 #define MEMSEG_SEARCH_STATS
 300 #endif
 301 
 302 #ifdef MEMSEG_SEARCH_STATS
 303 struct memseg_stats {
 304     uint_t nsearch;
 305     uint_t nlastwon;
 306     uint_t nhashwon;
 307     uint_t nnotfound;
 308 } memseg_stats;
 309 
 310 #define MEMSEG_STAT_INCR(v) \
 311         atomic_inc_32(&memseg_stats.v)
 312 #else
 313 #define MEMSEG_STAT_INCR(x)
 314 #endif
 315 
 316 struct memseg *memsegs;         /* list of memory segments */
 317 
 318 /*
  319  * /etc/system tunable to control the large page allocation heuristic.
 320  *
  321  * Setting it to LPAP_LOCAL will heavily prefer the local lgroup over remote
  322  * lgroups for large page allocation requests.  If a large page is not readily
  323  * available on the local freelists we will go through additional effort
  324  * to create a large page, potentially moving smaller pages around to coalesce
  325  * larger pages in the local lgroup.
  326  * The default value of LPAP_DEFAULT will go to remote freelists if large pages
  327  * are not readily available in the local lgroup.
 328  */
 329 enum lpap {
 330         LPAP_DEFAULT,   /* default large page allocation policy */
 331         LPAP_LOCAL      /* local large page allocation policy */
 332 };
 333 
 334 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
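
      /*
       * For illustration only: if one wanted to select LPAP_LOCAL from
       * /etc/system, the usual form (assuming LPAP_LOCAL's numeric value
       * of 1) would be:
       *
       *	set lpg_alloc_prefer = 1
       */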
 335 
 336 static void page_init_mem_config(void);
 337 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
 338 static void page_do_hashout(page_t *);
 339 static void page_capture_init();
 340 int page_capture_take_action(page_t *, uint_t, void *);
 341 
 342 static void page_demote_vp_pages(page_t *);
 343 
 344 
 345 void
 346 pcf_init(void)
 347 {
 348         if (boot_ncpus != -1) {
 349                 pcf_fanout = boot_ncpus;
 350         } else {
 351                 pcf_fanout = max_ncpus;
 352         }
 353 #ifdef sun4v
 354         /*
 355          * Force at least 4 buckets if possible for sun4v.
 356          */
 357         pcf_fanout = MAX(pcf_fanout, 4);
 358 #endif /* sun4v */
 359 
 360         /*
 361          * Round up to the nearest power of 2.
 362          */
 363         pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
 364         if (!ISP2(pcf_fanout)) {
 365                 pcf_fanout = 1 << highbit(pcf_fanout);
 366 
 367                 if (pcf_fanout > MAX_PCF_FANOUT) {
 368                         pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
 369                 }
 370         }
 371         pcf_fanout_mask = pcf_fanout - 1;
 372 }
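
      /*
       * Worked example (illustrative only, assuming MAX_PCF_FANOUT is at
       * least 8): with boot_ncpus == 6, pcf_fanout starts at 6, highbit(6)
       * is 3, so it is rounded up to 1 << 3 == 8 and pcf_fanout_mask becomes
       * 7, i.e. PCF_INDEX() selects one of 8 buckets.
       */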
 373 
 374 /*
 375  * vm subsystem related initialization
 376  */
 377 void
 378 vm_init(void)
 379 {
 380         boolean_t callb_vm_cpr(void *, int);
 381 
 382         (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
 383         page_init_mem_config();
 384         page_retire_init();
 385         vm_usage_init();
 386         page_capture_init();
 387 }
 388 
 389 /*
 390  * This function is called at startup and when memory is added or deleted.
 391  */
 392 void
 393 init_pages_pp_maximum()
 394 {
 395         static pgcnt_t p_min;
 396         static pgcnt_t pages_pp_maximum_startup;
 397         static pgcnt_t avrmem_delta;
 398         static int init_done;
 399         static int user_set;    /* true if set in /etc/system */
 400 
 401         if (init_done == 0) {
 402 
 403                 /* If the user specified a value, save it */
 404                 if (pages_pp_maximum != 0) {
 405                         user_set = 1;
 406                         pages_pp_maximum_startup = pages_pp_maximum;
 407                 }
 408 
 409                 /*
  410                  * The first-time setting of pages_pp_maximum is based
  411                  * on the value of availrmem just after the start-up
  412                  * allocations. To preserve this relationship at run
  413                  * time, use a delta from availrmem_initial.
 414                  */
 415                 ASSERT(availrmem_initial >= availrmem);
 416                 avrmem_delta = availrmem_initial - availrmem;
 417 
 418                 /* The allowable floor of pages_pp_maximum */
 419                 p_min = tune.t_minarmem + 100;
 420 
 421                 /* Make sure we don't come through here again. */
 422                 init_done = 1;
 423         }
 424         /*
 425          * Determine pages_pp_maximum, the number of currently available
 426          * pages (availrmem) that can't be `locked'. If not set by
 427          * the user, we set it to 4% of the currently available memory
 428          * plus 4MB.
 429          * But we also insist that it be greater than tune.t_minarmem;
 430          * otherwise a process could lock down a lot of memory, get swapped
 431          * out, and never have enough to get swapped back in.
 432          */
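              /*
               * Illustrative arithmetic (assuming 4K pages and, say,
               * 1,000,000 pages of available memory): 1,000,000 / 25 =
               * 40,000 pages (4%) plus btop(4MB) = 1,024 pages, so
               * pages_pp_maximum would be about 41,024 pages (roughly 160MB).
               */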
 433         if (user_set)
 434                 pages_pp_maximum = pages_pp_maximum_startup;
 435         else
 436                 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
 437                     + btop(4 * 1024 * 1024);
 438 
 439         if (pages_pp_maximum <= p_min) {
 440                 pages_pp_maximum = p_min;
 441         }
 442 }
 443 
 444 void
 445 set_max_page_get(pgcnt_t target_total_pages)
 446 {
 447         max_page_get = target_total_pages / 2;
 448 }
 449 
 450 static pgcnt_t pending_delete;
 451 
 452 /*ARGSUSED*/
 453 static void
 454 page_mem_config_post_add(
 455         void *arg,
 456         pgcnt_t delta_pages)
 457 {
 458         set_max_page_get(total_pages - pending_delete);
 459         init_pages_pp_maximum();
 460 }
 461 
 462 /*ARGSUSED*/
 463 static int
 464 page_mem_config_pre_del(
 465         void *arg,
 466         pgcnt_t delta_pages)
 467 {
 468         pgcnt_t nv;
 469 
 470         nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
 471         set_max_page_get(total_pages - nv);
 472         return (0);
 473 }
 474 
 475 /*ARGSUSED*/
 476 static void
 477 page_mem_config_post_del(
 478         void *arg,
 479         pgcnt_t delta_pages,
 480         int cancelled)
 481 {
 482         pgcnt_t nv;
 483 
 484         nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
 485         set_max_page_get(total_pages - nv);
 486         if (!cancelled)
 487                 init_pages_pp_maximum();
 488 }
 489 
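      /*
       * Callback vector handed to the memory DR framework.  The hooks above
       * keep max_page_get and pages_pp_maximum in step with the amount of
       * physical memory that is actually present as memory is added or
       * deleted.
       */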
 490 static kphysm_setup_vector_t page_mem_config_vec = {
 491         KPHYSM_SETUP_VECTOR_VERSION,
 492         page_mem_config_post_add,
 493         page_mem_config_pre_del,
 494         page_mem_config_post_del,
 495 };
 496 
 497 static void
 498 page_init_mem_config(void)
 499 {
 500         int ret;
 501 
 502         ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
 503         ASSERT(ret == 0);
 504 }
 505 
 506 /*
 507  * Evenly spread out the PCF counters for large free pages
 508  */
 509 static void
 510 page_free_large_ctr(pgcnt_t npages)
 511 {
 512         static struct pcf       *p = pcf;
 513         pgcnt_t                 lump;
 514 
 515         freemem += npages;
 516 
 517         lump = roundup(npages, pcf_fanout) / pcf_fanout;
 518 
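              /*
               * For example (illustrative only): npages == 500 with
               * pcf_fanout == 8 gives lump == roundup(500, 8) / 8 == 63, so
               * the first seven buckets receive 63 pages each and the last
               * receives the remaining 59.
               */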
 519         while (npages > 0) {
 520 
 521                 ASSERT(!p->pcf_block);
 522 
 523                 if (lump < npages) {
 524                         p->pcf_count += (uint_t)lump;
 525                         npages -= lump;
 526                 } else {
 527                         p->pcf_count += (uint_t)npages;
 528                         npages = 0;
 529                 }
 530 
 531                 ASSERT(!p->pcf_wait);
 532 
 533                 if (++p > &pcf[pcf_fanout - 1])
 534                         p = pcf;
 535         }
 536 
 537         ASSERT(npages == 0);
 538 }
 539 
 540 /*
 541  * Add a physical chunk of memory to the system free lists during startup.
 542  * Platform specific startup() allocates the memory for the page structs.
 543  *
 544  * num  - number of page structures
  545  * pnum - page number (pfn) to be associated with the first page.
 546  *
  547  * Since we are doing this during startup (i.e. single threaded), we will
 548  * use shortcut routines to avoid any locking overhead while putting all
 549  * these pages on the freelists.
 550  *
 551  * NOTE: Any changes performed to page_free(), must also be performed to
 552  *       add_physmem() since this is how we initialize all page_t's at
 553  *       boot time.
 554  */
 555 void
 556 add_physmem(
 557         page_t  *pp,
 558         pgcnt_t num,
 559         pfn_t   pnum)
 560 {
 561         page_t  *root = NULL;
 562         uint_t  szc = page_num_pagesizes() - 1;
 563         pgcnt_t large = page_get_pagecnt(szc);
 564         pgcnt_t cnt = 0;
 565 
 566         TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
 567             "add_physmem:pp %p num %lu", pp, num);
 568 
 569         /*
 570          * Arbitrarily limit the max page_get request
 571          * to 1/2 of the page structs we have.
 572          */
 573         total_pages += num;
 574         set_max_page_get(total_pages);
 575 
 576         PLCNT_MODIFY_MAX(pnum, (long)num);
 577 
 578         /*
 579          * The physical space for the pages array
 580          * representing ram pages has already been
 581          * allocated.  Here we initialize each lock
 582          * in the page structure, and put each on
 583          * the free list
 584          */
 585         for (; num; pp++, pnum++, num--) {
 586 
 587                 /*
 588                  * this needs to fill in the page number
 589                  * and do any other arch specific initialization
 590                  */
 591                 add_physmem_cb(pp, pnum);
 592 
 593                 pp->p_lckcnt = 0;
 594                 pp->p_cowcnt = 0;
 595                 pp->p_slckcnt = 0;
 596 
 597                 /*
 598                  * Initialize the page lock as unlocked, since nobody
 599                  * can see or access this page yet.
 600                  */
 601                 pp->p_selock = 0;
 602 
 603                 /*
 604                  * Initialize IO lock
 605                  */
 606                 page_iolock_init(pp);
 607 
 608                 /*
 609                  * initialize other fields in the page_t
 610                  */
 611                 PP_SETFREE(pp);
 612                 page_clr_all_props(pp);
 613                 PP_SETAGED(pp);
 614                 pp->p_offset = (u_offset_t)-1;
 615                 pp->p_next = pp;
 616                 pp->p_prev = pp;
 617 
 618                 /*
 619                  * Simple case: System doesn't support large pages.
 620                  */
 621                 if (szc == 0) {
 622                         pp->p_szc = 0;
 623                         page_free_at_startup(pp);
 624                         continue;
 625                 }
 626 
 627                 /*
 628                  * Handle unaligned pages, we collect them up onto
 629                  * the root page until we have a full large page.
 630                  */
 631                 if (!IS_P2ALIGNED(pnum, large)) {
 632 
 633                         /*
 634                          * If not in a large page,
 635                          * just free as small page.
 636                          */
 637                         if (root == NULL) {
 638                                 pp->p_szc = 0;
 639                                 page_free_at_startup(pp);
 640                                 continue;
 641                         }
 642 
 643                         /*
 644                          * Link a constituent page into the large page.
 645                          */
 646                         pp->p_szc = szc;
 647                         page_list_concat(&root, &pp);
 648 
 649                         /*
 650                          * When large page is fully formed, free it.
 651                          */
 652                         if (++cnt == large) {
 653                                 page_free_large_ctr(cnt);
 654                                 page_list_add_pages(root, PG_LIST_ISINIT);
 655                                 root = NULL;
 656                                 cnt = 0;
 657                         }
 658                         continue;
 659                 }
 660 
 661                 /*
 662                  * At this point we have a page number which
 663                  * is aligned. We assert that we aren't already
 664                  * in a different large page.
 665                  */
 666                 ASSERT(IS_P2ALIGNED(pnum, large));
 667                 ASSERT(root == NULL && cnt == 0);
 668 
 669                 /*
 670                  * If insufficient number of pages left to form
 671                  * a large page, just free the small page.
 672                  */
 673                 if (num < large) {
 674                         pp->p_szc = 0;
 675                         page_free_at_startup(pp);
 676                         continue;
 677                 }
 678 
 679                 /*
 680                  * Otherwise start a new large page.
 681                  */
 682                 pp->p_szc = szc;
 683                 cnt++;
 684                 root = pp;
 685         }
 686         ASSERT(root == NULL && cnt == 0);
 687 }
 688 
 689 /*
 690  * Find a page representing the specified [vp, offset].
  691  * If we find the page but it is in transit coming in,
 692  * it will have an "exclusive" lock and we wait for
 693  * the i/o to complete.  A page found on the free list
 694  * is always reclaimed and then locked.  On success, the page
 695  * is locked, its data is valid and it isn't on the free
 696  * list, while a NULL is returned if the page doesn't exist.
 697  */
 698 page_t *
 699 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
 700 {
 701         return (page_lookup_create(vp, off, se, NULL, NULL, 0));
 702 }
 703 
 704 /*
 705  * Find a page representing the specified [vp, offset].
 706  * We either return the one we found or, if passed in,
 707  * create one with identity of [vp, offset] of the
  708  * pre-allocated page. If we find an existing page but it is
  709  * in transit coming in, it will have an "exclusive" lock
 710  * and we wait for the i/o to complete.  A page found on
 711  * the free list is always reclaimed and then locked.
 712  * On success, the page is locked, its data is valid and
 713  * it isn't on the free list, while a NULL is returned
  714  * if the page doesn't exist and newpp is NULL.
 715  */
 716 page_t *
 717 page_lookup_create(
 718         vnode_t *vp,
 719         u_offset_t off,
 720         se_t se,
 721         page_t *newpp,
 722         spgcnt_t *nrelocp,
 723         int flags)
 724 {
 725         page_t          *pp;
 726         kmutex_t        *phm;
 727         ulong_t         index;
 728         uint_t          hash_locked;
 729         uint_t          es;
 730 
 731         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
 732         VM_STAT_ADD(page_lookup_cnt[0]);
 733         ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
 734 
 735         /*
 736          * Acquire the appropriate page hash lock since
 737          * we have to search the hash list.  Pages that
 738          * hash to this list can't change identity while
 739          * this lock is held.
 740          */
 741         hash_locked = 0;
 742         index = PAGE_HASH_FUNC(vp, off);
 743         phm = NULL;
 744 top:
 745         pp = page_hash_search(index, vp, off);
 746         if (pp != NULL) {
 747                 VM_STAT_ADD(page_lookup_cnt[1]);
 748                 es = (newpp != NULL) ? 1 : 0;
 749                 es |= flags;
 750                 if (!hash_locked) {
 751                         VM_STAT_ADD(page_lookup_cnt[2]);
 752                         if (!page_try_reclaim_lock(pp, se, es)) {
 753                                 /*
 754                                  * On a miss, acquire the phm.  Then
 755                                  * next time, page_lock() will be called,
 756                                  * causing a wait if the page is busy.
  757                                  * Just looping with page_trylock() would
 758                                  * get pretty boring.
 759                                  */
 760                                 VM_STAT_ADD(page_lookup_cnt[3]);
 761                                 phm = PAGE_HASH_MUTEX(index);
 762                                 mutex_enter(phm);
 763                                 hash_locked = 1;
 764                                 goto top;
 765                         }
 766                 } else {
 767                         VM_STAT_ADD(page_lookup_cnt[4]);
 768                         if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
 769                                 VM_STAT_ADD(page_lookup_cnt[5]);
 770                                 goto top;
 771                         }
 772                 }
 773 
 774                 /*
  775                  * Since `pp' is locked it cannot change identity now.
 776                  * Reconfirm we locked the correct page.
 777                  *
 778                  * Both the p_vnode and p_offset *must* be cast volatile
 779                  * to force a reload of their values: The page_hash_search
 780                  * function will have stuffed p_vnode and p_offset into
 781                  * registers before calling page_trylock(); another thread,
 782                  * actually holding the hash lock, could have changed the
 783                  * page's identity in memory, but our registers would not
 784                  * be changed, fooling the reconfirmation.  If the hash
 785                  * lock was held during the search, the casting would
 786                  * not be needed.
 787                  */
 788                 VM_STAT_ADD(page_lookup_cnt[6]);
 789                 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
 790                     ((volatile u_offset_t)(pp->p_offset) != off)) {
 791                         VM_STAT_ADD(page_lookup_cnt[7]);
 792                         if (hash_locked) {
 793                                 panic("page_lookup_create: lost page %p",
 794                                     (void *)pp);
 795                                 /*NOTREACHED*/
 796                         }
 797                         page_unlock(pp);
 798                         phm = PAGE_HASH_MUTEX(index);
 799                         mutex_enter(phm);
 800                         hash_locked = 1;
 801                         goto top;
 802                 }
 803 
 804                 /*
 805                  * If page_trylock() was called, then pp may still be on
 806                  * the cachelist (can't be on the free list, it would not
 807                  * have been found in the search).  If it is on the
 808                  * cachelist it must be pulled now. To pull the page from
 809                  * the cachelist, it must be exclusively locked.
 810                  *
 811                  * The other big difference between page_trylock() and
 812                  * page_lock(), is that page_lock() will pull the
 813                  * page from whatever free list (the cache list in this
 814                  * case) the page is on.  If page_trylock() was used
 815                  * above, then we have to do the reclaim ourselves.
 816                  */
 817                 if ((!hash_locked) && (PP_ISFREE(pp))) {
 818                         ASSERT(PP_ISAGED(pp) == 0);
 819                         VM_STAT_ADD(page_lookup_cnt[8]);
 820 
 821                         /*
  822                          * page_reclaim will ensure that we
  823                          * have this page exclusively.
 824                          */
 825 
 826                         if (!page_reclaim(pp, NULL)) {
 827                                 /*
 828                                  * Page_reclaim dropped whatever lock
 829                                  * we held.
 830                                  */
 831                                 VM_STAT_ADD(page_lookup_cnt[9]);
 832                                 phm = PAGE_HASH_MUTEX(index);
 833                                 mutex_enter(phm);
 834                                 hash_locked = 1;
 835                                 goto top;
 836                         } else if (se == SE_SHARED && newpp == NULL) {
 837                                 VM_STAT_ADD(page_lookup_cnt[10]);
 838                                 page_downgrade(pp);
 839                         }
 840                 }
 841 
 842                 if (hash_locked) {
 843                         mutex_exit(phm);
 844                 }
 845 
 846                 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
 847                     PAGE_EXCL(pp) && nrelocp != NULL) {
 848                         ASSERT(nrelocp != NULL);
 849                         (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
 850                             NULL);
 851                         if (*nrelocp > 0) {
 852                                 VM_STAT_COND_ADD(*nrelocp == 1,
 853                                     page_lookup_cnt[11]);
 854                                 VM_STAT_COND_ADD(*nrelocp > 1,
 855                                     page_lookup_cnt[12]);
 856                                 pp = newpp;
 857                                 se = SE_EXCL;
 858                         } else {
 859                                 if (se == SE_SHARED) {
 860                                         page_downgrade(pp);
 861                                 }
 862                                 VM_STAT_ADD(page_lookup_cnt[13]);
 863                         }
 864                 } else if (newpp != NULL && nrelocp != NULL) {
 865                         if (PAGE_EXCL(pp) && se == SE_SHARED) {
 866                                 page_downgrade(pp);
 867                         }
 868                         VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
 869                             page_lookup_cnt[14]);
 870                         VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
 871                             page_lookup_cnt[15]);
 872                         VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
 873                             page_lookup_cnt[16]);
 874                 } else if (newpp != NULL && PAGE_EXCL(pp)) {
 875                         se = SE_EXCL;
 876                 }
 877         } else if (!hash_locked) {
 878                 VM_STAT_ADD(page_lookup_cnt[17]);
 879                 phm = PAGE_HASH_MUTEX(index);
 880                 mutex_enter(phm);
 881                 hash_locked = 1;
 882                 goto top;
 883         } else if (newpp != NULL) {
 884                 /*
 885                  * If we have a preallocated page then
 886                  * insert it now and basically behave like
 887                  * page_create.
 888                  */
 889                 VM_STAT_ADD(page_lookup_cnt[18]);
 890                 /*
 891                  * Since we hold the page hash mutex and
 892                  * just searched for this page, page_hashin
 893                  * had better not fail.  If it does, that
 894                  * means some thread did not follow the
 895                  * page hash mutex rules.  Panic now and
 896                  * get it over with.  As usual, go down
 897                  * holding all the locks.
 898                  */
 899                 ASSERT(MUTEX_HELD(phm));
 900                 if (!page_hashin(newpp, vp, off, phm)) {
 901                         ASSERT(MUTEX_HELD(phm));
 902                         panic("page_lookup_create: hashin failed %p %p %llx %p",
 903                             (void *)newpp, (void *)vp, off, (void *)phm);
 904                         /*NOTREACHED*/
 905                 }
 906                 ASSERT(MUTEX_HELD(phm));
 907                 mutex_exit(phm);
 908                 phm = NULL;
 909                 page_set_props(newpp, P_REF);
 910                 page_io_lock(newpp);
 911                 pp = newpp;
 912                 se = SE_EXCL;
 913         } else {
 914                 VM_STAT_ADD(page_lookup_cnt[19]);
 915                 mutex_exit(phm);
 916         }
 917 
 918         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 919 
 920         ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
 921 
 922         return (pp);
 923 }
 924 
 925 /*
 926  * Search the hash list for the page representing the
 927  * specified [vp, offset] and return it locked.  Skip
 928  * free pages and pages that cannot be locked as requested.
 929  * Used while attempting to kluster pages.
 930  */
 931 page_t *
 932 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
 933 {
 934         page_t          *pp;
 935         kmutex_t        *phm;
 936         ulong_t         index;
 937         uint_t          locked;
 938 
 939         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
 940         VM_STAT_ADD(page_lookup_nowait_cnt[0]);
 941 
 942         index = PAGE_HASH_FUNC(vp, off);
 943         pp = page_hash_search(index, vp, off);
 944         locked = 0;
 945         if (pp == NULL) {
 946 top:
 947                 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
 948                 locked = 1;
 949                 phm = PAGE_HASH_MUTEX(index);
 950                 mutex_enter(phm);
 951                 pp = page_hash_search(index, vp, off);
 952         }
 953 
 954         if (pp == NULL || PP_ISFREE(pp)) {
 955                 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
 956                 pp = NULL;
 957         } else {
 958                 if (!page_trylock(pp, se)) {
 959                         VM_STAT_ADD(page_lookup_nowait_cnt[3]);
 960                         pp = NULL;
 961                 } else {
 962                         VM_STAT_ADD(page_lookup_nowait_cnt[4]);
 963                         /*
 964                          * See the comment in page_lookup()
 965                          */
 966                         if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
 967                             ((u_offset_t)(pp->p_offset) != off)) {
 968                                 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
 969                                 if (locked) {
 970                                         panic("page_lookup_nowait %p",
 971                                             (void *)pp);
 972                                         /*NOTREACHED*/
 973                                 }
 974                                 page_unlock(pp);
 975                                 goto top;
 976                         }
 977                         if (PP_ISFREE(pp)) {
 978                                 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
 979                                 page_unlock(pp);
 980                                 pp = NULL;
 981                         }
 982                 }
 983         }
 984         if (locked) {
 985                 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
 986                 mutex_exit(phm);
 987         }
 988 
 989         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 990 
 991         return (pp);
 992 }
 993 
 994 /*
 995  * Search the hash list for a page with the specified [vp, off]
 996  * that is known to exist and is already locked.  This routine
 997  * is typically used by segment SOFTUNLOCK routines.
 998  */
 999 page_t *
1000 page_find(vnode_t *vp, u_offset_t off)
1001 {
1002         page_t          *pp;
1003         kmutex_t        *phm;
1004         ulong_t         index;
1005 
1006         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1007         VM_STAT_ADD(page_find_cnt);
1008 
1009         index = PAGE_HASH_FUNC(vp, off);
1010         phm = PAGE_HASH_MUTEX(index);
1011 
1012         mutex_enter(phm);
1013         pp = page_hash_search(index, vp, off);
1014         mutex_exit(phm);
1015 
1016         ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1017         return (pp);
1018 }
1019 
1020 /*
1021  * Determine whether a page with the specified [vp, off]
1022  * currently exists in the system.  Obviously this should
1023  * only be considered as a hint since nothing prevents the
1024  * page from disappearing or appearing immediately after
1025  * the return from this routine. Subsequently, we don't
1026  * even bother to lock the list.
1027  */
1028 page_t *
1029 page_exists(vnode_t *vp, u_offset_t off)
1030 {
1031         ulong_t         index;
1032 
1033         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1034         VM_STAT_ADD(page_exists_cnt);
1035 
1036         index = PAGE_HASH_FUNC(vp, off);
1037 
1038         return (page_hash_search(index, vp, off));
1039 }
1040 
1041 /*
1042  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
 1043  * page_size(szc)) range.  If they exist and ppa is not NULL, fill the ppa
 1044  * array with these pages locked SHARED. If necessary, reclaim pages from the
 1045  * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1046  *
 1047  * If we fail to lock the pages we still return 1 if the pages exist and are
 1048  * contiguous, but in that case the return value is just a hint and the ppa
 1049  * array won't be filled; initialize ppa[0] to NULL to tell the cases apart.
1050  *
 1051  * Returns 0 if the pages don't exist or are not physically contiguous.
 1052  *
 1053  * This routine doesn't work for anonymous (swapfs) pages.
1054  */
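
      /*
       * A caller sketch (illustrative only; not taken from any particular
       * caller):
       *
       *	pgcnt_t npgs = page_get_pagecnt(szc);
       *	page_t **ppa = kmem_zalloc((npgs + 1) * sizeof (page_t *),
       *	    KM_SLEEP);
       *
       *	ppa[0] = NULL;
       *	if (page_exists_physcontig(vp, off, szc, ppa) && ppa[0] != NULL) {
       *		... all npgs constituent pages are held SE_SHARED ...
       *	}
       */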
1055 int
1056 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1057 {
1058         pgcnt_t pages;
1059         pfn_t pfn;
1060         page_t *rootpp;
1061         pgcnt_t i;
1062         pgcnt_t j;
1063         u_offset_t save_off = off;
1064         ulong_t index;
1065         kmutex_t *phm;
1066         page_t *pp;
1067         uint_t pszc;
1068         int loopcnt = 0;
1069 
1070         ASSERT(szc != 0);
1071         ASSERT(vp != NULL);
1072         ASSERT(!IS_SWAPFSVP(vp));
1073         ASSERT(!VN_ISKAS(vp));
1074 
1075 again:
1076         if (++loopcnt > 3) {
1077                 VM_STAT_ADD(page_exphcontg[0]);
1078                 return (0);
1079         }
1080 
1081         index = PAGE_HASH_FUNC(vp, off);
1082         phm = PAGE_HASH_MUTEX(index);
1083 
1084         mutex_enter(phm);
1085         pp = page_hash_search(index, vp, off);
1086         mutex_exit(phm);
1087 
1088         VM_STAT_ADD(page_exphcontg[1]);
1089 
1090         if (pp == NULL) {
1091                 VM_STAT_ADD(page_exphcontg[2]);
1092                 return (0);
1093         }
1094 
1095         pages = page_get_pagecnt(szc);
1096         rootpp = pp;
1097         pfn = rootpp->p_pagenum;
1098 
1099         if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1100                 VM_STAT_ADD(page_exphcontg[3]);
1101                 if (!page_trylock(pp, SE_SHARED)) {
1102                         VM_STAT_ADD(page_exphcontg[4]);
1103                         return (1);
1104                 }
1105                 /*
1106                  * Also check whether p_pagenum was modified by DR.
1107                  */
1108                 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1109                     pp->p_offset != off || pp->p_pagenum != pfn) {
1110                         VM_STAT_ADD(page_exphcontg[5]);
1111                         page_unlock(pp);
1112                         off = save_off;
1113                         goto again;
1114                 }
1115                 /*
 1116                  * szc was non-zero and the vnode and offset matched after
 1117                  * we locked the page, so it can't become free on us.
1118                  */
1119                 ASSERT(!PP_ISFREE(pp));
1120                 if (!IS_P2ALIGNED(pfn, pages)) {
1121                         page_unlock(pp);
1122                         return (0);
1123                 }
1124                 ppa[0] = pp;
1125                 pp++;
1126                 off += PAGESIZE;
1127                 pfn++;
1128                 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1129                         if (!page_trylock(pp, SE_SHARED)) {
1130                                 VM_STAT_ADD(page_exphcontg[6]);
1131                                 pp--;
1132                                 while (i-- > 0) {
1133                                         page_unlock(pp);
1134                                         pp--;
1135                                 }
1136                                 ppa[0] = NULL;
1137                                 return (1);
1138                         }
1139                         if (pp->p_szc != pszc) {
1140                                 VM_STAT_ADD(page_exphcontg[7]);
1141                                 page_unlock(pp);
1142                                 pp--;
1143                                 while (i-- > 0) {
1144                                         page_unlock(pp);
1145                                         pp--;
1146                                 }
1147                                 ppa[0] = NULL;
1148                                 off = save_off;
1149                                 goto again;
1150                         }
1151                         /*
 1152                          * szc is the same as for the previously locked pages
 1153                          * with the right identity. Since this page had the
 1154                          * correct szc after we locked it, it can't get freed
 1155                          * or destroyed and must have the expected identity.
1156                          */
1157                         ASSERT(!PP_ISFREE(pp));
1158                         if (pp->p_vnode != vp ||
1159                             pp->p_offset != off) {
1160                                 panic("page_exists_physcontig: "
1161                                     "large page identity doesn't match");
1162                         }
1163                         ppa[i] = pp;
1164                         ASSERT(pp->p_pagenum == pfn);
1165                 }
1166                 VM_STAT_ADD(page_exphcontg[8]);
1167                 ppa[pages] = NULL;
1168                 return (1);
1169         } else if (pszc >= szc) {
1170                 VM_STAT_ADD(page_exphcontg[9]);
1171                 if (!IS_P2ALIGNED(pfn, pages)) {
1172                         return (0);
1173                 }
1174                 return (1);
1175         }
1176 
1177         if (!IS_P2ALIGNED(pfn, pages)) {
1178                 VM_STAT_ADD(page_exphcontg[10]);
1179                 return (0);
1180         }
1181 
1182         if (page_numtomemseg_nolock(pfn) !=
1183             page_numtomemseg_nolock(pfn + pages - 1)) {
1184                 VM_STAT_ADD(page_exphcontg[11]);
1185                 return (0);
1186         }
1187 
1188         /*
 1189          * We loop across the pages 4 times to promote the page size.
1190          * We're extra cautious to promote page size atomically with respect
1191          * to everybody else.  But we can probably optimize into 1 loop if
1192          * this becomes an issue.
1193          */
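        /*
         * The four passes below: (1) exclusively lock each constituent page
         * and re-verify its identity, (2) reclaim the free constituents and
         * unload mappings from the rest, (3) set p_szc on every constituent,
         * and (4) either downgrade the locks and return the pages via ppa,
         * or simply unlock them.
         */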
1194 
1195         for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1196                 if (!page_trylock(pp, SE_EXCL)) {
1197                         VM_STAT_ADD(page_exphcontg[12]);
1198                         break;
1199                 }
1200                 /*
1201                  * Check whether p_pagenum was modified by DR.
1202                  */
1203                 if (pp->p_pagenum != pfn) {
1204                         page_unlock(pp);
1205                         break;
1206                 }
1207                 if (pp->p_vnode != vp ||
1208                     pp->p_offset != off) {
1209                         VM_STAT_ADD(page_exphcontg[13]);
1210                         page_unlock(pp);
1211                         break;
1212                 }
1213                 if (pp->p_szc >= szc) {
1214                         ASSERT(i == 0);
1215                         page_unlock(pp);
1216                         off = save_off;
1217                         goto again;
1218                 }
1219         }
1220 
1221         if (i != pages) {
1222                 VM_STAT_ADD(page_exphcontg[14]);
1223                 --pp;
1224                 while (i-- > 0) {
1225                         page_unlock(pp);
1226                         --pp;
1227                 }
1228                 return (0);
1229         }
1230 
1231         pp = rootpp;
1232         for (i = 0; i < pages; i++, pp++) {
1233                 if (PP_ISFREE(pp)) {
1234                         VM_STAT_ADD(page_exphcontg[15]);
1235                         ASSERT(!PP_ISAGED(pp));
1236                         ASSERT(pp->p_szc == 0);
1237                         if (!page_reclaim(pp, NULL)) {
1238                                 break;
1239                         }
1240                 } else {
1241                         ASSERT(pp->p_szc < szc);
1242                         VM_STAT_ADD(page_exphcontg[16]);
1243                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1244                 }
1245         }
1246         if (i < pages) {
1247                 VM_STAT_ADD(page_exphcontg[17]);
1248                 /*
1249                  * page_reclaim failed because we were out of memory.
 1250                  * Drop the rest of the locks and return because this page
 1251                  * must already have been reallocated anyway.
1252                  */
1253                 pp = rootpp;
1254                 for (j = 0; j < pages; j++, pp++) {
1255                         if (j != i) {
1256                                 page_unlock(pp);
1257                         }
1258                 }
1259                 return (0);
1260         }
1261 
1262         off = save_off;
1263         pp = rootpp;
1264         for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1265                 ASSERT(PAGE_EXCL(pp));
1266                 ASSERT(!PP_ISFREE(pp));
1267                 ASSERT(!hat_page_is_mapped(pp));
1268                 ASSERT(pp->p_vnode == vp);
1269                 ASSERT(pp->p_offset == off);
1270                 pp->p_szc = szc;
1271         }
1272         pp = rootpp;
1273         for (i = 0; i < pages; i++, pp++) {
1274                 if (ppa == NULL) {
1275                         page_unlock(pp);
1276                 } else {
1277                         ppa[i] = pp;
1278                         page_downgrade(ppa[i]);
1279                 }
1280         }
1281         if (ppa != NULL) {
1282                 ppa[pages] = NULL;
1283         }
1284         VM_STAT_ADD(page_exphcontg[18]);
1285         ASSERT(vp->v_pages != NULL);
1286         return (1);
1287 }
1288 
1289 /*
1290  * Determine whether a page with the specified [vp, off]
1291  * currently exists in the system and if so return its
1292  * size code. Obviously this should only be considered as
1293  * a hint since nothing prevents the page from disappearing
1294  * or appearing immediately after the return from this routine.
1295  */
1296 int
1297 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1298 {
1299         page_t          *pp;
1300         kmutex_t        *phm;
1301         ulong_t         index;
1302         int             rc = 0;
1303 
1304         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1305         ASSERT(szc != NULL);
1306         VM_STAT_ADD(page_exists_forreal_cnt);
1307 
1308         index = PAGE_HASH_FUNC(vp, off);
1309         phm = PAGE_HASH_MUTEX(index);
1310 
1311         mutex_enter(phm);
1312         pp = page_hash_search(index, vp, off);
1313         if (pp != NULL) {
1314                 *szc = pp->p_szc;
1315                 rc = 1;
1316         }
1317         mutex_exit(phm);
1318         return (rc);
1319 }
1320 
 1321 /* Wake up threads waiting for pages in page_create_get_something() */
1322 void
1323 wakeup_pcgs(void)
1324 {
1325         if (!CV_HAS_WAITERS(&pcgs_cv))
1326                 return;
1327         cv_broadcast(&pcgs_cv);
1328 }
1329 
1330 /*
1331  * 'freemem' is used all over the kernel as an indication of how many
1332  * pages are free (either on the cache list or on the free page list)
1333  * in the system.  In very few places is a really accurate 'freemem'
 1334  * needed.  To avoid contention on the lock protecting a
 1335  * single freemem, it was spread out into NCPU buckets.  set_freemem()
 1336  * sets freemem to the total of all NCPU buckets.  It is called from
1337  * clock() on each TICK.
1338  */
1339 void
1340 set_freemem()
1341 {
1342         struct pcf      *p;
1343         ulong_t         t;
1344         uint_t          i;
1345 
1346         t = 0;
1347         p = pcf;
1348         for (i = 0;  i < pcf_fanout; i++) {
1349                 t += p->pcf_count;
1350                 p++;
1351         }
1352         freemem = t;
1353 
1354         /*
1355          * Don't worry about grabbing mutex.  It's not that
1356          * critical if we miss a tick or two.  This is
1357          * where we wakeup possible delayers in
1358          * page_create_get_something().
1359          */
1360         wakeup_pcgs();
1361 }
1362 
1363 ulong_t
1364 get_freemem()
1365 {
1366         struct pcf      *p;
1367         ulong_t         t;
1368         uint_t          i;
1369 
1370         t = 0;
1371         p = pcf;
1372         for (i = 0; i < pcf_fanout; i++) {
1373                 t += p->pcf_count;
1374                 p++;
1375         }
1376         /*
1377          * We just calculated it, might as well set it.
1378          */
1379         freemem = t;
1380         return (t);
1381 }
1382 
1383 /*
1384  * Acquire all of the page cache & free (pcf) locks.
1385  */
1386 void
1387 pcf_acquire_all()
1388 {
1389         struct pcf      *p;
1390         uint_t          i;
1391 
1392         p = pcf;
1393         for (i = 0; i < pcf_fanout; i++) {
1394                 mutex_enter(&p->pcf_lock);
1395                 p++;
1396         }
1397 }
1398 
1399 /*
1400  * Release all the pcf_locks.
1401  */
1402 void
1403 pcf_release_all()
1404 {
1405         struct pcf      *p;
1406         uint_t          i;
1407 
1408         p = pcf;
1409         for (i = 0; i < pcf_fanout; i++) {
1410                 mutex_exit(&p->pcf_lock);
1411                 p++;
1412         }
1413 }
1414 
1415 /*
1416  * Inform the VM system that we need some pages freed up.
1417  * Calls must be symmetric, e.g.:
1418  *
1419  *      page_needfree(100);
1420  *      wait a bit;
1421  *      page_needfree(-100);
1422  */
1423 void
1424 page_needfree(spgcnt_t npages)
1425 {
1426         mutex_enter(&new_freemem_lock);
1427         needfree += npages;
1428         mutex_exit(&new_freemem_lock);
1429 }
1430 
1431 /*
1432  * Throttle for page_create(): try to prevent freemem from dropping
1433  * below throttlefree.  We can't provide a 100% guarantee because
1434  * KM_NOSLEEP allocations, page_reclaim(), and various other things
1435  * nibble away at the freelist.  However, we can block all PG_WAIT
1436  * allocations until memory becomes available.  The motivation is
1437  * that several things can fall apart when there's no free memory:
1438  *
1439  * (1) If pageout() needs memory to push a page, the system deadlocks.
1440  *
1441  * (2) By (broken) specification, timeout(9F) can neither fail nor
1442  *     block, so it has no choice but to panic the system if it
1443  *     cannot allocate a callout structure.
1444  *
1445  * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1446  *     it panics if it cannot allocate a callback structure.
1447  *
1448  * (4) Untold numbers of third-party drivers have not yet been hardened
1449  *     against KM_NOSLEEP and/or allocb() failures; they simply assume
1450  *     success and panic the system with a data fault on failure.
1451  *     (The long-term solution to this particular problem is to ship
1452  *     hostile fault-injecting DEBUG kernels with the DDK.)
1453  *
1454  * It is theoretically impossible to guarantee success of non-blocking
1455  * allocations, but in practice, this throttle is very hard to break.
1456  */
1457 static int
1458 page_create_throttle(pgcnt_t npages, int flags)
1459 {
1460         ulong_t fm;
1461         uint_t  i;
1462         pgcnt_t tf;     /* effective value of throttlefree */
1463 
1464         /*
1465          * Normal priority allocations.
1466          */
1467         if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1468                 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1469                 return (freemem >= npages + throttlefree);
1470         }
1471 
1472         /*
1473          * Never deny pages when:
1474          * - it's a thread that cannot block [NOMEMWAIT()]
1475          * - the allocation cannot block and must not fail
 1476          * - the allocation cannot block and has a pageout dispensation (PG_PUSHPAGE)
1477          */
1478         if (NOMEMWAIT() ||
1479             ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1480             ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1481                 return (1);
1482 
1483         /*
1484          * If the allocation can't block, we look favorably upon it
1485          * unless we're below pageout_reserve.  In that case we fail
1486          * the allocation because we want to make sure there are a few
1487          * pages available for pageout.
1488          */
1489         if ((flags & PG_WAIT) == 0)
1490                 return (freemem >= npages + pageout_reserve);
1491 
	/*
	 * Calculate the effective throttlefree value: PG_PUSHPAGE
	 * callers are allowed to dip into pageout_reserve.
	 */
1493         tf = throttlefree -
1494             ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1495 
1496         cv_signal(&proc_pageout->p_cv);
1497 
1498         for (;;) {
1499                 fm = 0;
1500                 pcf_acquire_all();
1501                 mutex_enter(&new_freemem_lock);
1502                 for (i = 0; i < pcf_fanout; i++) {
1503                         fm += pcf[i].pcf_count;
1504                         pcf[i].pcf_wait++;
1505                         mutex_exit(&pcf[i].pcf_lock);
1506                 }
1507                 freemem = fm;
1508                 if (freemem >= npages + tf) {
1509                         mutex_exit(&new_freemem_lock);
1510                         break;
1511                 }
1512                 needfree += npages;
1513                 freemem_wait++;
1514                 cv_wait(&freemem_cv, &new_freemem_lock);
1515                 freemem_wait--;
1516                 needfree -= npages;
1517                 mutex_exit(&new_freemem_lock);
1518         }
1519         return (1);
1520 }
1521 
1522 /*
1523  * page_create_wait() is called to either coalesce pages from the
1524  * different pcf buckets or to wait because there simply are not
1525  * enough pages to satisfy the caller's request.
1526  *
1527  * Sadly, this is called from platform/vm/vm_machdep.c
1528  */
1529 int
1530 page_create_wait(pgcnt_t npages, uint_t flags)
1531 {
1532         pgcnt_t         total;
1533         uint_t          i;
1534         struct pcf      *p;
1535 
1536         /*
1537          * Wait until there are enough free pages to satisfy our
1538          * entire request.
1539          * We set needfree += npages before prodding pageout, to make sure
1540          * it does real work when npages > lotsfree > freemem.
1541          */
1542         VM_STAT_ADD(page_create_not_enough);
1543 
1544         ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1545 checkagain:
1546         if ((flags & PG_NORELOC) &&
1547             kcage_freemem < kcage_throttlefree + npages)
1548                 (void) kcage_create_throttle(npages, flags);
1549 
1550         if (freemem < npages + throttlefree)
1551                 if (!page_create_throttle(npages, flags))
1552                         return (0);
1553 
1554         if (pcf_decrement_bucket(npages) ||
1555             pcf_decrement_multiple(&total, npages, 0))
1556                 return (1);
1557 
1558         /*
1559          * All of the pcf locks are held, there are not enough pages
	 * to satisfy the request (total < npages).
1561          * Be sure to acquire the new_freemem_lock before dropping
1562          * the pcf locks.  This prevents dropping wakeups in page_free().
1563          * The order is always pcf_lock then new_freemem_lock.
1564          *
1565          * Since we hold all the pcf locks, it is a good time to set freemem.
1566          *
1567          * If the caller does not want to wait, return now.
1568          * Else turn the pageout daemon loose to find something
1569          * and wait till it does.
1570          *
1571          */
1572         freemem = total;
1573 
1574         if ((flags & PG_WAIT) == 0) {
1575                 pcf_release_all();
1576 
1577                 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1578                 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1579                 return (0);
1580         }
1581 
1582         ASSERT(proc_pageout != NULL);
1583         cv_signal(&proc_pageout->p_cv);
1584 
1585         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1586             "page_create_sleep_start: freemem %ld needfree %ld",
1587             freemem, needfree);
1588 
1589         /*
1590          * We are going to wait.
1591          * We currently hold all of the pcf_locks,
1592          * get the new_freemem_lock (it protects freemem_wait),
1593          * before dropping the pcf_locks.
1594          */
1595         mutex_enter(&new_freemem_lock);
1596 
1597         p = pcf;
1598         for (i = 0; i < pcf_fanout; i++) {
1599                 p->pcf_wait++;
1600                 mutex_exit(&p->pcf_lock);
1601                 p++;
1602         }
1603 
1604         needfree += npages;
1605         freemem_wait++;
1606 
1607         cv_wait(&freemem_cv, &new_freemem_lock);
1608 
1609         freemem_wait--;
1610         needfree -= npages;
1611 
1612         mutex_exit(&new_freemem_lock);
1613 
1614         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1615             "page_create_sleep_end: freemem %ld needfree %ld",
1616             freemem, needfree);
1617 
1618         VM_STAT_ADD(page_create_not_enough_again);
1619         goto checkagain;
1620 }
1621 /*
1622  * A routine to do the opposite of page_create_wait().
1623  */
1624 void
1625 page_create_putback(spgcnt_t npages)
1626 {
1627         struct pcf      *p;
1628         pgcnt_t         lump;
1629         uint_t          *which;
1630 
1631         /*
1632          * When a contiguous lump is broken up, we have to
	 * deal with lots of pages (min 64), so let's spread
1634          * the wealth around.
1635          */
1636         lump = roundup(npages, pcf_fanout) / pcf_fanout;
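	/*
	 * For example (illustrative values): with npages == 100 and
	 * pcf_fanout == 8, lump = roundup(100, 8) / 8 = 13, so the first
	 * seven buckets visited get 13 pages each and the last gets the
	 * remaining 9.
	 */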
1637         freemem += npages;
1638 
1639         for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1640                 which = &p->pcf_count;
1641 
1642                 mutex_enter(&p->pcf_lock);
1643 
1644                 if (p->pcf_block) {
1645                         which = &p->pcf_reserve;
1646                 }
1647 
1648                 if (lump < npages) {
1649                         *which += (uint_t)lump;
1650                         npages -= lump;
1651                 } else {
1652                         *which += (uint_t)npages;
1653                         npages = 0;
1654                 }
1655 
1656                 if (p->pcf_wait) {
1657                         mutex_enter(&new_freemem_lock);
1658                         /*
1659                          * Check to see if some other thread
1660                          * is actually waiting.  Another bucket
1661                          * may have woken it up by now.  If there
1662                          * are no waiters, then set our pcf_wait
1663                          * count to zero to avoid coming in here
1664                          * next time.
1665                          */
1666                         if (freemem_wait) {
1667                                 if (npages > 1) {
1668                                         cv_broadcast(&freemem_cv);
1669                                 } else {
1670                                         cv_signal(&freemem_cv);
1671                                 }
1672                                 p->pcf_wait--;
1673                         } else {
1674                                 p->pcf_wait = 0;
1675                         }
1676                         mutex_exit(&new_freemem_lock);
1677                 }
1678                 mutex_exit(&p->pcf_lock);
1679         }
1680         ASSERT(npages == 0);
1681 }
1682 
1683 /*
1684  * A helper routine for page_create_get_something.
 * The indentation got too deep down there.
1686  * Unblock the pcf counters.  Any pages freed after
1687  * pcf_block got set are moved to pcf_count and
1688  * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1689  */
1690 static void
1691 pcgs_unblock(void)
1692 {
1693         int             i;
1694         struct pcf      *p;
1695 
1696         /* Update freemem while we're here. */
1697         freemem = 0;
1698         p = pcf;
1699         for (i = 0; i < pcf_fanout; i++) {
1700                 mutex_enter(&p->pcf_lock);
1701                 ASSERT(p->pcf_count == 0);
1702                 p->pcf_count = p->pcf_reserve;
1703                 p->pcf_block = 0;
1704                 freemem += p->pcf_count;
1705                 if (p->pcf_wait) {
1706                         mutex_enter(&new_freemem_lock);
1707                         if (freemem_wait) {
1708                                 if (p->pcf_reserve > 1) {
1709                                         cv_broadcast(&freemem_cv);
1710                                         p->pcf_wait = 0;
1711                                 } else {
1712                                         cv_signal(&freemem_cv);
1713                                         p->pcf_wait--;
1714                                 }
1715                         } else {
1716                                 p->pcf_wait = 0;
1717                         }
1718                         mutex_exit(&new_freemem_lock);
1719                 }
1720                 p->pcf_reserve = 0;
1721                 mutex_exit(&p->pcf_lock);
1722                 p++;
1723         }
1724 }
1725 
1726 /*
1727  * Called from page_create_va() when both the cache and free lists
1728  * have been checked once.
1729  *
1730  * Either returns a page or panics since the accounting was done
1731  * way before we got here.
1732  *
1733  * We don't come here often, so leave the accounting on permanently.
1734  */
1735 
1736 #define MAX_PCGS        100
1737 
1738 #ifdef  DEBUG
1739 #define PCGS_TRIES      100
1740 #else   /* DEBUG */
1741 #define PCGS_TRIES      10
1742 #endif  /* DEBUG */
1743 
1744 #ifdef  VM_STATS
1745 uint_t  pcgs_counts[PCGS_TRIES];
1746 uint_t  pcgs_too_many;
1747 uint_t  pcgs_entered;
1748 uint_t  pcgs_entered_noreloc;
1749 uint_t  pcgs_locked;
1750 uint_t  pcgs_cagelocked;
1751 #endif  /* VM_STATS */
1752 
1753 static page_t *
1754 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1755     caddr_t vaddr, uint_t flags)
1756 {
1757         uint_t          count;
1758         page_t          *pp;
1759         uint_t          locked, i;
1760         struct  pcf     *p;
1761         lgrp_t          *lgrp;
1762         int             cagelocked = 0;
1763 
1764         VM_STAT_ADD(pcgs_entered);
1765 
1766         /*
1767          * Tap any reserve freelists: if we fail now, we'll die
1768          * since the page(s) we're looking for have already been
1769          * accounted for.
1770          */
1771         flags |= PG_PANIC;
1772 
1773         if ((flags & PG_NORELOC) != 0) {
1774                 VM_STAT_ADD(pcgs_entered_noreloc);
1775                 /*
1776                  * Requests for free pages from critical threads
1777                  * such as pageout still won't throttle here, but
1778                  * we must try again, to give the cageout thread
1779                  * another chance to catch up. Since we already
1780                  * accounted for the pages, we had better get them
1781                  * this time.
1782                  *
1783                  * N.B. All non-critical threads acquire the pcgs_cagelock
1784                  * to serialize access to the freelists. This implements a
		 * turnstile-type synchronization to avoid starvation of
1786                  * critical requests for PG_NORELOC memory by non-critical
1787                  * threads: all non-critical threads must acquire a 'ticket'
1788                  * before passing through, which entails making sure
1789                  * kcage_freemem won't fall below minfree prior to grabbing
1790                  * pages from the freelists.
1791                  */
1792                 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1793                         mutex_enter(&pcgs_cagelock);
1794                         cagelocked = 1;
1795                         VM_STAT_ADD(pcgs_cagelocked);
1796                 }
1797         }
1798 
1799         /*
1800          * Time to get serious.
1801          * We failed to get a `correctly colored' page from both the
1802          * free and cache lists.
	 * We escalate in stages.
1804          *
	 * First try both lists without worrying about color.
1806          *
1807          * Then, grab all page accounting locks (ie. pcf[]) and
1808          * steal any pages that they have and set the pcf_block flag to
1809          * stop deletions from the lists.  This will help because
1810          * a page can get added to the free list while we are looking
1811          * at the cache list, then another page could be added to the cache
1812          * list allowing the page on the free list to be removed as we
1813          * move from looking at the cache list to the free list. This
1814          * could happen over and over. We would never find the page
1815          * we have accounted for.
1816          *
1817          * Noreloc pages are a subset of the global (relocatable) page pool.
1818          * They are not tracked separately in the pcf bins, so it is
1819          * impossible to know when doing pcf accounting if the available
1820          * page(s) are noreloc pages or not. When looking for a noreloc page
1821          * it is quite easy to end up here even if the global (relocatable)
1822          * page pool has plenty of free pages but the noreloc pool is empty.
1823          *
1824          * When the noreloc pool is empty (or low), additional noreloc pages
1825          * are created by converting pages from the global page pool. This
1826          * process will stall during pcf accounting if the pcf bins are
1827          * already locked. Such is the case when a noreloc allocation is
1828          * looping here in page_create_get_something waiting for more noreloc
1829          * pages to appear.
1830          *
1831          * Short of adding a new field to the pcf bins to accurately track
1832          * the number of free noreloc pages, we instead do not grab the
1833          * pcgs_lock, do not set the pcf blocks and do not timeout when
1834          * allocating a noreloc page. This allows noreloc allocations to
1835          * loop without blocking global page pool allocations.
1836          *
1837          * NOTE: the behaviour of page_create_get_something has not changed
1838          * for the case of global page pool allocations.
1839          */
1840 
1841         flags &= ~PG_MATCH_COLOR;
1842         locked = 0;
1843 #if defined(__i386) || defined(__amd64)
1844         flags = page_create_update_flags_x86(flags);
1845 #endif
1846 
1847         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1848 
1849         for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1850                 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1851                     flags, lgrp);
1852                 if (pp == NULL) {
1853                         pp = page_get_cachelist(vp, off, seg, vaddr,
1854                             flags, lgrp);
1855                 }
1856                 if (pp == NULL) {
1857                         /*
1858                          * Serialize.  Don't fight with other pcgs().
1859                          */
1860                         if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1861                                 mutex_enter(&pcgs_lock);
1862                                 VM_STAT_ADD(pcgs_locked);
1863                                 locked = 1;
1864                                 p = pcf;
1865                                 for (i = 0; i < pcf_fanout; i++) {
1866                                         mutex_enter(&p->pcf_lock);
1867                                         ASSERT(p->pcf_block == 0);
1868                                         p->pcf_block = 1;
1869                                         p->pcf_reserve = p->pcf_count;
1870                                         p->pcf_count = 0;
1871                                         mutex_exit(&p->pcf_lock);
1872                                         p++;
1873                                 }
1874                                 freemem = 0;
1875                         }
1876 
1877                         if (count) {
1878                                 /*
1879                                  * Since page_free() puts pages on
1880                                  * a list then accounts for it, we
1881                                  * just have to wait for page_free()
1882                                  * to unlock any page it was working
1883                                  * with. The page_lock()-page_reclaim()
1884                                  * path falls in the same boat.
1885                                  *
1886                                  * We don't need to check on the
1887                                  * PG_WAIT flag, we have already
1888                                  * accounted for the page we are
1889                                  * looking for in page_create_va().
1890                                  *
1891                                  * We just wait a moment to let any
1892                                  * locked pages on the lists free up,
1893                                  * then continue around and try again.
1894                                  *
1895                                  * Will be awakened by set_freemem().
1896                                  */
1897                                 mutex_enter(&pcgs_wait_lock);
1898                                 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1899                                 mutex_exit(&pcgs_wait_lock);
1900                         }
1901                 } else {
1902 #ifdef VM_STATS
1903                         if (count >= PCGS_TRIES) {
1904                                 VM_STAT_ADD(pcgs_too_many);
1905                         } else {
1906                                 VM_STAT_ADD(pcgs_counts[count]);
1907                         }
1908 #endif
1909                         if (locked) {
1910                                 pcgs_unblock();
1911                                 mutex_exit(&pcgs_lock);
1912                         }
1913                         if (cagelocked)
1914                                 mutex_exit(&pcgs_cagelock);
1915                         return (pp);
1916                 }
1917         }
1918         /*
1919          * we go down holding the pcf locks.
1920          */
1921         panic("no %spage found %d",
1922             ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1923         /*NOTREACHED*/
1924 }
1925 
1926 /*
1927  * Create enough pages for "bytes" worth of data starting at
1928  * "off" in "vp".
1929  *
1930  *      Where flag must be one of:
1931  *
1932  *              PG_EXCL:        Exclusive create (fail if any page already
1933  *                              exists in the page cache) which does not
1934  *                              wait for memory to become available.
1935  *
1936  *              PG_WAIT:        Non-exclusive create which can wait for
1937  *                              memory to become available.
1938  *
1939  *              PG_PHYSCONTIG:  Allocate physically contiguous pages.
1940  *                              (Not Supported)
1941  *
1942  * A doubly linked list of pages is returned to the caller.  Each page
1943  * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1944  * lock.
1945  *
1946  * Unable to change the parameters to page_create() in a minor release,
1947  * we renamed page_create() to page_create_va(), changed all known calls
1948  * from page_create() to page_create_va(), and created this wrapper.
1949  *
1950  * Upon a major release, we should break compatibility by deleting this
1951  * wrapper, and replacing all the strings "page_create_va", with "page_create".
1952  *
1953  * NOTE: There is a copy of this interface as page_create_io() in
1954  *       i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1955  *       there.
1956  */
1957 page_t *
1958 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1959 {
1960         caddr_t random_vaddr;
1961         struct seg kseg;
1962 
1963 #ifdef DEBUG
1964         cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1965             (void *)caller());
1966 #endif
1967 
1968         random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1969             (uintptr_t)(off >> PAGESHIFT));
1970         kseg.s_as = &kas;
1971 
1972         return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1973 }
1974 
1975 #ifdef DEBUG
1976 uint32_t pg_alloc_pgs_mtbf = 0;
1977 #endif
1978 
1979 /*
1980  * Used for large page support. It will attempt to allocate
1981  * a large page(s) off the freelist.
1982  *
 * Returns non-zero on failure.
1984  */
1985 int
1986 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1987     page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
1988 {
1989         pgcnt_t         npgs, curnpgs, totpgs;
1990         size_t          pgsz;
1991         page_t          *pplist = NULL, *pp;
1992         int             err = 0;
1993         lgrp_t          *lgrp;
1994 
1995         ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
1996         ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
1997 
1998         /*
1999          * Check if system heavily prefers local large pages over remote
2000          * on systems with multiple lgroups.
2001          */
2002         if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2003                 pgflags = PG_LOCAL;
2004         }
2005 
2006         VM_STAT_ADD(alloc_pages[0]);
2007 
2008 #ifdef DEBUG
2009         if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2010                 return (ENOMEM);
2011         }
2012 #endif
2013 
2014         /*
	 * Exactly one of basepp and ppa must be non-NULL:
	 * never both, and never neither.
2017          */
2018         ASSERT(basepp != NULL || ppa != NULL);
2019         ASSERT(basepp == NULL || ppa == NULL);
2020 
2021 #if defined(__i386) || defined(__amd64)
2022         while (page_chk_freelist(szc) == 0) {
2023                 VM_STAT_ADD(alloc_pages[8]);
2024                 if (anypgsz == 0 || --szc == 0)
2025                         return (ENOMEM);
2026         }
2027 #endif
2028 
2029         pgsz = page_get_pagesize(szc);
2030         totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2031 
2032         ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2033 
2034         (void) page_create_wait(npgs, PG_WAIT);
2035 
2036         while (npgs && szc) {
2037                 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2038                 if (pgflags == PG_LOCAL) {
2039                         pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2040                             pgflags, lgrp);
2041                         if (pp == NULL) {
2042                                 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2043                                     0, lgrp);
2044                         }
2045                 } else {
2046                         pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2047                             0, lgrp);
2048                 }
2049                 if (pp != NULL) {
2050                         VM_STAT_ADD(alloc_pages[1]);
2051                         page_list_concat(&pplist, &pp);
2052                         ASSERT(npgs >= curnpgs);
2053                         npgs -= curnpgs;
2054                 } else if (anypgsz) {
2055                         VM_STAT_ADD(alloc_pages[2]);
2056                         szc--;
2057                         pgsz = page_get_pagesize(szc);
2058                         curnpgs = pgsz >> PAGESHIFT;
2059                 } else {
2060                         VM_STAT_ADD(alloc_pages[3]);
2061                         ASSERT(npgs == totpgs);
2062                         page_create_putback(npgs);
2063                         return (ENOMEM);
2064                 }
2065         }
2066         if (szc == 0) {
2067                 VM_STAT_ADD(alloc_pages[4]);
2068                 ASSERT(npgs != 0);
2069                 page_create_putback(npgs);
2070                 err = ENOMEM;
2071         } else if (basepp != NULL) {
2072                 ASSERT(npgs == 0);
2073                 ASSERT(ppa == NULL);
2074                 *basepp = pplist;
2075         }
2076 
2077         npgs = totpgs - npgs;
2078         pp = pplist;
2079 
2080         /*
2081          * Clear the free and age bits. Also if we were passed in a ppa then
2082          * fill it in with all the constituent pages from the large page. But
2083          * if we failed to allocate all the pages just free what we got.
2084          */
2085         while (npgs != 0) {
2086                 ASSERT(PP_ISFREE(pp));
2087                 ASSERT(PP_ISAGED(pp));
2088                 if (ppa != NULL || err != 0) {
2089                         if (err == 0) {
2090                                 VM_STAT_ADD(alloc_pages[5]);
2091                                 PP_CLRFREE(pp);
2092                                 PP_CLRAGED(pp);
2093                                 page_sub(&pplist, pp);
2094                                 *ppa++ = pp;
2095                                 npgs--;
2096                         } else {
2097                                 VM_STAT_ADD(alloc_pages[6]);
2098                                 ASSERT(pp->p_szc != 0);
2099                                 curnpgs = page_get_pagecnt(pp->p_szc);
2100                                 page_list_break(&pp, &pplist, curnpgs);
2101                                 page_list_add_pages(pp, 0);
2102                                 page_create_putback(curnpgs);
2103                                 ASSERT(npgs >= curnpgs);
2104                                 npgs -= curnpgs;
2105                         }
2106                         pp = pplist;
2107                 } else {
2108                         VM_STAT_ADD(alloc_pages[7]);
2109                         PP_CLRFREE(pp);
2110                         PP_CLRAGED(pp);
2111                         pp = pp->p_next;
2112                         npgs--;
2113                 }
2114         }
2115         return (err);
2116 }
2117 
2118 /*
2119  * Get a single large page off of the freelists, and set it up for use.
2120  * Number of bytes requested must be a supported page size.
2121  *
2122  * Note that this call may fail even if there is sufficient
2123  * memory available or PG_WAIT is set, so the caller must
 * be willing to fall back on page_create_va(), block and retry,
2125  * or fail the requester.
2126  */
2127 page_t *
2128 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2129     struct seg *seg, caddr_t vaddr, void *arg)
2130 {
2131         pgcnt_t         npages;
2132         page_t          *pp;
2133         page_t          *rootpp;
2134         lgrp_t          *lgrp;
2135         lgrp_id_t       *lgrpid = (lgrp_id_t *)arg;
2136 
2137         ASSERT(vp != NULL);
2138 
2139         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2140             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2141         /* but no others */
2142 
2143         ASSERT((flags & PG_EXCL) == PG_EXCL);
2144 
2145         npages = btop(bytes);
2146 
2147         if (!kcage_on || panicstr) {
2148                 /*
2149                  * Cage is OFF, or we are single threaded in
2150                  * panic, so make everything a RELOC request.
2151                  */
2152                 flags &= ~PG_NORELOC;
2153         }
2154 
2155         /*
2156          * Make sure there's adequate physical memory available.
2157          * Note: PG_WAIT is ignored here.
2158          */
2159         if (freemem <= throttlefree + npages) {
2160                 VM_STAT_ADD(page_create_large_cnt[1]);
2161                 return (NULL);
2162         }
2163 
2164         /*
2165          * If cage is on, dampen draw from cage when available
2166          * cage space is low.
2167          */
2168         if ((flags & (PG_NORELOC | PG_WAIT)) ==  (PG_NORELOC | PG_WAIT) &&
2169             kcage_freemem < kcage_throttlefree + npages) {
2170 
2171                 /*
2172                  * The cage is on, the caller wants PG_NORELOC
2173                  * pages and available cage memory is very low.
2174                  * Call kcage_create_throttle() to attempt to
2175                  * control demand on the cage.
2176                  */
2177                 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2178                         VM_STAT_ADD(page_create_large_cnt[2]);
2179                         return (NULL);
2180                 }
2181         }
2182 
2183         if (!pcf_decrement_bucket(npages) &&
2184             !pcf_decrement_multiple(NULL, npages, 1)) {
2185                 VM_STAT_ADD(page_create_large_cnt[4]);
2186                 return (NULL);
2187         }
2188 
2189         /*
2190          * This is where this function behaves fundamentally differently
2191          * than page_create_va(); since we're intending to map the page
2192          * with a single TTE, we have to get it as a physically contiguous
2193          * hardware pagesize chunk.  If we can't, we fail.
2194          */
2195         if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2196             LGRP_EXISTS(lgrp_table[*lgrpid]))
2197                 lgrp = lgrp_table[*lgrpid];
2198         else
2199                 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2200 
2201         if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2202             bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2203                 page_create_putback(npages);
2204                 VM_STAT_ADD(page_create_large_cnt[5]);
2205                 return (NULL);
2206         }
2207 
2208         /*
	 * If we got the page with the wrong mtype, give it back.  This is a
	 * workaround for CR 6249718; once that CR is fixed we will never get
	 * inside this "if" and the workaround becomes just a nop.
2212          */
2213         if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2214                 page_list_add_pages(rootpp, 0);
2215                 page_create_putback(npages);
2216                 VM_STAT_ADD(page_create_large_cnt[6]);
2217                 return (NULL);
2218         }
2219 
2220         /*
2221          * If satisfying this request has left us with too little
2222          * memory, start the wheels turning to get some back.  The
2223          * first clause of the test prevents waking up the pageout
2224          * daemon in situations where it would decide that there's
2225          * nothing to do.
2226          */
2227         if (nscan < desscan && freemem < minfree) {
2228                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2229                     "pageout_cv_signal:freemem %ld", freemem);
2230                 cv_signal(&proc_pageout->p_cv);
2231         }
2232 
2233         pp = rootpp;
2234         while (npages--) {
2235                 ASSERT(PAGE_EXCL(pp));
2236                 ASSERT(pp->p_vnode == NULL);
2237                 ASSERT(!hat_page_is_mapped(pp));
2238                 PP_CLRFREE(pp);
2239                 PP_CLRAGED(pp);
2240                 if (!page_hashin(pp, vp, off, NULL))
2241                         panic("page_create_large: hashin failed: page %p",
2242                             (void *)pp);
2243                 page_io_lock(pp);
2244                 off += PAGESIZE;
2245                 pp = pp->p_next;
2246         }
2247 
2248         VM_STAT_ADD(page_create_large_cnt[0]);
2249         return (rootpp);
2250 }
2251 
2252 page_t *
2253 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2254     struct seg *seg, caddr_t vaddr)
2255 {
2256         page_t          *plist = NULL;
2257         pgcnt_t         npages;
2258         pgcnt_t         found_on_free = 0;
2259         pgcnt_t         pages_req;
2260         page_t          *npp = NULL;
2261         struct pcf      *p;
2262         lgrp_t          *lgrp;
2263 
2264         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2265             "page_create_start:vp %p off %llx bytes %lu flags %x",
2266             vp, off, bytes, flags);
2267 
2268         ASSERT(bytes != 0 && vp != NULL);
2269 
2270         if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2271                 panic("page_create: invalid flags");
2272                 /*NOTREACHED*/
2273         }
2274         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2275             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2276             /* but no others */
2277 
2278         pages_req = npages = btopr(bytes);
2279         /*
2280          * Try to see whether request is too large to *ever* be
2281          * satisfied, in order to prevent deadlock.  We arbitrarily
2282          * decide to limit maximum size requests to max_page_get.
2283          */
2284         if (npages >= max_page_get) {
2285                 if ((flags & PG_WAIT) == 0) {
2286                         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2287                             "page_create_toobig:vp %p off %llx npages "
2288                             "%lu max_page_get %lu",
2289                             vp, off, npages, max_page_get);
2290                         return (NULL);
2291                 } else {
2292                         cmn_err(CE_WARN,
2293                             "Request for too much kernel memory "
2294                             "(%lu bytes), will hang forever", bytes);
2295                         for (;;)
2296                                 delay(1000000000);
2297                 }
2298         }
2299 
2300         if (!kcage_on || panicstr) {
2301                 /*
2302                  * Cage is OFF, or we are single threaded in
2303                  * panic, so make everything a RELOC request.
2304                  */
2305                 flags &= ~PG_NORELOC;
2306         }
2307 
2308         if (freemem <= throttlefree + npages)
2309                 if (!page_create_throttle(npages, flags))
2310                         return (NULL);
2311 
2312         /*
2313          * If cage is on, dampen draw from cage when available
2314          * cage space is low.
2315          */
2316         if ((flags & PG_NORELOC) &&
2317             kcage_freemem < kcage_throttlefree + npages) {
2318 
2319                 /*
2320                  * The cage is on, the caller wants PG_NORELOC
2321                  * pages and available cage memory is very low.
2322                  * Call kcage_create_throttle() to attempt to
2323                  * control demand on the cage.
2324                  */
2325                 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2326                         return (NULL);
2327         }
2328 
2329         VM_STAT_ADD(page_create_cnt[0]);
2330 
2331         if (!pcf_decrement_bucket(npages)) {
2332                 /*
2333                  * Have to look harder.  If npages is greater than
2334                  * one, then we might have to coalesce the counters.
2335                  *
2336                  * Go wait.  We come back having accounted
2337                  * for the memory.
2338                  */
2339                 VM_STAT_ADD(page_create_cnt[1]);
2340                 if (!page_create_wait(npages, flags)) {
2341                         VM_STAT_ADD(page_create_cnt[2]);
2342                         return (NULL);
2343                 }
2344         }
2345 
2346         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2347             "page_create_success:vp %p off %llx", vp, off);
2348 
2349         /*
2350          * If satisfying this request has left us with too little
2351          * memory, start the wheels turning to get some back.  The
2352          * first clause of the test prevents waking up the pageout
2353          * daemon in situations where it would decide that there's
2354          * nothing to do.
2355          */
2356         if (nscan < desscan && freemem < minfree) {
2357                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2358                     "pageout_cv_signal:freemem %ld", freemem);
2359                 cv_signal(&proc_pageout->p_cv);
2360         }
2361 
2362         /*
2363          * Loop around collecting the requested number of pages.
2364          * Most of the time, we have to `create' a new page. With
2365          * this in mind, pull the page off the free list before
2366          * getting the hash lock.  This will minimize the hash
2367          * lock hold time, nesting, and the like.  If it turns
2368          * out we don't need the page, we put it back at the end.
2369          */
2370         while (npages--) {
2371                 page_t          *pp;
2372                 kmutex_t        *phm = NULL;
2373                 ulong_t         index;
2374 
2375                 index = PAGE_HASH_FUNC(vp, off);
2376 top:
2377                 ASSERT(phm == NULL);
2378                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2379                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2380 
2381                 if (npp == NULL) {
2382                         /*
2383                          * Try to get a page from the freelist (ie,
2384                          * a page with no [vp, off] tag).  If that
2385                          * fails, use the cachelist.
2386                          *
2387                          * During the first attempt at both the free
2388                          * and cache lists we try for the correct color.
2389                          */
2390                         /*
2391                          * XXXX-how do we deal with virtual indexed
			 * caches and colors?
2393                          */
2394                         VM_STAT_ADD(page_create_cnt[4]);
2395                         /*
2396                          * Get lgroup to allocate next page of shared memory
2397                          * from and use it to specify where to allocate
2398                          * the physical memory
2399                          */
2400                         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2401                         npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2402                             flags | PG_MATCH_COLOR, lgrp);
2403                         if (npp == NULL) {
2404                                 npp = page_get_cachelist(vp, off, seg,
2405                                     vaddr, flags | PG_MATCH_COLOR, lgrp);
2406                                 if (npp == NULL) {
2407                                         npp = page_create_get_something(vp,
2408                                             off, seg, vaddr,
2409                                             flags & ~PG_MATCH_COLOR);
2410                                 }
2411 
2412                                 if (PP_ISAGED(npp) == 0) {
2413                                         /*
2414                                          * Since this page came from the
2415                                          * cachelist, we must destroy the
2416                                          * old vnode association.
2417                                          */
2418                                         page_hashout(npp, NULL);
2419                                 }
2420                         }
2421                 }
2422 
2423                 /*
2424                  * We own this page!
2425                  */
2426                 ASSERT(PAGE_EXCL(npp));
2427                 ASSERT(npp->p_vnode == NULL);
2428                 ASSERT(!hat_page_is_mapped(npp));
2429                 PP_CLRFREE(npp);
2430                 PP_CLRAGED(npp);
2431 
2432                 /*
		 * Here we have a page in our hot little mitts and are
2434                  * just waiting to stuff it on the appropriate lists.
2435                  * Get the mutex and check to see if it really does
2436                  * not exist.
2437                  */
2438                 phm = PAGE_HASH_MUTEX(index);
2439                 mutex_enter(phm);
2440                 pp = page_hash_search(index, vp, off);
2441                 if (pp == NULL) {
2442                         VM_STAT_ADD(page_create_new);
2443                         pp = npp;
2444                         npp = NULL;
2445                         if (!page_hashin(pp, vp, off, phm)) {
2446                                 /*
2447                                  * Since we hold the page hash mutex and
2448                                  * just searched for this page, page_hashin
2449                                  * had better not fail.  If it does, that
				 * means some thread did not follow the
2451                                  * page hash mutex rules.  Panic now and
2452                                  * get it over with.  As usual, go down
2453                                  * holding all the locks.
2454                                  */
2455                                 ASSERT(MUTEX_HELD(phm));
2456                                 panic("page_create: "
2457                                     "hashin failed %p %p %llx %p",
2458                                     (void *)pp, (void *)vp, off, (void *)phm);
2459                                 /*NOTREACHED*/
2460                         }
2461                         ASSERT(MUTEX_HELD(phm));
2462                         mutex_exit(phm);
2463                         phm = NULL;
2464 
2465                         /*
2466                          * Hat layer locking need not be done to set
2467                          * the following bits since the page is not hashed
2468                          * and was on the free list (i.e., had no mappings).
2469                          *
2470                          * Set the reference bit to protect
2471                          * against immediate pageout
2472                          *
2473                          * XXXmh modify freelist code to set reference
2474                          * bit so we don't have to do it here.
2475                          */
2476                         page_set_props(pp, P_REF);
2477                         found_on_free++;
2478                 } else {
2479                         VM_STAT_ADD(page_create_exists);
2480                         if (flags & PG_EXCL) {
2481                                 /*
2482                                  * Found an existing page, and the caller
2483                                  * wanted all new pages.  Undo all of the work
2484                                  * we have done.
2485                                  */
2486                                 mutex_exit(phm);
2487                                 phm = NULL;
2488                                 while (plist != NULL) {
2489                                         pp = plist;
2490                                         page_sub(&plist, pp);
2491                                         page_io_unlock(pp);
2492                                         /* large pages should not end up here */
2493                                         ASSERT(pp->p_szc == 0);
2494                                         /*LINTED: constant in conditional ctx*/
2495                                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
2496                                 }
2497                                 VM_STAT_ADD(page_create_found_one);
2498                                 goto fail;
2499                         }
2500                         ASSERT(flags & PG_WAIT);
2501                         if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2502                                 /*
2503                                  * Start all over again if we blocked trying
2504                                  * to lock the page.
2505                                  */
2506                                 mutex_exit(phm);
2507                                 VM_STAT_ADD(page_create_page_lock_failed);
2508                                 phm = NULL;
2509                                 goto top;
2510                         }
2511                         mutex_exit(phm);
2512                         phm = NULL;
2513 
2514                         if (PP_ISFREE(pp)) {
2515                                 ASSERT(PP_ISAGED(pp) == 0);
2516                                 VM_STAT_ADD(pagecnt.pc_get_cache);
2517                                 page_list_sub(pp, PG_CACHE_LIST);
2518                                 PP_CLRFREE(pp);
2519                                 found_on_free++;
2520                         }
2521                 }
2522 
2523                 /*
2524                  * Got a page!  It is locked.  Acquire the i/o
2525                  * lock since we are going to use the p_next and
2526                  * p_prev fields to link the requested pages together.
2527                  */
2528                 page_io_lock(pp);
2529                 page_add(&plist, pp);
2530                 plist = plist->p_next;
2531                 off += PAGESIZE;
2532                 vaddr += PAGESIZE;
2533         }
2534 
2535         ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2536 fail:
2537         if (npp != NULL) {
2538                 /*
2539                  * Did not need this page after all.
2540                  * Put it back on the free list.
2541                  */
2542                 VM_STAT_ADD(page_create_putbacks);
2543                 PP_SETFREE(npp);
2544                 PP_SETAGED(npp);
2545                 npp->p_offset = (u_offset_t)-1;
2546                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2547                 page_unlock(npp);
2548 
2549         }
2550 
2551         ASSERT(pages_req >= found_on_free);
2552 
2553         {
2554                 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2555 
2556                 if (overshoot) {
2557                         VM_STAT_ADD(page_create_overshoot);
2558                         p = &pcf[PCF_INDEX()];
2559                         mutex_enter(&p->pcf_lock);
2560                         if (p->pcf_block) {
2561                                 p->pcf_reserve += overshoot;
2562                         } else {
2563                                 p->pcf_count += overshoot;
2564                                 if (p->pcf_wait) {
2565                                         mutex_enter(&new_freemem_lock);
2566                                         if (freemem_wait) {
2567                                                 cv_signal(&freemem_cv);
2568                                                 p->pcf_wait--;
2569                                         } else {
2570                                                 p->pcf_wait = 0;
2571                                         }
2572                                         mutex_exit(&new_freemem_lock);
2573                                 }
2574                         }
2575                         mutex_exit(&p->pcf_lock);
2576                         /* freemem is approximate, so this test OK */
2577                         if (!p->pcf_block)
2578                                 freemem += overshoot;
2579                 }
2580         }
2581 
2582         return (plist);
2583 }
2584 
2585 /*
2586  * One or more constituent pages of this large page has been marked
2587  * toxic. Simply demote the large page to PAGESIZE pages and let
2588  * page_free() handle it. This routine should only be called by
 * large page free routines (page_free_pages() and page_destroy_pages()).
2590  * All pages are locked SE_EXCL and have already been marked free.
2591  */
2592 static void
2593 page_free_toxic_pages(page_t *rootpp)
2594 {
2595         page_t  *tpp;
2596         pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2597         uint_t  szc = rootpp->p_szc;
2598 
2599         for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2600                 ASSERT(tpp->p_szc == szc);
2601                 ASSERT((PAGE_EXCL(tpp) &&
2602                     !page_iolock_assert(tpp)) || panicstr);
2603                 tpp->p_szc = 0;
2604         }
2605 
2606         while (rootpp != NULL) {
2607                 tpp = rootpp;
2608                 page_sub(&rootpp, tpp);
2609                 ASSERT(PP_ISFREE(tpp));
2610                 PP_CLRFREE(tpp);
2611                 page_free(tpp, 1);
2612         }
2613 }
2614 
2615 /*
2616  * Put page on the "free" list.
2617  * The free list is really two lists maintained by
2618  * the PSM of whatever machine we happen to be on.
2619  */
2620 void
2621 page_free(page_t *pp, int dontneed)
2622 {
2623         struct pcf      *p;
2624         uint_t          pcf_index;
2625 
2626         ASSERT((PAGE_EXCL(pp) &&
2627             !page_iolock_assert(pp)) || panicstr);
2628 
2629         if (PP_ISFREE(pp)) {
2630                 panic("page_free: page %p is free", (void *)pp);
2631         }
2632 
2633         if (pp->p_szc != 0) {
2634                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2635                     PP_ISKAS(pp)) {
2636                         panic("page_free: anon or kernel "
2637                             "or no vnode large page %p", (void *)pp);
2638                 }
2639                 page_demote_vp_pages(pp);
2640                 ASSERT(pp->p_szc == 0);
2641         }
2642 
2643         /*
2644          * The page_struct_lock need not be acquired to examine these
2645          * fields since the page has an "exclusive" lock.
2646          */
2647         if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2648             pp->p_slckcnt != 0) {
2649                 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2650                     "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2651                     pp->p_cowcnt, pp->p_slckcnt);
2652                 /*NOTREACHED*/
2653         }
2654 
2655         ASSERT(!hat_page_getshare(pp));
2656 
2657         PP_SETFREE(pp);
2658         ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2659             !hat_ismod(pp));
2660         page_clr_all_props(pp);
2661         ASSERT(!hat_page_getshare(pp));
2662 
2663         /*
2664          * Now we add the page to the head of the free list.
2665          * But if this page is associated with a paged vnode
2666          * then we adjust the head forward so that the page is
2667          * effectively at the end of the list.
2668          */
2669         if (pp->p_vnode == NULL) {
2670                 /*
2671                  * Page has no identity, put it on the free list.
2672                  */
2673                 PP_SETAGED(pp);
2674                 pp->p_offset = (u_offset_t)-1;
2675                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2676                 VM_STAT_ADD(pagecnt.pc_free_free);
2677                 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2678                     "page_free_free:pp %p", pp);
2679         } else {
2680                 PP_CLRAGED(pp);
2681 
2682                 if (!dontneed) {
2683                         /* move it to the tail of the list */
2684                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2685 
2686                         VM_STAT_ADD(pagecnt.pc_free_cache);
2687                         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2688                             "page_free_cache_tail:pp %p", pp);
2689                 } else {
2690                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2691 
2692                         VM_STAT_ADD(pagecnt.pc_free_dontneed);
2693                         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2694                             "page_free_cache_head:pp %p", pp);
2695                 }
2696         }
2697         page_unlock(pp);
2698 
2699         /*
2700          * Now do the `freemem' accounting.
2701          */
2702         pcf_index = PCF_INDEX();
2703         p = &pcf[pcf_index];
2704 
2705         mutex_enter(&p->pcf_lock);
2706         if (p->pcf_block) {
2707                 p->pcf_reserve += 1;
2708         } else {
2709                 p->pcf_count += 1;
2710                 if (p->pcf_wait) {
2711                         mutex_enter(&new_freemem_lock);
2712                         /*
2713                          * Check to see if some other thread
2714                          * is actually waiting.  Another bucket
2715                          * may have woken it up by now.  If there
2716                          * are no waiters, then set our pcf_wait
2717                          * count to zero to avoid coming in here
2718                          * next time.  Also, since only one page
2719                          * was put on the free list, just wake
2720                          * up one waiter.
2721                          */
2722                         if (freemem_wait) {
2723                                 cv_signal(&freemem_cv);
2724                                 p->pcf_wait--;
2725                         } else {
2726                                 p->pcf_wait = 0;
2727                         }
2728                         mutex_exit(&new_freemem_lock);
2729                 }
2730         }
2731         mutex_exit(&p->pcf_lock);
2732 
2733         /* freemem is approximate, so this test OK */
2734         if (!p->pcf_block)
2735                 freemem += 1;
2736 }
2737 
2738 /*
 * Put page on the "free" list during initial startup.
 * This happens during initial single-threaded execution.
2741  */
2742 void
2743 page_free_at_startup(page_t *pp)
2744 {
2745         struct pcf      *p;
2746         uint_t          pcf_index;
2747 
2748         page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2749         VM_STAT_ADD(pagecnt.pc_free_free);
2750 
2751         /*
2752          * Now do the `freemem' accounting.
2753          */
2754         pcf_index = PCF_INDEX();
2755         p = &pcf[pcf_index];
2756 
2757         ASSERT(p->pcf_block == 0);
2758         ASSERT(p->pcf_wait == 0);
2759         p->pcf_count += 1;
2760 
2761         /* freemem is approximate, so this is OK */
2762         freemem += 1;
2763 }
2764 
2765 void
2766 page_free_pages(page_t *pp)
2767 {
2768         page_t  *tpp, *rootpp = NULL;
2769         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2770         pgcnt_t i;
2771         uint_t  szc = pp->p_szc;
2772 
2773         VM_STAT_ADD(pagecnt.pc_free_pages);
2774         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2775             "page_free_free:pp %p", pp);
2776 
2777         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2778         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2779                 panic("page_free_pages: not root page %p", (void *)pp);
2780                 /*NOTREACHED*/
2781         }
2782 
2783         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2784                 ASSERT((PAGE_EXCL(tpp) &&
2785                     !page_iolock_assert(tpp)) || panicstr);
2786                 if (PP_ISFREE(tpp)) {
2787                         panic("page_free_pages: page %p is free", (void *)tpp);
2788                         /*NOTREACHED*/
2789                 }
2790                 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2791                     tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2792                         panic("page_free_pages %p", (void *)tpp);
2793                         /*NOTREACHED*/
2794                 }
2795 
2796                 ASSERT(!hat_page_getshare(tpp));
2797                 ASSERT(tpp->p_vnode == NULL);
2798                 ASSERT(tpp->p_szc == szc);
2799 
2800                 PP_SETFREE(tpp);
2801                 page_clr_all_props(tpp);
2802                 PP_SETAGED(tpp);
2803                 tpp->p_offset = (u_offset_t)-1;
2804                 ASSERT(tpp->p_next == tpp);
2805                 ASSERT(tpp->p_prev == tpp);
2806                 page_list_concat(&rootpp, &tpp);
2807         }
2808         ASSERT(rootpp == pp);
2809 
2810         page_list_add_pages(rootpp, 0);
2811         page_create_putback(pgcnt);
2812 }
2813 
2814 int free_pages = 1;
2815 
2816 /*
2817  * This routine attempts to return pages to the cachelist via page_release().
2818  * It does not *have* to be successful in all cases, since the pageout scanner
2819  * will catch any pages it misses.  It does need to be fast and not introduce
2820  * too much overhead.
2821  *
2822  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2823  * don't lock and retry.  This is ok, since the page scanner will eventually
2824  * find any page we miss in free_vp_pages().
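      *
      * Pages that are still shared, that can't be locked without waiting,
      * or that turn out to have the wrong identity once locked are simply
      * skipped.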
2825  */
2826 void
2827 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2828 {
2829         page_t *pp;
2830         u_offset_t eoff;
2831         extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2832 
2833         eoff = off + len;
2834 
2835         if (free_pages == 0)
2836                 return;
2837         if (swap_in_range(vp, off, len))
2838                 return;
2839 
2840         for (; off < eoff; off += PAGESIZE) {
2841 
2842                 /*
2843                  * Find the page using a fast but inexact search.  It'll be OK
2844                  * if a few pages slip through the cracks here.
2845                  */
2846                 pp = page_exists(vp, off);
2847 
2848                 /*
2849                  * If we didn't find the page (it may not exist), if it is
2850                  * free, if it still looks in use (shared), or if we can't
2851                  * lock it, just give up.
2852                  */
2853                 if (pp == NULL ||
2854                     PP_ISFREE(pp) ||
2855                     page_share_cnt(pp) > 0 ||
2856                     !page_trylock(pp, SE_EXCL))
2857                         continue;
2858 
2859                 /*
2860                  * Once we have locked pp, verify that it's still the
2861                  * correct page and not already free
2862                  */
2863                 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2864                 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2865                         page_unlock(pp);
2866                         continue;
2867                 }
2868 
2869                 /*
2870                  * try to release the page...
2871                  */
2872                 (void) page_release(pp, 1);
2873         }
2874 }
2875 
2876 /*
2877  * Reclaim the given page from the free list.
2878  * If pp is part of a large page, only the given constituent page is reclaimed
2879  * and the large page it belonged to will be demoted.  This can only happen
2880  * if the page is not on the cachelist.
2881  *
2882  * Returns 1 on success or 0 on failure.
2883  *
2884  * The page is unlocked if it can't be reclaimed (when freemem == 0).
2885  * If `lock' is non-null, it will be dropped and re-acquired if
2886  * the routine must wait while freemem is 0.
2887  *
2888  * As it turns out, boot_getpages() does this.  It picks a page,
2889  * based on where OBP mapped in some address, gets its pfn, searches
2890  * the memsegs, locks the page, then pulls it off the free list!
2891  */
2892 int
2893 page_reclaim(page_t *pp, kmutex_t *lock)
2894 {
2895         struct pcf      *p;
2896         struct cpu      *cpup;
2897         int             enough;
2898         uint_t          i;
2899 
2900         ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2901         ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2902 
2903         /*
2904          * If `freemem' is 0, we cannot reclaim this page from the
2905          * freelist, so release every lock we might hold: the page,
2906          * and the `lock' before blocking.
2907          *
2908          * The only way `freemem' can become 0 while there are pages
2909          * marked free (have their p->p_free bit set) is when the
2910          * system is low on memory and doing a page_create().
2911          * page_create() decreases `freemem' by the requested amount up
2912          * front so that once it starts acquiring pages it is guaranteed
2913          * to get all that it needs.  So, we need to release this page
2914          * and let page_create() have it.
2915          *
2916          * Since `freemem' being zero is not supposed to happen, just
2917          * use the usual hash stuff as a starting point.  If that bucket
2918          * is empty, then assume the worst, and start at the beginning
2919          * of the pcf array.  If we always start at the beginning
2920          * when acquiring more than one pcf lock, there won't be any
2921          * deadlock problems.
2922          */
2923 
2924         /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2925 
2926         if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2927                 pcf_acquire_all();
2928                 goto page_reclaim_nomem;
2929         }
2930 
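             /*
              * First try to take our page's worth of accounting out of a
              * single pcf bucket; if it comes up empty, we fall back to
              * scanning all of the buckets below.
              */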
2931         enough = pcf_decrement_bucket(1);
2932 
2933         if (!enough) {
2934                 VM_STAT_ADD(page_reclaim_zero);
2935                 /*
2936                  * Check again.  It's possible that some other thread
2937                  * could have been right behind us, and added one
2938                  * to a list somewhere.  Acquire each of the pcf locks
2939                  * until we find a page.
2940                  */
2941                 p = pcf;
2942                 for (i = 0; i < pcf_fanout; i++) {
2943                         mutex_enter(&p->pcf_lock);
2944                         if (p->pcf_count >= 1) {
2945                                 p->pcf_count -= 1;
2946                                 /*
2947                                  * freemem is not protected by any lock. Thus,
2948                                  * we cannot have any assertion containing
2949                                  * freemem here.
2950                                  */
2951                                 freemem -= 1;
2952                                 enough = 1;
2953                                 break;
2954                         }
2955                         p++;
2956                 }
2957 
2958                 if (!enough) {
2959 page_reclaim_nomem:
2960                         /*
2961                          * We really can't have page `pp'.
2962                          * Time for the no-memory dance with
2963                          * page_free().  This is just like
2964                          * page_create_wait(), plus the added
2965                          * attraction of releasing whatever mutex
2966                          * the caller passed in via `lock'.
2967                          * page_unlock() will wake up any thread
2968                          * waiting around for this page.
2969                          */
2970                         if (lock) {
2971                                 VM_STAT_ADD(page_reclaim_zero_locked);
2972                                 mutex_exit(lock);
2973                         }
2974                         page_unlock(pp);
2975 
2976                         /*
2977                          * get this before we drop all the pcf locks.
2978                          */
2979                         mutex_enter(&new_freemem_lock);
2980 
2981                         p = pcf;
2982                         for (i = 0; i < pcf_fanout; i++) {
2983                                 p->pcf_wait++;
2984                                 mutex_exit(&p->pcf_lock);
2985                                 p++;
2986                         }
2987 
2988                         freemem_wait++;
2989                         cv_wait(&freemem_cv, &new_freemem_lock);
2990                         freemem_wait--;
2991 
2992                         mutex_exit(&new_freemem_lock);
2993 
2994                         if (lock) {
2995                                 mutex_enter(lock);
2996                         }
2997                         return (0);
2998                 }
2999 
3000                 /*
3001                  * The pcf accounting has been done and none of the
3002                  * pcf_wait flags have been set, so drop the locks and
3003                  * continue on.
3004                  */
3005                 while (p >= pcf) {
3006                         mutex_exit(&p->pcf_lock);
3007                         p--;
3008                 }
3009         }
3010 
3012         VM_STAT_ADD(pagecnt.pc_reclaim);
3013 
3014         /*
3015          * page_list_sub will handle the case where pp is a large page.
3016          * It's possible that the page was promoted while on the freelist
3017          */
3018         if (PP_ISAGED(pp)) {
3019                 page_list_sub(pp, PG_FREE_LIST);
3020                 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3021                     "page_reclaim_free:pp %p", pp);
3022         } else {
3023                 page_list_sub(pp, PG_CACHE_LIST);
3024                 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3025                     "page_reclaim_cache:pp %p", pp);
3026         }
3027 
3028         /*
3029          * clear the p_free & p_age bits since this page is no longer
3030          * on the free list.  Notice that there is a brief window during
3031          * which a page is marked as free but is not on the list.
3032          *
3033          * Set the reference bit to protect against immediate pageout.
3034          */
3035         PP_CLRFREE(pp);
3036         PP_CLRAGED(pp);
3037         page_set_props(pp, P_REF);
3038 
3039         CPU_STATS_ENTER_K();
3040         cpup = CPU;     /* get cpup now that CPU cannot change */
3041         CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3042         CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3043         CPU_STATS_EXIT_K();
3044         ASSERT(pp->p_szc == 0);
3045 
3046         return (1);
3047 }
3048 
3049 /*
3050  * Destroy identity of the page and put it back on
3051  * the page free list.  Assumes that the caller has
3052  * acquired the "exclusive" lock on the page.
3053  */
3054 void
3055 page_destroy(page_t *pp, int dontfree)
3056 {
3057         ASSERT((PAGE_EXCL(pp) &&
3058             !page_iolock_assert(pp)) || panicstr);
3059         ASSERT(pp->p_slckcnt == 0 || panicstr);
3060 
3061         if (pp->p_szc != 0) {
3062                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3063                     PP_ISKAS(pp)) {
3064                         panic("page_destroy: anon or kernel or no vnode "
3065                             "large page %p", (void *)pp);
3066                 }
3067                 page_demote_vp_pages(pp);
3068                 ASSERT(pp->p_szc == 0);
3069         }
3070 
3071         TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3072 
3073         /*
3074          * Unload translations, if any, then hash out the
3075          * page to erase its identity.
3076          */
3077         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3078         page_hashout(pp, NULL);
3079 
3080         if (!dontfree) {
3081                 /*
3082                  * Acquire the "freemem_lock" for availrmem.
3083                  * The page_struct_lock need not be acquired for lckcnt
3084                  * and cowcnt since the page has an "exclusive" lock.
3085                  * We are doing a modified version of page_pp_unlock here.
3086                  */
3087                 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3088                         mutex_enter(&freemem_lock);
3089                         if (pp->p_lckcnt != 0) {
3090                                 availrmem++;
3091                                 pages_locked--;
3092                                 pp->p_lckcnt = 0;
3093                         }
3094                         if (pp->p_cowcnt != 0) {
3095                                 availrmem += pp->p_cowcnt;
3096                                 pages_locked -= pp->p_cowcnt;
3097                                 pp->p_cowcnt = 0;
3098                         }
3099                         mutex_exit(&freemem_lock);
3100                 }
3101                 /*
3102                  * Put the page on the "free" list.
3103                  */
3104                 page_free(pp, 0);
3105         }
3106 }
3107 
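     /*
      * Large page counterpart of page_destroy(): unload translations, hash
      * out, and free every constituent page of the large page rooted at
      * `pp'.  Any outstanding lock or cow claims are credited back to
      * availrmem before the pages are returned to the free list.
      */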
3108 void
3109 page_destroy_pages(page_t *pp)
3110 {
3111 
3112         page_t  *tpp, *rootpp = NULL;
3113         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3114         pgcnt_t i, pglcks = 0;
3115         uint_t  szc = pp->p_szc;
3116 
3117         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3118 
3119         VM_STAT_ADD(pagecnt.pc_destroy_pages);
3120 
3121         TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3122 
3123         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3124                 panic("page_destroy_pages: not root page %p", (void *)pp);
3125                 /*NOTREACHED*/
3126         }
3127 
3128         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3129                 ASSERT((PAGE_EXCL(tpp) &&
3130                     !page_iolock_assert(tpp)) || panicstr);
3131                 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3132                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3133                 page_hashout(tpp, NULL);
3134                 ASSERT(tpp->p_offset == (u_offset_t)-1);
3135                 if (tpp->p_lckcnt != 0) {
3136                         pglcks++;
3137                         tpp->p_lckcnt = 0;
3138                 } else if (tpp->p_cowcnt != 0) {
3139                         pglcks += tpp->p_cowcnt;
3140                         tpp->p_cowcnt = 0;
3141                 }
3142                 ASSERT(!hat_page_getshare(tpp));
3143                 ASSERT(tpp->p_vnode == NULL);
3144                 ASSERT(tpp->p_szc == szc);
3145 
3146                 PP_SETFREE(tpp);
3147                 page_clr_all_props(tpp);
3148                 PP_SETAGED(tpp);
3149                 ASSERT(tpp->p_next == tpp);
3150                 ASSERT(tpp->p_prev == tpp);
3151                 page_list_concat(&rootpp, &tpp);
3152         }
3153 
3154         ASSERT(rootpp == pp);
3155         if (pglcks != 0) {
3156                 mutex_enter(&freemem_lock);
3157                 availrmem += pglcks;
3158                 mutex_exit(&freemem_lock);
3159         }
3160 
3161         page_list_add_pages(rootpp, 0);
3162         page_create_putback(pgcnt);
3163 }
3164 
3165 /*
3166  * Similar to page_destroy(), but destroys pages which are
3167  * locked and known to be on the page free list.  Since
3168  * the page is known to be free and locked, no one can access
3169  * it.
3170  *
3171  * Also, the number of free pages does not change.
3172  */
3173 void
3174 page_destroy_free(page_t *pp)
3175 {
3176         ASSERT(PAGE_EXCL(pp));
3177         ASSERT(PP_ISFREE(pp));
3178         ASSERT(pp->p_vnode);
3179         ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3180         ASSERT(!hat_page_is_mapped(pp));
3181         ASSERT(PP_ISAGED(pp) == 0);
3182         ASSERT(pp->p_szc == 0);
3183 
3184         VM_STAT_ADD(pagecnt.pc_destroy_free);
3185         page_list_sub(pp, PG_CACHE_LIST);
3186 
3187         page_hashout(pp, NULL);
3188         ASSERT(pp->p_vnode == NULL);
3189         ASSERT(pp->p_offset == (u_offset_t)-1);
3190         ASSERT(pp->p_hash == NULL);
3191 
3192         PP_SETAGED(pp);
3193         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3194         page_unlock(pp);
3195 
3196         mutex_enter(&new_freemem_lock);
3197         if (freemem_wait) {
3198                 cv_signal(&freemem_cv);
3199         }
3200         mutex_exit(&new_freemem_lock);
3201 }
3202 
3203 /*
3204  * Rename the page "opp" to have an identity specified
3205  * by [vp, off].  If a page already exists with this name
3206  * it is locked and destroyed.  Note that the page's
3207  * translations are not unloaded during the rename.
3208  *
3209  * This routine is used by the anon layer to "steal" the
3210  * original page and is not unlike destroying a page and
3211  * creating a new page using the same page frame.
3212  *
3213  * XXX -- Could deadlock if caller 1 tries to rename A to B while
3214  * caller 2 tries to rename B to A.
3215  */
3216 void
3217 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3218 {
3219         page_t          *pp;
3220         int             olckcnt = 0;
3221         int             ocowcnt = 0;
3222         kmutex_t        *phm;
3223         ulong_t         index;
3224 
3225         ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3226         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3227         ASSERT(PP_ISFREE(opp) == 0);
3228 
3229         VM_STAT_ADD(page_rename_count);
3230 
3231         TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3232             "page rename:pp %p vp %p off %llx", opp, vp, off);
3233 
3234         /*
3235          * CacheFS may call page_rename for a large NFS page
3236          * when both CacheFS and NFS mount points are used
3237          * by applications. Demote this large page before
3238          * renaming it, to ensure that there are no "partial"
3239          * large pages left lying around.
3240          */
3241         if (opp->p_szc != 0) {
3242                 vnode_t *ovp = opp->p_vnode;
3243                 ASSERT(ovp != NULL);
3244                 ASSERT(!IS_SWAPFSVP(ovp));
3245                 ASSERT(!VN_ISKAS(ovp));
3246                 page_demote_vp_pages(opp);
3247                 ASSERT(opp->p_szc == 0);
3248         }
3249 
3250         page_hashout(opp, NULL);
3251         PP_CLRAGED(opp);
3252 
3253         /*
3254          * Acquire the appropriate page hash lock, since
3255          * we're going to rename the page.
3256          */
3257         index = PAGE_HASH_FUNC(vp, off);
3258         phm = PAGE_HASH_MUTEX(index);
3259         mutex_enter(phm);
3260 top:
3261         /*
3262          * Look for an existing page with this name and destroy it if found.
3263          * By holding the page hash lock all the way to the page_hashin()
3264          * call, we are assured that no page can be created with this
3265          * identity.  In the case when the phm lock is dropped to undo any
3266          * hat layer mappings, the existing page is held with an "exclusive"
3267          * lock, again preventing another page from being created with
3268          * this identity.
3269          */
3270         pp = page_hash_search(index, vp, off);
3271         if (pp != NULL) {
3272                 VM_STAT_ADD(page_rename_exists);
3273 
3274                 /*
3275                  * As it turns out, this is one of only two places where
3276                  * page_lock() needs to hold the passed in lock in the
3277                  * successful case.  In all of the others, the lock could
3278                  * be dropped as soon as the attempt is made to lock
3279                  * the page.  It is tempting to add yet another argument,
3280                  * PL_KEEP or PL_DROP, to let page_lock know what to do.
3281                  */
3282                 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3283                         /*
3284                          * Went to sleep because the page could not
3285                          * be locked.  We were woken up when the page
3286                          * was unlocked, or when the page was destroyed.
3287                          * In either case, `phm' was dropped while we
3288                          * slept.  Hence we should not just roar through
3289                          * this loop.
3290                          */
3291                         goto top;
3292                 }
3293 
3294                 /*
3295                  * If an existing page is a large page, then demote
3296                  * it to ensure that no "partial" large pages are
3297                  * "created" after page_rename. An existing page
3298                  * can be a CacheFS page, and can't belong to swapfs.
3299                  */
3300                 if (hat_page_is_mapped(pp)) {
3301                         /*
3302                          * Unload translations.  Since we hold the
3303                          * exclusive lock on this page, the page
3304                          * cannot be changed while we drop phm.
3305                          * This is also not a lock protocol violation,
3306                          * but rather the proper way to do things.
3307                          */
3308                         mutex_exit(phm);
3309                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3310                         if (pp->p_szc != 0) {
3311                                 ASSERT(!IS_SWAPFSVP(vp));
3312                                 ASSERT(!VN_ISKAS(vp));
3313                                 page_demote_vp_pages(pp);
3314                                 ASSERT(pp->p_szc == 0);
3315                         }
3316                         mutex_enter(phm);
3317                 } else if (pp->p_szc != 0) {
3318                         ASSERT(!IS_SWAPFSVP(vp));
3319                         ASSERT(!VN_ISKAS(vp));
3320                         mutex_exit(phm);
3321                         page_demote_vp_pages(pp);
3322                         ASSERT(pp->p_szc == 0);
3323                         mutex_enter(phm);
3324                 }
3325                 page_hashout(pp, phm);
3326         }
3327         /*
3328          * Hash in the page with the new identity.
3329          */
3330         if (!page_hashin(opp, vp, off, phm)) {
3331                 /*
3332                  * We were holding phm while we searched for [vp, off]
3333                  * and only dropped phm if we found and locked a page.
3334                  * If we can't create this page now, then something
3335                  * is really broken.
3336                  */
3337                 panic("page_rename: Can't hash in page: %p", (void *)pp);
3338                 /*NOTREACHED*/
3339         }
3340 
3341         ASSERT(MUTEX_HELD(phm));
3342         mutex_exit(phm);
3343 
3344         /*
3345          * Now that we have dropped phm, let's get around to finishing up
3346          * with pp.
3347          */
3348         if (pp != NULL) {
3349                 ASSERT(!hat_page_is_mapped(pp));
3350                 /* for now large pages should not end up here */
3351                 ASSERT(pp->p_szc == 0);
3352                 /*
3353                  * Save the locks for transfer to the new page and then
3354                  * clear them so page_free doesn't think they're important.
3355                  * The page_struct_lock need not be acquired for lckcnt and
3356                  * cowcnt since the page has an "exclusive" lock.
3357                  */
3358                 olckcnt = pp->p_lckcnt;
3359                 ocowcnt = pp->p_cowcnt;
3360                 pp->p_lckcnt = pp->p_cowcnt = 0;
3361 
3362                 /*
3363                  * Put the page on the "free" list after we drop
3364                  * the lock.  The less work under the lock the better.
3365                  */
3366                 /*LINTED: constant in conditional context*/
3367                 VN_DISPOSE(pp, B_FREE, 0, kcred);
3368         }
3369 
3370         /*
3371          * Transfer the lock count from the old page (if any).
3372          * The page_struct_lock need not be acquired for lckcnt and
3373          * cowcnt since the page has an "exclusive" lock.
3374          */
3375         opp->p_lckcnt += olckcnt;
3376         opp->p_cowcnt += ocowcnt;
3377 }
3378 
3379 /*
3380  * Low-level routine to add page `pp' to the hash and vp chains for [vp, offset].
3381  *
3382  * Pages are normally inserted at the start of a vnode's v_pages list.
3383  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3384  * This can happen when a modified page is relocated for DR.
3385  *
3386  * Returns 1 on success and 0 on failure.
3387  */
3388 static int
3389 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3390 {
3391         page_t          **listp;
3392         page_t          *tp;
3393         ulong_t         index;
3394 
3395         ASSERT(PAGE_EXCL(pp));
3396         ASSERT(vp != NULL);
3397         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3398 
3399         /*
3400          * Be sure to set these up before the page is inserted on the hash
3401          * list.  As soon as the page is placed on the list some other
3402          * thread might get confused and wonder how this page could
3403          * possibly hash to this list.
3404          */
3405         pp->p_vnode = vp;
3406         pp->p_offset = offset;
3407 
3408         /*
3409          * record if this page is on a swap vnode
3410          */
3411         if ((vp->v_flag & VISSWAP) != 0)
3412                 PP_SETSWAP(pp);
3413 
3414         index = PAGE_HASH_FUNC(vp, offset);
3415         ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3416         listp = &page_hash[index];
3417 
3418         /*
3419          * If this page is already hashed in, fail this attempt to add it.
3420          */
3421         for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3422                 if (tp->p_vnode == vp && tp->p_offset == offset) {
3423                         pp->p_vnode = NULL;
3424                         pp->p_offset = (u_offset_t)(-1);
3425                         return (0);
3426                 }
3427         }
3428         pp->p_hash = *listp;
3429         *listp = pp;
3430 
3431         /*
3432          * Add the page to the vnode's list of pages
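              * Modified pages on a VMODSORT vnode go at the tail of v_pages
              * so that unmodified pages stay clustered at the head.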
3433          */
3434         if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3435                 listp = &vp->v_pages->p_vpprev->p_vpnext;
3436         else
3437                 listp = &vp->v_pages;
3438 
3439         page_vpadd(listp, pp);
3440 
3441         return (1);
3442 }
3443 
3444 /*
3445  * Add page `pp' to both the hash and vp chains for [vp, offset].
3446  *
3447  * Returns 1 on success and 0 on failure.
3448  * If hold is passed in, it must be the right hash mutex and is not dropped.
3449  */
3450 int
3451 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3452 {
3453         kmutex_t        *phm = NULL;
3454         kmutex_t        *vphm;
3455         int             rc;
3456 
3457         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3458         ASSERT(pp->p_fsdata == 0 || panicstr);
3459 
3460         TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3461             "page_hashin:pp %p vp %p offset %llx",
3462             pp, vp, offset);
3463 
3464         VM_STAT_ADD(hashin_count);
3465 
3466         if (hold != NULL)
3467                 phm = hold;
3468         else {
3469                 VM_STAT_ADD(hashin_not_held);
3470                 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3471                 mutex_enter(phm);
3472         }
3473 
3474         vphm = page_vnode_mutex(vp);
3475         mutex_enter(vphm);
3476         rc = page_do_hashin(pp, vp, offset);
3477         mutex_exit(vphm);
3478         if (hold == NULL)
3479                 mutex_exit(phm);
3480         if (rc == 0)
3481                 VM_STAT_ADD(hashin_already);
3482         return (rc);
3483 }
3484 
3485 /*
3486  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3487  * All mutexes must be held
3488  */
3489 static void
3490 page_do_hashout(page_t *pp)
3491 {
3492         page_t  **hpp;
3493         page_t  *hp;
3494         vnode_t *vp = pp->p_vnode;
3495 
3496         ASSERT(vp != NULL);
3497         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3498 
3499         /*
3500          * First, take pp off of its hash chain.
3501          */
3502         hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3503 
3504         for (;;) {
3505                 hp = *hpp;
3506                 if (hp == pp)
3507                         break;
3508                 if (hp == NULL) {
3509                         panic("page_do_hashout");
3510                         /*NOTREACHED*/
3511                 }
3512                 hpp = &hp->p_hash;
3513         }
3514         *hpp = pp->p_hash;
3515 
3516         /*
3517          * Now remove it from its associated vnode.
3518          */
3519         if (vp->v_pages)
3520                 page_vpsub(&vp->v_pages, pp);
3521 
3522         pp->p_hash = NULL;
3523         page_clr_all_props(pp);
3524         PP_CLRSWAP(pp);
3525         pp->p_vnode = NULL;
3526         pp->p_offset = (u_offset_t)-1;
3527         pp->p_fsdata = 0;
3528 }
3529 
3530 /*
3531  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3532  *
3533  * When `phm' is non-NULL it contains the address of the mutex protecting the
3534  * hash list pp is on.  It is not dropped.
3535  */
3536 void
3537 page_hashout(page_t *pp, kmutex_t *phm)
3538 {
3539         vnode_t         *vp;
3540         ulong_t         index;
3541         kmutex_t        *nphm;
3542         kmutex_t        *vphm;
3543         kmutex_t        *sep;
3544 
3545         ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3546         ASSERT(pp->p_vnode != NULL);
3547         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3548         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3549 
3550         vp = pp->p_vnode;
3551 
3552         TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3553             "page_hashout:pp %p vp %p", pp, vp);
3554 
3555         /* Kernel probe */
3556         TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3557             tnf_opaque, vnode, vp,
3558             tnf_offset, offset, pp->p_offset);
3559 
3560         /*
3561          * Find the hash bucket and take its mutex if not already held.
3562          */
3563         VM_STAT_ADD(hashout_count);
3564         index = PAGE_HASH_FUNC(vp, pp->p_offset);
3565         if (phm == NULL) {
3566                 VM_STAT_ADD(hashout_not_held);
3567                 nphm = PAGE_HASH_MUTEX(index);
3568                 mutex_enter(nphm);
3569         }
3570         ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3571 
3573         /*
3574          * grab page vnode mutex and remove it...
3575          */
3576         vphm = page_vnode_mutex(vp);
3577         mutex_enter(vphm);
3578 
3579         page_do_hashout(pp);
3580 
3581         mutex_exit(vphm);
3582         if (phm == NULL)
3583                 mutex_exit(nphm);
3584 
3585         /*
3586          * Wake up processes waiting for this page.  The page's
3587          * identity has been changed, and is probably not the
3588          * desired page any longer.
3589          */
3590         sep = page_se_mutex(pp);
3591         mutex_enter(sep);
3592         pp->p_selock &= ~SE_EWANTED;
3593         if (CV_HAS_WAITERS(&pp->p_cv))
3594                 cv_broadcast(&pp->p_cv);
3595         mutex_exit(sep);
3596 }
3597 
3598 /*
3599  * Add the page to the front of a linked list of pages
3600  * using the p_next & p_prev pointers for the list.
3601  * The caller is responsible for protecting the list pointers.
3602  */
3603 void
3604 page_add(page_t **ppp, page_t *pp)
3605 {
3606         ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3607 
3608         page_add_common(ppp, pp);
3609 }
3610 
3613 /*
3614  *  Common code for page_add() and mach_page_add()
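      *
      * The list is circular and doubly linked: an empty list is a NULL
      * head pointer, and a list of one page points to itself in both
      * directions.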
3615  */
3616 void
3617 page_add_common(page_t **ppp, page_t *pp)
3618 {
3619         if (*ppp == NULL) {
3620                 pp->p_next = pp->p_prev = pp;
3621         } else {
3622                 pp->p_next = *ppp;
3623                 pp->p_prev = (*ppp)->p_prev;
3624                 (*ppp)->p_prev = pp;
3625                 pp->p_prev->p_next = pp;
3626         }
3627         *ppp = pp;
3628 }
3629 
3630 
3631 /*
3632  * Remove this page from a linked list of pages
3633  * using the p_next & p_prev pointers for the list.
3634  *
3635  * The caller is responsible for protecting the list pointers.
3636  */
3637 void
3638 page_sub(page_t **ppp, page_t *pp)
3639 {
3640         ASSERT((PP_ISFREE(pp)) ? 1 :
3641             (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3642 
3643         if (*ppp == NULL || pp == NULL) {
3644                 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3645                     (void *)pp, (void *)(*ppp));
3646                 /*NOTREACHED*/
3647         }
3648 
3649         page_sub_common(ppp, pp);
3650 }
3651 
3652 
3653 /*
3654  *  Common code for page_sub() and mach_page_sub()
3655  */
3656 void
3657 page_sub_common(page_t **ppp, page_t *pp)
3658 {
3659         if (*ppp == pp)
3660                 *ppp = pp->p_next;           /* go to next page */
3661 
3662         if (*ppp == pp)
3663                 *ppp = NULL;                    /* page list is gone */
3664         else {
3665                 pp->p_prev->p_next = pp->p_next;
3666                 pp->p_next->p_prev = pp->p_prev;
3667         }
3668         pp->p_prev = pp->p_next = pp;             /* make pp a list of one */
3669 }
3670 
3671 
3672 /*
3673  * Break page list oppp into two lists with npages in the first list.
3674  * The tail is returned in nppp.
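      *
      * For example, breaking a circular list of five pages with npages == 2
      * leaves the first two pages on *oppp and returns the remaining three
      * through *nppp.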
3675  */
3676 void
3677 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3678 {
3679         page_t *s1pp = *oppp;
3680         page_t *s2pp;
3681         page_t *e1pp, *e2pp;
3682         long n = 0;
3683 
3684         if (s1pp == NULL) {
3685                 *nppp = NULL;
3686                 return;
3687         }
3688         if (npages == 0) {
3689                 *nppp = s1pp;
3690                 *oppp = NULL;
3691                 return;
3692         }
3693         for (n = 0, s2pp = *oppp; n < npages; n++) {
3694                 s2pp = s2pp->p_next;
3695         }
3696         /* Fix head and tail of new lists */
3697         e1pp = s2pp->p_prev;
3698         e2pp = s1pp->p_prev;
3699         s1pp->p_prev = e1pp;
3700         e1pp->p_next = s1pp;
3701         s2pp->p_prev = e2pp;
3702         e2pp->p_next = s2pp;
3703 
3704         /* second list empty */
3705         if (s2pp == s1pp) {
3706                 *oppp = s1pp;
3707                 *nppp = NULL;
3708         } else {
3709                 *oppp = s1pp;
3710                 *nppp = s2pp;
3711         }
3712 }
3713 
3714 /*
3715  * Concatenate page list nppp onto the end of list ppp.
3716  */
3717 void
3718 page_list_concat(page_t **ppp, page_t **nppp)
3719 {
3720         page_t *s1pp, *s2pp, *e1pp, *e2pp;
3721 
3722         if (*nppp == NULL) {
3723                 return;
3724         }
3725         if (*ppp == NULL) {
3726                 *ppp = *nppp;
3727                 return;
3728         }
3729         s1pp = *ppp;
3730         e1pp =  s1pp->p_prev;
3731         s2pp = *nppp;
3732         e2pp = s2pp->p_prev;
3733         s1pp->p_prev = e2pp;
3734         e2pp->p_next = s1pp;
3735         e1pp->p_next = s2pp;
3736         s2pp->p_prev = e1pp;
3737 }
3738 
3739 /*
3740  * return the next page in the page list
3741  */
3742 page_t *
3743 page_list_next(page_t *pp)
3744 {
3745         return (pp->p_next);
3746 }
3747 
3748 
3749 /*
3750  * Add the page to the front of the linked list of pages
3751  * using p_vpnext/p_vpprev pointers for the list.
3752  *
3753  * The caller is responsible for protecting the lists.
3754  */
3755 void
3756 page_vpadd(page_t **ppp, page_t *pp)
3757 {
3758         if (*ppp == NULL) {
3759                 pp->p_vpnext = pp->p_vpprev = pp;
3760         } else {
3761                 pp->p_vpnext = *ppp;
3762                 pp->p_vpprev = (*ppp)->p_vpprev;
3763                 (*ppp)->p_vpprev = pp;
3764                 pp->p_vpprev->p_vpnext = pp;
3765         }
3766         *ppp = pp;
3767 }
3768 
3769 /*
3770  * Remove this page from the linked list of pages
3771  * using p_vpnext/p_vpprev pointers for the list.
3772  *
3773  * The caller is responsible for protecting the lists.
3774  */
3775 void
3776 page_vpsub(page_t **ppp, page_t *pp)
3777 {
3778         if (*ppp == NULL || pp == NULL) {
3779                 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3780                     (void *)pp, (void *)(*ppp));
3781                 /*NOTREACHED*/
3782         }
3783 
3784         if (*ppp == pp)
3785                 *ppp = pp->p_vpnext;         /* go to next page */
3786 
3787         if (*ppp == pp)
3788                 *ppp = NULL;                    /* page list is gone */
3789         else {
3790                 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3791                 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3792         }
3793         pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3794 }
3795 
3796 /*
3797  * Lock a physical page into memory "long term".  Used to support "lock
3798  * in memory" functions.  Accepts the page to be locked, and a cow variable
3799  * to indicate whether the lock will travel to the new page during
3800  * a potential copy-on-write.
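      *
      * Returns 1 if the lock was granted and 0 if it was refused, e.g.
      * because availrmem is not above pages_pp_maximum or because the
      * per-page lock count has reached PAGE_LOCK_MAXIMUM.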
3801  */
3802 int
3803 page_pp_lock(
3804         page_t *pp,                     /* page to be locked */
3805         int cow,                        /* cow lock */
3806         int kernel)                     /* must succeed -- ignore checking */
3807 {
3808         int r = 0;                      /* result -- assume failure */
3809 
3810         ASSERT(PAGE_LOCKED(pp));
3811 
3812         page_struct_lock(pp);
3813         /*
3814          * Acquire the "freemem_lock" for availrmem.
3815          */
3816         if (cow) {
3817                 mutex_enter(&freemem_lock);
3818                 if ((availrmem > pages_pp_maximum) &&
3819                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3820                         availrmem--;
3821                         pages_locked++;
3822                         mutex_exit(&freemem_lock);
3823                         r = 1;
3824                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3825                                 cmn_err(CE_WARN,
3826                                     "COW lock limit reached on pfn 0x%lx",
3827                                     page_pptonum(pp));
3828                         }
3829                 } else
3830                         mutex_exit(&freemem_lock);
3831         } else {
3832                 if (pp->p_lckcnt) {
3833                         if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3834                                 r = 1;
3835                                 if (++pp->p_lckcnt ==
3836                                     (ushort_t)PAGE_LOCK_MAXIMUM) {
3837                                         cmn_err(CE_WARN, "Page lock limit "
3838                                             "reached on pfn 0x%lx",
3839                                             page_pptonum(pp));
3840                                 }
3841                         }
3842                 } else {
3843                         if (kernel) {
3844                                 /* availrmem accounting done by caller */
3845                                 ++pp->p_lckcnt;
3846                                 r = 1;
3847                         } else {
3848                                 mutex_enter(&freemem_lock);
3849                                 if (availrmem > pages_pp_maximum) {
3850                                         availrmem--;
3851                                         pages_locked++;
3852                                         ++pp->p_lckcnt;
3853                                         r = 1;
3854                                 }
3855                                 mutex_exit(&freemem_lock);
3856                         }
3857                 }
3858         }
3859         page_struct_unlock(pp);
3860         return (r);
3861 }
3862 
3863 /*
3864  * Decommit a lock on a physical page frame.  Account for cow locks if
3865  * appropriate.
3866  */
3867 void
3868 page_pp_unlock(
3869         page_t *pp,                     /* page to be unlocked */
3870         int cow,                        /* expect cow lock */
3871         int kernel)                     /* this was a kernel lock */
3872 {
3873         ASSERT(PAGE_LOCKED(pp));
3874 
3875         page_struct_lock(pp);
3876         /*
3877          * Acquire the "freemem_lock" for availrmem.
3878          * If cowcnt or lckcnt is already 0 do nothing; i.e., we
3879          * could be called to unlock even if nothing is locked. This could
3880          * happen if locked file pages were truncated (removing the lock)
3881          * and the file was grown again and new pages faulted in; the new
3882          * pages are unlocked but the segment still thinks they're locked.
3883          */
3884         if (cow) {
3885                 if (pp->p_cowcnt) {
3886                         mutex_enter(&freemem_lock);
3887                         pp->p_cowcnt--;
3888                         availrmem++;
3889                         pages_locked--;
3890                         mutex_exit(&freemem_lock);
3891                 }
3892         } else {
3893                 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3894                         if (!kernel) {
3895                                 mutex_enter(&freemem_lock);
3896                                 availrmem++;
3897                                 pages_locked--;
3898                                 mutex_exit(&freemem_lock);
3899                         }
3900                 }
3901         }
3902         page_struct_unlock(pp);
3903 }
3904 
3905 /*
3906  * This routine reserves availrmem for npages;
3907  *      flags: KM_NOSLEEP or KM_SLEEP
3908  *      returns 1 on success or 0 on failure
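      *
      *      With KM_SLEEP the routine keeps trying: it advertises the need
      *      via page_needfree(), reaps the kmem caches, and waits a quarter
      *      of a second between attempts until the reservation succeeds.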
3909  */
3910 int
3911 page_resv(pgcnt_t npages, uint_t flags)
3912 {
3913         mutex_enter(&freemem_lock);
3914         while (availrmem < tune.t_minarmem + npages) {
3915                 if (flags & KM_NOSLEEP) {
3916                         mutex_exit(&freemem_lock);
3917                         return (0);
3918                 }
3919                 mutex_exit(&freemem_lock);
3920                 page_needfree(npages);
3921                 kmem_reap();
3922                 delay(hz >> 2);
3923                 page_needfree(-(spgcnt_t)npages);
3924                 mutex_enter(&freemem_lock);
3925         }
3926         availrmem -= npages;
3927         mutex_exit(&freemem_lock);
3928         return (1);
3929 }
3930 
3931 /*
3932  * This routine unreserves availrmem for npages;
3933  */
3934 void
3935 page_unresv(pgcnt_t npages)
3936 {
3937         mutex_enter(&freemem_lock);
3938         availrmem += npages;
3939         mutex_exit(&freemem_lock);
3940 }
3941 
3942 /*
3943  * See Statement at the beginning of segvn_lockop() regarding
3944  * the way we handle cowcnts and lckcnts.
3945  *
3946  * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3947  * that breaks COW has PROT_WRITE.
3948  *
3949  * Note that we may also break COW in case we are softlocking
3950  * on read access during physio;
3951  * in this softlock case, the vpage may not have PROT_WRITE.
3952  * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3953  * if the vpage doesn't have PROT_WRITE.
3954  *
3955  * This routine is never called if we are stealing a page
3956  * in anon_private.
3957  *
3958  * The caller subtracted from availrmem for a read-only mapping;
3959  * if lckcnt is 1, increment availrmem.
3960  */
3961 void
3962 page_pp_useclaim(
3963         page_t *opp,            /* original page frame losing lock */
3964         page_t *npp,            /* new page frame gaining lock */
3965         uint_t  write_perm)     /* set if vpage has PROT_WRITE */
3966 {
3967         int payback = 0;
3968         int nidx, oidx;
3969 
3970         ASSERT(PAGE_LOCKED(opp));
3971         ASSERT(PAGE_LOCKED(npp));
3972 
3973         /*
3974          * Since we have two pages we probably have two locks.  We need to take
3975          * them in a defined order to avoid deadlocks.  It's also possible they
3976          * both hash to the same lock in which case this is a non-issue.
3977          */
3978         nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3979         oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3980         if (nidx < oidx) {
3981                 page_struct_lock(npp);
3982                 page_struct_lock(opp);
3983         } else if (oidx < nidx) {
3984                 page_struct_lock(opp);
3985                 page_struct_lock(npp);
3986         } else {        /* The pages hash to the same lock */
3987                 page_struct_lock(npp);
3988         }
3989 
3990         ASSERT(npp->p_cowcnt == 0);
3991         ASSERT(npp->p_lckcnt == 0);
3992 
3993         /* Don't use claim if nothing is locked (see page_pp_unlock above) */
3994         if ((write_perm && opp->p_cowcnt != 0) ||
3995             (!write_perm && opp->p_lckcnt != 0)) {
3996 
3997                 if (write_perm) {
3998                         npp->p_cowcnt++;
3999                         ASSERT(opp->p_cowcnt != 0);
4000                         opp->p_cowcnt--;
4001                 } else {
4002 
4003                         ASSERT(opp->p_lckcnt != 0);
4004 
4005                         /*
4006                          * We didn't need availrmem decremented if p_lckcnt on
4007                          * the original page is 1.  Here, we are unlocking the
4008                          * read-only copy belonging to the original page and
4009                          * locking a copy belonging to the new page.
4010                          */
4011                         if (opp->p_lckcnt == 1)
4012                                 payback = 1;
4013 
4014                         npp->p_lckcnt++;
4015                         opp->p_lckcnt--;
4016                 }
4017         }
4018         if (payback) {
4019                 mutex_enter(&freemem_lock);
4020                 availrmem++;
4021                 pages_useclaim--;
4022                 mutex_exit(&freemem_lock);
4023         }
4024 
4025         if (nidx < oidx) {
4026                 page_struct_unlock(opp);
4027                 page_struct_unlock(npp);
4028         } else if (oidx < nidx) {
4029                 page_struct_unlock(npp);
4030                 page_struct_unlock(opp);
4031         } else {        /* The pages hash to the same lock */
4032                 page_struct_unlock(npp);
4033         }
4034 }
4035 
4036 /*
4037  * Simple claim adjust functions -- used to support changes in
4038  * claims due to changes in access permissions.  Used by segvn_setprot().
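      *
      * page_addclaim() converts one of the page's lckcnt claims into a
      * cowcnt claim; page_subclaim() converts a cowcnt claim back into a
      * lckcnt claim.  Both may need to adjust availrmem along the way.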
4039  */
4040 int
4041 page_addclaim(page_t *pp)
4042 {
4043         int r = 0;                      /* result */
4044 
4045         ASSERT(PAGE_LOCKED(pp));
4046 
4047         page_struct_lock(pp);
4048         ASSERT(pp->p_lckcnt != 0);
4049 
4050         if (pp->p_lckcnt == 1) {
4051                 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4052                         --pp->p_lckcnt;
4053                         r = 1;
4054                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4055                                 cmn_err(CE_WARN,
4056                                     "COW lock limit reached on pfn 0x%lx",
4057                                     page_pptonum(pp));
4058                         }
4059                 }
4060         } else {
4061                 mutex_enter(&freemem_lock);
4062                 if ((availrmem > pages_pp_maximum) &&
4063                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4064                         --availrmem;
4065                         ++pages_claimed;
4066                         mutex_exit(&freemem_lock);
4067                         --pp->p_lckcnt;
4068                         r = 1;
4069                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4070                                 cmn_err(CE_WARN,
4071                                     "COW lock limit reached on pfn 0x%lx",
4072                                     page_pptonum(pp));
4073                         }
4074                 } else
4075                         mutex_exit(&freemem_lock);
4076         }
4077         page_struct_unlock(pp);
4078         return (r);
4079 }
4080 
4081 int
4082 page_subclaim(page_t *pp)
4083 {
4084         int r = 0;
4085 
4086         ASSERT(PAGE_LOCKED(pp));
4087 
4088         page_struct_lock(pp);
4089         ASSERT(pp->p_cowcnt != 0);
4090 
4091         if (pp->p_lckcnt) {
4092                 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4093                         r = 1;
4094                         /*
4095                          * for availrmem
4096                          */
4097                         mutex_enter(&freemem_lock);
4098                         availrmem++;
4099                         pages_claimed--;
4100                         mutex_exit(&freemem_lock);
4101 
4102                         pp->p_cowcnt--;
4103 
4104                         if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4105                                 cmn_err(CE_WARN,
4106                                     "Page lock limit reached on pfn 0x%lx",
4107                                     page_pptonum(pp));
4108                         }
4109                 }
4110         } else {
4111                 r = 1;
4112                 pp->p_cowcnt--;
4113                 pp->p_lckcnt++;
4114         }
4115         page_struct_unlock(pp);
4116         return (r);
4117 }
4118 
4119 /*
4120  * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4121  * page.
4122  */
4123 int
4124 page_addclaim_pages(page_t  **ppa)
4125 {
4126         pgcnt_t lckpgs = 0, pg_idx;
4127 
4128         VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4129 
4130         /*
4131          * Only need to take the page struct lock on the large page root.
4132          */
4133         page_struct_lock(ppa[0]);
4134         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4135 
4136                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4137                 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4138                 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4139                         page_struct_unlock(ppa[0]);
4140                         return (0);
4141                 }
4142                 if (ppa[pg_idx]->p_lckcnt > 1)
4143                         lckpgs++;
4144         }
4145 
4146         if (lckpgs != 0) {
4147                 mutex_enter(&freemem_lock);
4148                 if (availrmem >= pages_pp_maximum + lckpgs) {
4149                         availrmem -= lckpgs;
4150                         pages_claimed += lckpgs;
4151                 } else {
4152                         mutex_exit(&freemem_lock);
4153                         page_struct_unlock(ppa[0]);
4154                         return (0);
4155                 }
4156                 mutex_exit(&freemem_lock);
4157         }
4158 
4159         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4160                 ppa[pg_idx]->p_lckcnt--;
4161                 ppa[pg_idx]->p_cowcnt++;
4162         }
4163         page_struct_unlock(ppa[0]);
4164         return (1);
4165 }
4166 
4167 /*
4168  * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4169  * page.
4170  */
4171 int
4172 page_subclaim_pages(page_t  **ppa)
4173 {
4174         pgcnt_t ulckpgs = 0, pg_idx;
4175 
4176         VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4177 
4178         /*
4179          * Only need to take the page struct lock on the large page root.
4180          */
4181         page_struct_lock(ppa[0]);
4182         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4183 
4184                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4185                 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4186                 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4187                         page_struct_unlock(ppa[0]);
4188                         return (0);
4189                 }
4190                 if (ppa[pg_idx]->p_lckcnt != 0)
4191                         ulckpgs++;
4192         }
4193 
4194         if (ulckpgs != 0) {
4195                 mutex_enter(&freemem_lock);
4196                 availrmem += ulckpgs;
4197                 pages_claimed -= ulckpgs;
4198                 mutex_exit(&freemem_lock);
4199         }
4200 
4201         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4202                 ppa[pg_idx]->p_cowcnt--;
4203                 ppa[pg_idx]->p_lckcnt++;
4205         }
4206         page_struct_unlock(ppa[0]);
4207         return (1);
4208 }
4209 
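     /*
      * Look up the page_t for physical page frame `pfnum' and acquire it
      * with the requested shared/exclusive lock, retrying if the page's
      * identity changes while we wait for the lock.  Returns NULL if no
      * page structure exists for the frame.
      */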
4210 page_t *
4211 page_numtopp(pfn_t pfnum, se_t se)
4212 {
4213         page_t *pp;
4214 
4215 retry:
4216         pp = page_numtopp_nolock(pfnum);
4217         if (pp == NULL) {
4218                 return ((page_t *)NULL);
4219         }
4220 
4221         /*
4222          * Acquire the appropriate lock on the page.
4223          */
4224         while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4225                 if (page_pptonum(pp) != pfnum)
4226                         goto retry;
4227                 continue;
4228         }
4229 
4230         if (page_pptonum(pp) != pfnum) {
4231                 page_unlock(pp);
4232                 goto retry;
4233         }
4234 
4235         return (pp);
4236 }
4237 
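     /*
      * Like page_numtopp(), but passes P_NO_RECLAIM to page_lock() so that
      * a free page is not pulled off the free list as a side effect of
      * locking it.
      */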
4238 page_t *
4239 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4240 {
4241         page_t *pp;
4242 
4243 retry:
4244         pp = page_numtopp_nolock(pfnum);
4245         if (pp == NULL) {
4246                 return ((page_t *)NULL);
4247         }
4248 
4249         /*
4250          * Acquire the appropriate lock on the page.
4251          */
4252         while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4253                 if (page_pptonum(pp) != pfnum)
4254                         goto retry;
4255                 continue;
4256         }
4257 
4258         if (page_pptonum(pp) != pfnum) {
4259                 page_unlock(pp);
4260                 goto retry;
4261         }
4262 
4263         return (pp);
4264 }
4265 
4266 /*
4267  * This routine is like page_numtopp, but will only return page structs
4268  * for pages which are ok for loading into hardware using the page struct.
4269  */
4270 page_t *
4271 page_numtopp_nowait(pfn_t pfnum, se_t se)
4272 {
4273         page_t *pp;
4274 
4275 retry:
4276         pp = page_numtopp_nolock(pfnum);
4277         if (pp == NULL) {
4278                 return ((page_t *)NULL);
4279         }
4280 
4281         /*
4282          * Try to acquire the appropriate lock on the page.
4283          */
4284         if (PP_ISFREE(pp))
4285                 pp = NULL;
4286         else {
4287                 if (!page_trylock(pp, se))
4288                         pp = NULL;
4289                 else {
4290                         if (page_pptonum(pp) != pfnum) {
4291                                 page_unlock(pp);
4292                                 goto retry;
4293                         }
4294                         if (PP_ISFREE(pp)) {
4295                                 page_unlock(pp);
4296                                 pp = NULL;
4297                         }
4298                 }
4299         }
4300         return (pp);
4301 }
4302 
4303 /*
4304  * Returns a count of dirty pages that are in the process
4305  * of being written out.  If 'cleanit' is set, try to push the page.
4306  */
4307 pgcnt_t
4308 page_busy(int cleanit)
4309 {
4310         page_t *page0 = page_first();
4311         page_t *pp = page0;
4312         pgcnt_t nppbusy = 0;
4313         u_offset_t off;
4314 
4315         do {
4316                 vnode_t *vp = pp->p_vnode;
4317                 /*
4318                  * A page is a candidate for syncing if it is:
4319                  *
4320                  * (a)  On neither the freelist nor the cachelist
4321                  * (b)  Hashed onto a vnode
4322                  * (c)  Not a kernel page
4323                  * (d)  Dirty
4324                  * (e)  Not part of a swapfile
4325                  * (f)  Belonging to a real vnode; e.g., one with a
4326                  *      non-null v_vfsp pointer
4327                  * (g)  Backed by a filesystem which doesn't have a
4328                  *      stubbed-out sync operation
4329                  */
4330                 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4331                     hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4332                     vfs_can_sync(vp->v_vfsp)) {
4333                         nppbusy++;
4334 
4335                         if (!cleanit)
4336                                 continue;
4337                         if (!page_trylock(pp, SE_EXCL))
4338                                 continue;
4339 
4340                         if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4341                             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4342                             !(hat_pagesync(pp,
4343                             HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4344                                 page_unlock(pp);
4345                                 continue;
4346                         }
4347                         off = pp->p_offset;
4348                         VN_HOLD(vp);
4349                         page_unlock(pp);
4350                         (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4351                             B_ASYNC | B_FREE, kcred, NULL);
4352                         VN_RELE(vp);
4353                 }
4354         } while ((pp = page_next(pp)) != page0);
4355 
4356         return (nppbusy);
4357 }
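
/*
 * Illustrative sketch (not part of the original code): a caller could use
 * page_busy() first to count the candidate dirty pages and then to start
 * pushing them asynchronously:
 *
 *	pgcnt_t ndirty = page_busy(0);		(count only)
 *	if (ndirty != 0)
 *		(void) page_busy(1);		(try to push each candidate)
 */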
4358 
4359 void page_invalidate_pages(void);
4360 
4361 /*
4362  * callback handler to vm sub-system
4363  *
4364  * Callers must ensure there are no recursive entries into this function.
4365  */
4366 /*ARGSUSED*/
4367 boolean_t
4368 callb_vm_cpr(void *arg, int code)
4369 {
4370         if (code == CB_CODE_CPR_CHKPT)
4371                 page_invalidate_pages();
4372         return (B_TRUE);
4373 }
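
/*
 * Illustrative sketch (not part of the original code; the registration
 * details are an assumption): a handler like this is typically registered
 * with the callback framework during VM initialization, e.g.:
 *
 *	(void) callb_add(callb_vm_cpr, NULL, CB_CL_CPR_VM, "vm");
 *
 * so that CPR invokes it with CB_CODE_CPR_CHKPT before taking a checkpoint.
 */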
4374 
4375 /*
4376  * Invalidate all pages of the system.
4377  * It should not be called until all user page activity has stopped.
4378  */
4379 void
4380 page_invalidate_pages()
4381 {
4382         page_t *pp;
4383         page_t *page0;
4384         pgcnt_t nbusypages;
4385         int retry = 0;
4386         const int MAXRETRIES = 4;
4387 top:
4388         /*
4389          * Flush dirty pages and destroy the clean ones.
4390          */
4391         nbusypages = 0;
4392 
4393         pp = page0 = page_first();
4394         do {
4395                 struct vnode    *vp;
4396                 u_offset_t      offset;
4397                 int             mod;
4398 
4399                 /*
4400                  * Skip the page if it has no vnode, or if it is associated
4401                  * with the kernel vnode or PROM-allocated kernel memory.
4402                  */
4403                 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4404                         continue;
4405 
4406                 /*
4407                  * Skip the page if it has already been freed and invalidated.
4408                  */
4409                 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4410                         continue;
4411 
4412                 /*
4413                  * Skip pages that are already locked, can't be "exclusively"
4414                  * locked, or are already free.  After we lock the page, check
4415                  * the free and age bits again to be sure it hasn't been destroyed
4416                  * yet.
4417                  * To achieve maximum parallelism, we use page_trylock instead
4418                  * of page_lock so that we don't get blocked on individual pages
4419                  * while we have thousands of other pages to process.
4420                  */
4421                 if (!page_trylock(pp, SE_EXCL)) {
4422                         nbusypages++;
4423                         continue;
4424                 } else if (PP_ISFREE(pp)) {
4425                         if (!PP_ISAGED(pp)) {
4426                                 page_destroy_free(pp);
4427                         } else {
4428                                 page_unlock(pp);
4429                         }
4430                         continue;
4431                 }
4432                 /*
4433                  * Is this page involved in some I/O? shared?
4434                  *
4435                  * The page_struct_lock need not be acquired to
4436                  * examine these fields since the page has an
4437                  * "exclusive" lock.
4438                  */
4439                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4440                         page_unlock(pp);
4441                         continue;
4442                 }
4443 
4444                 if (vp->v_type == VCHR) {
4445                         panic("vp->v_type == VCHR");
4446                         /*NOTREACHED*/
4447                 }
4448 
4449                 if (!page_try_demote_pages(pp)) {
4450                         page_unlock(pp);
4451                         continue;
4452                 }
4453 
4454                 /*
4455                  * Check the modified bit. Leave the bits alone in hardware
4456                  * (they will be modified if we do the putpage).
4457                  */
4458                 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4459                     & P_MOD);
4460                 if (mod) {
4461                         offset = pp->p_offset;
4462                         /*
4463                          * Hold the vnode before releasing the page lock
4464                          * to prevent it from being freed and re-used by
4465                          * some other thread.
4466                          */
4467                         VN_HOLD(vp);
4468                         page_unlock(pp);
4469                         /*
4470                          * No error return is checked here. Callers such as
4471                          * CPR deal with any remaining dirty pages at dump time
4472                          * if this putpage fails.
4473                          */
4474                         (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4475                             kcred, NULL);
4476                         VN_RELE(vp);
4477                 } else {
4478                         /*LINTED: constant in conditional context*/
4479                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
4480                 }
4481         } while ((pp = page_next(pp)) != page0);
4482         if (nbusypages && retry++ < MAXRETRIES) {
4483                 delay(1);
4484                 goto top;
4485         }
4486 }
4487 
4488 /*
4489  * Replace the page "old" with the page "new" on the page hash and vnode lists
4490  *
4491  * The replacement must be done in place; i.e., the equivalent sequence:
4492  *
4493  *      vp = old->p_vnode;
4494  *      off = old->p_offset;
4495  *      page_do_hashout(old)
4496  *      page_do_hashin(new, vp, off)
4497  *
4498  * doesn't work, since
4499  *  1) if old is the only page on the vnode, the v_pages list has a window
4500  *     where it looks empty. This will break file system assumptions.
4501  * and
4502  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4503  */
4504 static void
4505 page_do_relocate_hash(page_t *new, page_t *old)
4506 {
4507         page_t  **hash_list;
4508         vnode_t *vp = old->p_vnode;
4509         kmutex_t *sep;
4510 
4511         ASSERT(PAGE_EXCL(old));
4512         ASSERT(PAGE_EXCL(new));
4513         ASSERT(vp != NULL);
4514         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4515         ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4516 
4517         /*
4518          * First find old page on the page hash list
4519          */
4520         hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4521 
4522         for (;;) {
4523                 if (*hash_list == old)
4524                         break;
4525                 if (*hash_list == NULL) {
4526                         panic("page_do_hashout");
4527                         /*NOTREACHED*/
4528                 }
4529                 hash_list = &(*hash_list)->p_hash;
4530         }
4531 
4532         /*
4533          * update new and replace old with new on the page hash list
4534          */
4535         new->p_vnode = old->p_vnode;
4536         new->p_offset = old->p_offset;
4537         new->p_hash = old->p_hash;
4538         *hash_list = new;
4539 
4540         if ((new->p_vnode->v_flag & VISSWAP) != 0)
4541                 PP_SETSWAP(new);
4542 
4543         /*
4544          * replace old with new on the vnode's page list
4545          */
4546         if (old->p_vpnext == old) {
4547                 new->p_vpnext = new;
4548                 new->p_vpprev = new;
4549         } else {
4550                 new->p_vpnext = old->p_vpnext;
4551                 new->p_vpprev = old->p_vpprev;
4552                 new->p_vpnext->p_vpprev = new;
4553                 new->p_vpprev->p_vpnext = new;
4554         }
4555         if (vp->v_pages == old)
4556                 vp->v_pages = new;
4557 
4558         /*
4559          * clear out the old page
4560          */
4561         old->p_hash = NULL;
4562         old->p_vpnext = NULL;
4563         old->p_vpprev = NULL;
4564         old->p_vnode = NULL;
4565         PP_CLRSWAP(old);
4566         old->p_offset = (u_offset_t)-1;
4567         page_clr_all_props(old);
4568 
4569         /*
4570          * Wake up processes waiting for this page.  The page's
4571          * identity has been changed, and is probably not the
4572          * desired page any longer.
4573          */
4574         sep = page_se_mutex(old);
4575         mutex_enter(sep);
4576         old->p_selock &= ~SE_EWANTED;
4577         if (CV_HAS_WAITERS(&old->p_cv))
4578                 cv_broadcast(&old->p_cv);
4579         mutex_exit(sep);
4580 }
4581 
4582 /*
4583  * This function moves the identity of page "pp_old" to page "pp_new".
4584  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4585  * and need not be hashed out from anywhere.
4586  */
4587 void
4588 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4589 {
4590         vnode_t *vp = pp_old->p_vnode;
4591         u_offset_t off = pp_old->p_offset;
4592         kmutex_t *phm, *vphm;
4593 
4594         /*
4595          * Rehash two pages
4596          */
4597         ASSERT(PAGE_EXCL(pp_old));
4598         ASSERT(PAGE_EXCL(pp_new));
4599         ASSERT(vp != NULL);
4600         ASSERT(pp_new->p_vnode == NULL);
4601 
4602         /*
4603          * hashout then hashin while holding the mutexes
4604          */
4605         phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4606         mutex_enter(phm);
4607         vphm = page_vnode_mutex(vp);
4608         mutex_enter(vphm);
4609 
4610         page_do_relocate_hash(pp_new, pp_old);
4611 
4612         /* The following comment preserved from page_flip(). */
4613         pp_new->p_fsdata = pp_old->p_fsdata;
4614         pp_old->p_fsdata = 0;
4615         mutex_exit(vphm);
4616         mutex_exit(phm);
4617 
4618         /*
4619          * The page_struct_lock need not be acquired for lckcnt and
4620          * cowcnt since the page has an "exclusive" lock.
4621          */
4622         ASSERT(pp_new->p_lckcnt == 0);
4623         ASSERT(pp_new->p_cowcnt == 0);
4624         pp_new->p_lckcnt = pp_old->p_lckcnt;
4625         pp_new->p_cowcnt = pp_old->p_cowcnt;
4626         pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4627 
4628 }
4629 
4630 /*
4631  * Helper routine used to lock all remaining members of a
4632  * large page. The caller is responsible for passing in a locked
4633  * pp. If pp is a large page, then it succeeds in locking all the
4634  * remaining constituent pages or it returns with only the
4635  * original page locked.
4636  *
4637  * Returns 1 on success, 0 on failure.
4638  *
4639  * If success is returned, this routine guarantees that p_szc cannot change
4640  * for any constituent page of the large page pp belongs to. To achieve this
4641  * we recheck the szc of pp after locking all constituent pages and retry if
4642  * the szc changed (it can only decrease). Since hat_page_demote() needs an
4643  * EXCL lock on one of the constituent pages, it can't be running once all
4644  * constituent pages are locked.  A hat_page_demote() holding a lock on a
4645  * constituent page outside of this large page (i.e. pp belonged to a larger
4646  * large page) is already done with all constituent pages of pp, since the
4647  * root's p_szc is changed last; therefore there is no need to synchronize
4648  * with a hat_page_demote() that locked a constituent page outside of pp's large page.
4649  */
4650 #ifdef DEBUG
4651 uint32_t gpg_trylock_mtbf = 0;
4652 #endif
4653 
4654 int
4655 group_page_trylock(page_t *pp, se_t se)
4656 {
4657         page_t  *tpp;
4658         pgcnt_t npgs, i, j;
4659         uint_t pszc = pp->p_szc;
4660 
4661 #ifdef DEBUG
4662         if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4663                 return (0);
4664         }
4665 #endif
4666 
4667         if (pp != PP_GROUPLEADER(pp, pszc)) {
4668                 return (0);
4669         }
4670 
4671 retry:
4672         ASSERT(PAGE_LOCKED_SE(pp, se));
4673         ASSERT(!PP_ISFREE(pp));
4674         if (pszc == 0) {
4675                 return (1);
4676         }
4677         npgs = page_get_pagecnt(pszc);
4678         tpp = pp + 1;
4679         for (i = 1; i < npgs; i++, tpp++) {
4680                 if (!page_trylock(tpp, se)) {
4681                         tpp = pp + 1;
4682                         for (j = 1; j < i; j++, tpp++) {
4683                                 page_unlock(tpp);
4684                         }
4685                         return (0);
4686                 }
4687         }
4688         if (pp->p_szc != pszc) {
4689                 ASSERT(pp->p_szc < pszc);
4690                 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4691                     !IS_SWAPFSVP(pp->p_vnode));
4692                 tpp = pp + 1;
4693                 for (i = 1; i < npgs; i++, tpp++) {
4694                         page_unlock(tpp);
4695                 }
4696                 pszc = pp->p_szc;
4697                 goto retry;
4698         }
4699         return (1);
4700 }
4701 
4702 void
4703 group_page_unlock(page_t *pp)
4704 {
4705         page_t *tpp;
4706         pgcnt_t npgs, i;
4707 
4708         ASSERT(PAGE_LOCKED(pp));
4709         ASSERT(!PP_ISFREE(pp));
4710         ASSERT(pp == PP_PAGEROOT(pp));
4711         npgs = page_get_pagecnt(pp->p_szc);
4712         for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4713                 page_unlock(tpp);
4714         }
4715 }
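
/*
 * Illustrative sketch (not part of the original code): a typical caller
 * locks the group leader itself and then pairs the two routines, e.g.
 * assuming pp is the group leader of a large page:
 *
 *	if (page_trylock(pp, SE_EXCL)) {
 *		if (group_page_trylock(pp, SE_EXCL)) {
 *			(operate on all constituent pages)
 *			group_page_unlock(pp);	(drops all but pp's own lock)
 *		}
 *		page_unlock(pp);
 *	}
 */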
4716 
4717 /*
4718  * returns
4719  * 0            : on success and *nrelocp is number of relocated PAGESIZE pages
4720  * ERANGE       : this is not a base page
4721  * EBUSY        : failure to get locks on the page/pages
4722  * ENOMEM       : failure to obtain replacement pages
4723  * EAGAIN       : OBP has not yet completed its boot-time handoff to the kernel
4724  * EIO          : An error occurred while trying to copy the page data
4725  *
4726  * Return with all constituent members of target and replacement
4727  * SE_EXCL locked. It is the caller's responsibility to drop the
4728  * locks.
4729  */
4730 int
4731 do_page_relocate(
4732         page_t **target,
4733         page_t **replacement,
4734         int grouplock,
4735         spgcnt_t *nrelocp,
4736         lgrp_t *lgrp)
4737 {
4738         page_t *first_repl;
4739         page_t *repl;
4740         page_t *targ;
4741         page_t *pl = NULL;
4742         uint_t ppattr;
4743         pfn_t   pfn, repl_pfn;
4744         uint_t  szc;
4745         spgcnt_t npgs, i;
4746         int repl_contig = 0;
4747         uint_t flags = 0;
4748         spgcnt_t dofree = 0;
4749 
4750         *nrelocp = 0;
4751 
4752 #if defined(__sparc)
4753         /*
4754          * We need to wait until OBP has completed
4755          * its boot-time handoff of its resources to the kernel
4756          * before we allow page relocation.
4757          */
4758         if (page_relocate_ready == 0) {
4759                 return (EAGAIN);
4760         }
4761 #endif
4762 
4763         /*
4764          * If this is not a base page,
4765          * just return with 0 pages relocated.
4766          */
4767         targ = *target;
4768         ASSERT(PAGE_EXCL(targ));
4769         ASSERT(!PP_ISFREE(targ));
4770         szc = targ->p_szc;
4771         ASSERT(szc < mmu_page_sizes);
4772         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4773         pfn = targ->p_pagenum;
4774         if (pfn != PFN_BASE(pfn, szc)) {
4775                 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4776                 return (ERANGE);
4777         }
4778 
4779         if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4780                 repl_pfn = repl->p_pagenum;
4781                 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4782                         VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4783                         return (ERANGE);
4784                 }
4785                 repl_contig = 1;
4786         }
4787 
4788         /*
4789          * We must lock all members of this large page or we cannot
4790          * relocate any part of it.
4791          */
4792         if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4793                 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4794                 return (EBUSY);
4795         }
4796 
4797         /*
4798          * Re-read szc; it could have been decreased before
4799          * group_page_trylock() completed.
4800          */
4801         szc = targ->p_szc;
4802         ASSERT(szc < mmu_page_sizes);
4803         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4804         ASSERT(pfn == PFN_BASE(pfn, szc));
4805 
4806         npgs = page_get_pagecnt(targ->p_szc);
4807 
4808         if (repl == NULL) {
4809                 dofree = npgs;          /* Size of target page in MMU pages */
4810                 if (!page_create_wait(dofree, 0)) {
4811                         if (grouplock != 0) {
4812                                 group_page_unlock(targ);
4813                         }
4814                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4815                         return (ENOMEM);
4816                 }
4817 
4818                 /*
4819                  * seg kmem pages require that the target and replacement
4820                  * page be the same pagesize.
4821                  */
4822                 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4823                 repl = page_get_replacement_page(targ, lgrp, flags);
4824                 if (repl == NULL) {
4825                         if (grouplock != 0) {
4826                                 group_page_unlock(targ);
4827                         }
4828                         page_create_putback(dofree);
4829                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4830                         return (ENOMEM);
4831                 }
4832         }
4833 #ifdef DEBUG
4834         else {
4835                 ASSERT(PAGE_LOCKED(repl));
4836         }
4837 #endif /* DEBUG */
4838 
4839 #if defined(__sparc)
4840         /*
4841          * Let hat_page_relocate() complete the relocation if it's a kernel page.
4842          */
4843         if (VN_ISKAS(targ->p_vnode)) {
4844                 *replacement = repl;
4845                 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4846                         if (grouplock != 0) {
4847                                 group_page_unlock(targ);
4848                         }
4849                         if (dofree) {
4850                                 *replacement = NULL;
4851                                 page_free_replacement_page(repl);
4852                                 page_create_putback(dofree);
4853                         }
4854                         VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4855                         return (EAGAIN);
4856                 }
4857                 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4858                 return (0);
4859         }
4860 #else
4861 #if defined(lint)
4862         dofree = dofree;
4863 #endif
4864 #endif
4865 
4866         first_repl = repl;
4867 
4868         for (i = 0; i < npgs; i++) {
4869                 ASSERT(PAGE_EXCL(targ));
4870                 ASSERT(targ->p_slckcnt == 0);
4871                 ASSERT(repl->p_slckcnt == 0);
4872 
4873                 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4874 
4875                 ASSERT(hat_page_getshare(targ) == 0);
4876                 ASSERT(!PP_ISFREE(targ));
4877                 ASSERT(targ->p_pagenum == (pfn + i));
4878                 ASSERT(repl_contig == 0 ||
4879                     repl->p_pagenum == (repl_pfn + i));
4880 
4881                 /*
4882                  * Copy the page contents and attributes then
4883                  * relocate the page in the page hash.
4884                  */
4885                 if (ppcopy(targ, repl) == 0) {
4886                         targ = *target;
4887                         repl = first_repl;
4888                         VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4889                         if (grouplock != 0) {
4890                                 group_page_unlock(targ);
4891                         }
4892                         if (dofree) {
4893                                 *replacement = NULL;
4894                                 page_free_replacement_page(repl);
4895                                 page_create_putback(dofree);
4896                         }
4897                         return (EIO);
4898                 }
4899 
4900                 targ++;
4901                 if (repl_contig != 0) {
4902                         repl++;
4903                 } else {
4904                         repl = repl->p_next;
4905                 }
4906         }
4907 
4908         repl = first_repl;
4909         targ = *target;
4910 
4911         for (i = 0; i < npgs; i++) {
4912                 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4913                 page_clr_all_props(repl);
4914                 page_set_props(repl, ppattr);
4915                 page_relocate_hash(repl, targ);
4916 
4917                 ASSERT(hat_page_getshare(targ) == 0);
4918                 ASSERT(hat_page_getshare(repl) == 0);
4919                 /*
4920                  * Now clear the props on targ, after the
4921                  * page_relocate_hash(), they no longer
4922                  * have any meaning.
4923                  */
4924                 page_clr_all_props(targ);
4925                 ASSERT(targ->p_next == targ);
4926                 ASSERT(targ->p_prev == targ);
4927                 page_list_concat(&pl, &targ);
4928 
4929                 targ++;
4930                 if (repl_contig != 0) {
4931                         repl++;
4932                 } else {
4933                         repl = repl->p_next;
4934                 }
4935         }
4936         /* assert that we have come full circle with repl */
4937         ASSERT(repl_contig == 1 || first_repl == repl);
4938 
4939         *target = pl;
4940         if (*replacement == NULL) {
4941                 ASSERT(first_repl == repl);
4942                 *replacement = repl;
4943         }
4944         VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4945         *nrelocp = npgs;
4946         return (0);
4947 }
4948 /*
4949  * On success, returns 0 and sets *nrelocp to the number of PAGESIZE pages relocated.
4950  */
4951 int
4952 page_relocate(
4953         page_t **target,
4954         page_t **replacement,
4955         int grouplock,
4956         int freetarget,
4957         spgcnt_t *nrelocp,
4958         lgrp_t *lgrp)
4959 {
4960         spgcnt_t ret;
4961 
4962         /* do_page_relocate returns 0 on success or errno value */
4963         ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4964 
4965         if (ret != 0 || freetarget == 0) {
4966                 return (ret);
4967         }
4968         if (*nrelocp == 1) {
4969                 ASSERT(*target != NULL);
4970                 page_free(*target, 1);
4971         } else {
4972                 page_t *tpp = *target;
4973                 uint_t szc = tpp->p_szc;
4974                 pgcnt_t npgs = page_get_pagecnt(szc);
4975                 ASSERT(npgs > 1);
4976                 ASSERT(szc != 0);
4977                 do {
4978                         ASSERT(PAGE_EXCL(tpp));
4979                         ASSERT(!hat_page_is_mapped(tpp));
4980                         ASSERT(tpp->p_szc == szc);
4981                         PP_SETFREE(tpp);
4982                         PP_SETAGED(tpp);
4983                         npgs--;
4984                 } while ((tpp = tpp->p_next) != *target);
4985                 ASSERT(npgs == 0);
4986                 page_list_add_pages(*target, 0);
4987                 npgs = page_get_pagecnt(szc);
4988                 page_create_putback(npgs);
4989         }
4990         return (ret);
4991 }
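
/*
 * Illustrative sketch (not part of the original code): a caller that wants
 * the VM system to pick the replacement page and free the old one can
 * invoke page_relocate() as follows, assuming targ is an SE_EXCL locked,
 * non-free PAGESIZE page:
 *
 *	page_t *repl = NULL;
 *	spgcnt_t nreloc;
 *
 *	if (page_relocate(&targ, &repl, 0, 1, &nreloc, NULL) == 0) {
 *		(the old page was freed, nreloc == 1, and repl points to the
 *		 SE_EXCL locked replacement page which now carries the old
 *		 identity; the caller must drop that lock)
 *	}
 */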
4992 
4993 /*
4994  * It is up to the caller to deal with pcf accounting.
4995  */
4996 void
4997 page_free_replacement_page(page_t *pplist)
4998 {
4999         page_t *pp;
5000 
5001         while (pplist != NULL) {
5002                 /*
5003                  * pplist is a linked list of replacement pages.
5004                  */
5005                 pp = pplist;
5006                 if (pp->p_szc == 0) {
5007                         page_sub(&pplist, pp);
5008                         page_clr_all_props(pp);
5009                         PP_SETFREE(pp);
5010                         PP_SETAGED(pp);
5011                         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5012                         page_unlock(pp);
5013                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5014                 } else {
5015                         spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5016                         page_t *tpp;
5017                         page_list_break(&pp, &pplist, curnpgs);
5018                         tpp = pp;
5019                         do {
5020                                 ASSERT(PAGE_EXCL(tpp));
5021                                 ASSERT(!hat_page_is_mapped(tpp));
5022                                 page_clr_all_props(tpp);
5023                                 PP_SETFREE(tpp);
5024                                 PP_SETAGED(tpp);
5025                         } while ((tpp = tpp->p_next) != pp);
5026                         page_list_add_pages(pp, 0);
5027                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5028                 }
5029         }
5030 }
5031 
5032 /*
5033  * Relocate target to non-relocatable replacement page.
5034  */
5035 int
5036 page_relocate_cage(page_t **target, page_t **replacement)
5037 {
5038         page_t *tpp, *rpp;
5039         spgcnt_t pgcnt, npgs;
5040         int result;
5041 
5042         tpp = *target;
5043 
5044         ASSERT(PAGE_EXCL(tpp));
5045         ASSERT(tpp->p_szc == 0);
5046 
5047         pgcnt = btop(page_get_pagesize(tpp->p_szc));
5048 
5049         do {
5050                 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5051                 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5052                 if (rpp == NULL) {
5053                         page_create_putback(pgcnt);
5054                         kcage_cageout_wakeup();
5055                 }
5056         } while (rpp == NULL);
5057 
5058         ASSERT(PP_ISNORELOC(rpp));
5059 
5060         result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5061 
5062         if (result == 0) {
5063                 *replacement = rpp;
5064                 if (pgcnt != npgs)
5065                         panic("page_relocate_cage: partial relocation");
5066         }
5067 
5068         return (result);
5069 }
5070 
5071 /*
5072  * Release the page lock on a page, place on cachelist
5073  * tail if no longer mapped. Caller can let us know if
5074  * the page is known to be clean.
5075  */
5076 int
5077 page_release(page_t *pp, int checkmod)
5078 {
5079         int status;
5080 
5081         ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5082             (pp->p_vnode != NULL));
5083 
5084         if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5085             ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5086             pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5087             !hat_page_is_mapped(pp)) {
5088 
5089                 /*
5090                  * If the page is modified, just unlock it.
5091                  *
5092                  * The (p_nrm & P_MOD) bit has the latest state because:
5093                  * (1) We found that this page doesn't have any mappings
5094                  *      _after_ holding SE_EXCL and
5095                  * (2) We didn't drop SE_EXCL lock after the check in (1)
5096                  */
5097                 if (checkmod && hat_ismod(pp)) {
5098                         page_unlock(pp);
5099                         status = PGREL_MOD;
5100                 } else {
5101                         /*LINTED: constant in conditional context*/
5102                         VN_DISPOSE(pp, B_FREE, 0, kcred);
5103                         status = PGREL_CLEAN;
5104                 }
5105         } else {
5106                 page_unlock(pp);
5107                 status = PGREL_NOTREL;
5108         }
5109         return (status);
5110 }
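
/*
 * Illustrative sketch (not part of the original code): a caller holding the
 * page lock might dispatch on the three possible dispositions like this:
 *
 *	switch (page_release(pp, 1)) {
 *	case PGREL_CLEAN:
 *		(page was placed on the cachelist; the lock is gone)
 *		break;
 *	case PGREL_MOD:
 *		(page is still dirty; it was only unlocked)
 *		break;
 *	case PGREL_NOTREL:
 *		(page could not be released; it was only unlocked)
 *		break;
 *	}
 */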
5111 
5112 /*
5113  * Given a constituent page, try to demote the large page on the freelist.
5114  *
5115  * Returns nonzero if the page could be demoted successfully. Returns with
5116  * the constituent page still locked.
5117  */
5118 int
5119 page_try_demote_free_pages(page_t *pp)
5120 {
5121         page_t *rootpp = pp;
5122         pfn_t   pfn = page_pptonum(pp);
5123         spgcnt_t npgs;
5124         uint_t  szc = pp->p_szc;
5125 
5126         ASSERT(PP_ISFREE(pp));
5127         ASSERT(PAGE_EXCL(pp));
5128 
5129         /*
5130          * Adjust rootpp and lock it, if `pp' is not the base
5131          * constituent page.
5132          */
5133         npgs = page_get_pagecnt(pp->p_szc);
5134         if (npgs == 1) {
5135                 return (0);
5136         }
5137 
5138         if (!IS_P2ALIGNED(pfn, npgs)) {
5139                 pfn = P2ALIGN(pfn, npgs);
5140                 rootpp = page_numtopp_nolock(pfn);
5141         }
5142 
5143         if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5144                 return (0);
5145         }
5146 
5147         if (rootpp->p_szc != szc) {
5148                 if (pp != rootpp)
5149                         page_unlock(rootpp);
5150                 return (0);
5151         }
5152 
5153         page_demote_free_pages(rootpp);
5154 
5155         if (pp != rootpp)
5156                 page_unlock(rootpp);
5157 
5158         ASSERT(PP_ISFREE(pp));
5159         ASSERT(PAGE_EXCL(pp));
5160         return (1);
5161 }
5162 
5163 /*
5164  * Given a constituent page, try to demote the large page.
5165  *
5166  * Returns nonzero if the page could be demoted successfully. Returns with
5167  * the constituent page still locked.
5168  */
5169 int
5170 page_try_demote_pages(page_t *pp)
5171 {
5172         page_t *tpp, *rootpp = pp;
5173         pfn_t   pfn = page_pptonum(pp);
5174         spgcnt_t i, npgs;
5175         uint_t  szc = pp->p_szc;
5176         vnode_t *vp = pp->p_vnode;
5177 
5178         ASSERT(PAGE_EXCL(pp));
5179 
5180         VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5181 
5182         if (pp->p_szc == 0) {
5183                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5184                 return (1);
5185         }
5186 
5187         if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5188                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5189                 page_demote_vp_pages(pp);
5190                 ASSERT(pp->p_szc == 0);
5191                 return (1);
5192         }
5193 
5194         /*
5195          * Adjust rootpp if the page passed in is not the base
5196          * constituent page.
5197          */
5198         npgs = page_get_pagecnt(pp->p_szc);
5199         ASSERT(npgs > 1);
5200         if (!IS_P2ALIGNED(pfn, npgs)) {
5201                 pfn = P2ALIGN(pfn, npgs);
5202                 rootpp = page_numtopp_nolock(pfn);
5203                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5204                 ASSERT(rootpp->p_vnode != NULL);
5205                 ASSERT(rootpp->p_szc == szc);
5206         }
5207 
5208         /*
5209          * We can't demote kernel pages since we can't hat_unload()
5210          * the mappings.
5211          */
5212         if (VN_ISKAS(rootpp->p_vnode))
5213                 return (0);
5214 
5215         /*
5216          * Attempt to lock all constituent pages except the page passed
5217          * in since it's already locked.
5218          */
5219         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5220                 ASSERT(!PP_ISFREE(tpp));
5221                 ASSERT(tpp->p_vnode != NULL);
5222 
5223                 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5224                         break;
5225                 ASSERT(tpp->p_szc == rootpp->p_szc);
5226                 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5227         }
5228 
5229         /*
5230          * If we failed to lock them all then unlock what we have
5231          * locked so far and bail.
5232          */
5233         if (i < npgs) {
5234                 tpp = rootpp;
5235                 while (i-- > 0) {
5236                         if (tpp != pp)
5237                                 page_unlock(tpp);
5238                         tpp++;
5239                 }
5240                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5241                 return (0);
5242         }
5243 
5244         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5245                 ASSERT(PAGE_EXCL(tpp));
5246                 ASSERT(tpp->p_slckcnt == 0);
5247                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5248                 tpp->p_szc = 0;
5249         }
5250 
5251         /*
5252          * Unlock all pages except the page passed in.
5253          */
5254         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5255                 ASSERT(!hat_page_is_mapped(tpp));
5256                 if (tpp != pp)
5257                         page_unlock(tpp);
5258         }
5259 
5260         VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5261         return (1);
5262 }
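
/*
 * Illustrative sketch (not part of the original code): callers such as
 * page_invalidate_pages() use this routine on an SE_EXCL locked page that
 * might be part of a large page:
 *
 *	if (!page_try_demote_pages(pp)) {
 *		page_unlock(pp);
 *		(skip this page; it could not be demoted)
 *	}
 *	(on success pp is now a PAGESIZE page and is still locked)
 */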
5263 
5264 /*
5265  * Called by page_free() and page_destroy() to demote the page size code
5266  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5267  * p_szc on free list, neither can we just clear p_szc of a single page_t
5268  * within a large page since it will break other code that relies on p_szc
5269  * being the same for all page_t's of a large page). Anonymous pages should
5270  * never end up here because anon_map_getpages() cannot deal with p_szc
5271  * changes after a single constituent page is locked.  While anonymous and
5272  * kernel large pages are demoted or freed an entire large page at a time,
5273  * with all constituent pages locked EXCL, for file system pages we have to
5274  * be able to demote a large page (i.e. decrease the p_szc of all constituent
5275  * pages) with just an EXCL lock on one of the constituent pages. The reason
5276  * we can easily deal with anonymous page demotion an entire large page at a
5277  * time is that those operations originate at the address space level and
5278  * concern the entire large page region, with actual demotion only done when
5279  * the pages are not shared with any other processes (therefore we can always
5280  * get an EXCL lock on all anonymous constituent pages after clearing the
5281  * segment page cache). However, file system pages can be truncated or
5282  * invalidated at a PAGESIZE level from the file system side and end up in
5283  * page_free() or page_destroy() (we also allow only part of a large page to
5284  * be SOFTLOCKed, and therefore pageout should be able to demote a large page
5285  * by EXCL locking any constituent page that is not under SOFTLOCK). In those
5286  * cases we cannot rely on being able to lock EXCL all constituent pages.
5287  *
5288  * To prevent szc changes on file system pages one has to lock all constituent
5289  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5290  * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5291  * prevent szc changes is the hat layer, which uses its own page-level mlist
5292  * locks. The hat assumes that szc doesn't change after the mlist lock for a
5293  * page is taken. Therefore we need to change szc under hat-level locks if we
5294  * only have an EXCL lock on a single constituent page and the hat still
5295  * references any of the constituent pages.  (Note that we can't "ignore" the
5296  * hat layer by simply calling hat_pageunload() on all constituent pages
5297  * without holding EXCL locks on all of them). We use hat_page_demote() to
5298  * safely demote the szc of all constituent pages under hat locks when we
5299  * only have an EXCL lock on one of the constituent pages.
5300  *
5301  * This routine calls page_szc_lock() before calling hat_page_demote() to
5302  * allow segvn in one special case not to lock all constituent pages SHARED
5303  * before calling hat_memload_array() that relies on p_szc not changing even
5304  * before hat level mlist lock is taken.  In that case segvn uses
5305  * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5306  *
5307  * Anonymous or kernel page demotion still has to lock all pages exclusively
5308  * and do hat_pageunload() on all constituent pages before demoting the page
5309  * therefore there's no need for anonymous or kernel page demotion to use
5310  * hat_page_demote() mechanism.
5311  *
5312  * hat_page_demote() removes all large mappings that map pp and then decreases
5313  * p_szc starting from the last constituent page of the large page. Working
5314  * from the tail of the large page in decreasing pfn order allows anyone
5315  * looking at the root page to know that hat_page_demote() is done for the
5316  * root's szc area. E.g. if a root page has szc 1, one only has to lock the
5317  * constituent pages within that szc-1 area to prevent szc changes, because a
5318  * hat_page_demote() that started when the page had szc > 1 is already done.
5319  *
5320  * We are guaranteed that all constituent pages of pp's large page belong to
5321  * the same vnode, with consecutive offsets increasing in the direction of
5322  * the pfn, i.e. the identity of the constituent pages can't change until their
5323  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5324  * large mappings to pp even though we don't lock any constituent page except
5325  * pp (i.e. we won't unload, for example, a kernel-locked page).
5326  */
5327 static void
5328 page_demote_vp_pages(page_t *pp)
5329 {
5330         kmutex_t *mtx;
5331 
5332         ASSERT(PAGE_EXCL(pp));
5333         ASSERT(!PP_ISFREE(pp));
5334         ASSERT(pp->p_vnode != NULL);
5335         ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5336         ASSERT(!PP_ISKAS(pp));
5337 
5338         VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5339 
5340         mtx = page_szc_lock(pp);
5341         if (mtx != NULL) {
5342                 hat_page_demote(pp);
5343                 mutex_exit(mtx);
5344         }
5345         ASSERT(pp->p_szc == 0);
5346 }
5347 
5348 /*
5349  * Mark any existing pages for migration in the given range
5350  */
5351 void
5352 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5353     struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5354     u_offset_t vnoff, int rflag)
5355 {
5356         struct anon     *ap;
5357         vnode_t         *curvp;
5358         lgrp_t          *from;
5359         pgcnt_t         nlocked;
5360         u_offset_t      off;
5361         pfn_t           pfn;
5362         size_t          pgsz;
5363         size_t          segpgsz;
5364         pgcnt_t         pages;
5365         uint_t          pszc;
5366         page_t          *pp0, *pp;
5367         caddr_t         va;
5368         ulong_t         an_idx;
5369         anon_sync_obj_t cookie;
5370 
5371         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5372 
5373         /*
5374          * Don't do anything if lgroup optimizations are not needed
5375          * on this system.
5376          */
5377         if (!lgrp_optimizations())
5378                 return;
5379 
5380         /*
5381          * Align address and length to (potentially large) page boundary
5382          */
5383         segpgsz = page_get_pagesize(seg->s_szc);
5384         addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5385         if (rflag)
5386                 len = P2ROUNDUP(len, segpgsz);
5387 
5388         /*
5389          * Do one (large) page at a time
5390          */
5391         va = addr;
5392         while (va < addr + len) {
5393                 /*
5394                  * Look up the (root) page for the vnode and offset
5395                  * corresponding to this virtual address.
5396                  * Try the anonmap first since there may be copy-on-write
5397                  * pages, but initialize the vnode pointer and offset using
5398                  * the vnode arguments just in case there isn't an amp.
5399                  */
5400                 curvp = vp;
5401                 off = vnoff + va - seg->s_base;
5402                 if (amp) {
5403                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5404                         an_idx = anon_index + seg_page(seg, va);
5405                         anon_array_enter(amp, an_idx, &cookie);
5406                         ap = anon_get_ptr(amp->ahp, an_idx);
5407                         if (ap)
5408                                 swap_xlate(ap, &curvp, &off);
5409                         anon_array_exit(&cookie);
5410                         ANON_LOCK_EXIT(&amp->a_rwlock);
5411                 }
5412 
5413                 pp = NULL;
5414                 if (curvp)
5415                         pp = page_lookup(curvp, off, SE_SHARED);
5416 
5417                 /*
5418                  * If there isn't a page at this virtual address,
5419                  * skip to next page
5420                  */
5421                 if (pp == NULL) {
5422                         va += PAGESIZE;
5423                         continue;
5424                 }
5425 
5426                 /*
5427                  * Figure out which lgroup this page is in for kstats
5428                  */
5429                 pfn = page_pptonum(pp);
5430                 from = lgrp_pfn_to_lgrp(pfn);
5431 
5432                 /*
5433                  * Get the page size, and round up and skip to the next page
5434                  * boundary if the address is unaligned.
5435                  */
5436                 pszc = pp->p_szc;
5437                 pgsz = page_get_pagesize(pszc);
5438                 pages = btop(pgsz);
5439                 if (!IS_P2ALIGNED(va, pgsz) ||
5440                     !IS_P2ALIGNED(pfn, pages) ||
5441                     pgsz > segpgsz) {
5442                         pgsz = MIN(pgsz, segpgsz);
5443                         page_unlock(pp);
5444                         pages = btop(P2END((uintptr_t)va, pgsz) -
5445                             (uintptr_t)va);
5446                         va = (caddr_t)P2END((uintptr_t)va, pgsz);
5447                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5448                         continue;
5449                 }
5450 
5451                 /*
5452                  * Upgrade to exclusive lock on page
5453                  */
5454                 if (!page_tryupgrade(pp)) {
5455                         page_unlock(pp);
5456                         va += pgsz;
5457                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5458                             btop(pgsz));
5459                         continue;
5460                 }
5461 
5462                 pp0 = pp++;
5463                 nlocked = 1;
5464 
5465                 /*
5466                  * Lock constituent pages if this is large page
5467                  */
5468                 if (pages > 1) {
5469                         /*
5470                          * Lock all constituents except root page, since it
5471                          * should be locked already.
5472                          */
5473                         for (; nlocked < pages; nlocked++) {
5474                                 if (!page_trylock(pp, SE_EXCL)) {
5475                                         break;
5476                                 }
5477                                 if (PP_ISFREE(pp) ||
5478                                     pp->p_szc != pszc) {
5479                                         /*
5480                                          * hat_page_demote() raced in with us.
5481                                          */
5482                                         ASSERT(!IS_SWAPFSVP(curvp));
5483                                         page_unlock(pp);
5484                                         break;
5485                                 }
5486                                 pp++;
5487                         }
5488                 }
5489 
5490                 /*
5491                  * If all constituent pages couldn't be locked,
5492                  * unlock pages locked so far and skip to next page.
5493                  */
5494                 if (nlocked < pages) {
5495                         while (pp0 < pp) {
5496                                 page_unlock(pp0++);
5497                         }
5498                         va += pgsz;
5499                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5500                             btop(pgsz));
5501                         continue;
5502                 }
5503 
5504                 /*
5505                  * hat_page_demote() can no longer happen
5506                  * since the last constituent page had the right p_szc after
5507                  * all constituent pages were locked. All constituent pages
5508                  * should now have the same p_szc.
5509                  */
5510 
5511                 /*
5512                  * All constituent pages were locked successfully, so mark the
5513                  * large page for migration and unload the mappings of its
5514                  * constituent pages, so that a fault will occur on any part of
5515                  * the large page.
5516                  */
5517                 PP_SETMIGRATE(pp0);
5518                 while (pp0 < pp) {
5519                         (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5520                         ASSERT(hat_page_getshare(pp0) == 0);
5521                         page_unlock(pp0++);
5522                 }
5523                 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5524 
5525                 va += pgsz;
5526         }
5527 }
5528 
5529 /*
5530  * Migrate any pages that have been marked for migration in the given range
5531  */
5532 void
5533 page_migrate(
5534         struct seg      *seg,
5535         caddr_t         addr,
5536         page_t          **ppa,
5537         pgcnt_t         npages)
5538 {
5539         lgrp_t          *from;
5540         lgrp_t          *to;
5541         page_t          *newpp;
5542         page_t          *pp;
5543         pfn_t           pfn;
5544         size_t          pgsz;
5545         spgcnt_t        page_cnt;
5546         spgcnt_t        i;
5547         uint_t          pszc;
5548 
5549         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5550 
5551         while (npages > 0) {
5552                 pp = *ppa;
5553                 pszc = pp->p_szc;
5554                 pgsz = page_get_pagesize(pszc);
5555                 page_cnt = btop(pgsz);
5556 
5557                 /*
5558                  * Check to see whether this page is marked for migration
5559                  *
5560                  * Assume that root page of large page is marked for
5561                  * migration and none of the other constituent pages
5562                  * are marked.  This really simplifies clearing the
5563                  * migrate bit by not having to clear it from each
5564                  * constituent page.
5565                  *
5566                  * note we don't want to relocate an entire large page if
5567                  * someone is only using one subpage.
5568                  */
5569                 if (npages < page_cnt)
5570                         break;
5571 
5572                 /*
5573                  * Is it marked for migration?
5574                  */
5575                 if (!PP_ISMIGRATE(pp))
5576                         goto next;
5577 
5578                 /*
5579                  * Determine lgroups that page is being migrated between
5580                  */
5581                 pfn = page_pptonum(pp);
5582                 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5583                         break;
5584                 }
5585                 from = lgrp_pfn_to_lgrp(pfn);
5586                 to = lgrp_mem_choose(seg, addr, pgsz);
5587 
5588                 /*
5589                  * Need to get exclusive locks to migrate.
5590                  */
5591                 for (i = 0; i < page_cnt; i++) {
5592                         ASSERT(PAGE_LOCKED(ppa[i]));
5593                         if (page_pptonum(ppa[i]) != pfn + i ||
5594                             ppa[i]->p_szc != pszc) {
5595                                 break;
5596                         }
5597                         if (!page_tryupgrade(ppa[i])) {
5598                                 lgrp_stat_add(from->lgrp_id,
5599                                     LGRP_PM_FAIL_LOCK_PGS,
5600                                     page_cnt);
5601                                 break;
5602                         }
5603 
5604                         /*
5605                          * Check to see whether we are trying to migrate the
5606                          * page to the lgroup where it is already allocated.
5607                          * If so, clear the migrate bit and skip to the next
5608                          * page.
5609                          */
5610                         if (i == 0 && to == from) {
5611                                 PP_CLRMIGRATE(ppa[0]);
5612                                 page_downgrade(ppa[0]);
5613                                 goto next;
5614                         }
5615                 }
5616 
5617                 /*
5618                  * If all constituent pages couldn't be locked,
5619                  * unlock pages locked so far and skip to next page.
5620                  */
5621                 if (i != page_cnt) {
5622                         while (--i != -1) {
5623                                 page_downgrade(ppa[i]);
5624                         }
5625                         goto next;
5626                 }
5627 
5628                 (void) page_create_wait(page_cnt, PG_WAIT);
5629                 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5630                 if (newpp == NULL) {
5631                         page_create_putback(page_cnt);
5632                         for (i = 0; i < page_cnt; i++) {
5633                                 page_downgrade(ppa[i]);
5634                         }
5635                         lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5636                             page_cnt);
5637                         goto next;
5638                 }
5639                 ASSERT(newpp->p_szc == pszc);
5640                 /*
5641                  * Clear migrate bit and relocate page
5642                  */
5643                 PP_CLRMIGRATE(pp);
5644                 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5645                         panic("page_migrate: page_relocate failed");
5646                 }
5647                 ASSERT(page_cnt * PAGESIZE == pgsz);
5648 
5649                 /*
5650                  * Keep stats for number of pages migrated from and to
5651                  * each lgroup
5652                  */
5653                 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5654                 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5655                 /*
5656                  * Update the page_t array we were passed and unlink the
5657                  * constituent pages of the large page.
5658                  */
5659                 for (i = 0; i < page_cnt; ++i, ++pp) {
5660                         ASSERT(PAGE_EXCL(newpp));
5661                         ASSERT(newpp->p_szc == pszc);
5662                         ppa[i] = newpp;
5663                         pp = newpp;
5664                         page_sub(&newpp, pp);
5665                         page_downgrade(pp);
5666                 }
5667                 ASSERT(newpp == NULL);
5668 next:
5669                 addr += pgsz;
5670                 ppa += page_cnt;
5671                 npages -= page_cnt;
5672         }
5673 }
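
/*
 * Illustrative sketch (not part of the original code): page_mark_migrate()
 * and page_migrate() are used as a two-step pattern by segment drivers,
 * assuming the address space lock is held:
 *
 *	1. page_mark_migrate(seg, addr, len, amp, aidx, vp, off, rflag)
 *	   marks the existing (large) pages and unloads their mappings so
 *	   that the next touch faults.
 *	2. On that fault, once the pages have been looked up and locked
 *	   again, page_migrate(seg, addr, ppa, npages) relocates the marked
 *	   pages to the lgroup chosen by lgrp_mem_choose().
 */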
5674 
5675 uint_t page_reclaim_maxcnt = 60; /* max total iterations */
5676 uint_t page_reclaim_nofree_maxcnt = 3; /* max iterations without progress */
5677 /*
5678  * Reclaim/reserve availrmem for npages.
5679  * If there is not enough memory start reaping seg, kmem caches.
5680  * Start pageout scanner (via page_needfree()).
5681  * Exit after ~page_reclaim_maxcnt seconds even if little memory was released.
5682  * Note: There is no guarantee that any availrmem will be freed as
5683  * this memory typically is locked (kernel heap) or reserved for swap.
5684  * Also, due to memory fragmentation the kmem allocator may not be able
5685  * to free any memory (a single user-allocated buffer can prevent a slab
5686  * or a page from being freed).
5687  */
5688 int
5689 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5690 {
5691         int     i = 0;
5692         int     i_nofree = 0;
5693         int     ret = 0;
5694         pgcnt_t deficit;
5695         pgcnt_t old_availrmem = 0;
5696 
5697         mutex_enter(&freemem_lock);
5698         while (availrmem < tune.t_minarmem + npages + epages &&
5699             i++ < page_reclaim_maxcnt) {
5700                 /* ensure we made some progress in the last few iterations */
5701                 if (old_availrmem < availrmem) {
5702                         old_availrmem = availrmem;
5703                         i_nofree = 0;
5704                 } else if (i_nofree++ >= page_reclaim_nofree_maxcnt) {
5705                         break;
5706                 }
5707 
5708                 deficit = tune.t_minarmem + npages + epages - availrmem;
5709                 mutex_exit(&freemem_lock);
5710                 page_needfree(deficit);
5711                 kmem_reap();
5712                 delay(hz);
5713                 page_needfree(-(spgcnt_t)deficit);
5714                 mutex_enter(&freemem_lock);
5715         }
5716 
5717         if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5718                 availrmem -= npages;
5719                 ret = 1;
5720         }
5721 
5722         mutex_exit(&freemem_lock);
5723 
5724         return (ret);
5725 }
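
/*
 * Illustrative sketch (not part of the original code, and the give-back
 * convention is an assumption): a caller reserving availrmem for a
 * long-lived allocation might do:
 *
 *	if (!page_reclaim_mem(npages, 0, 1))
 *		return (ENOMEM);	(could not reserve the memory)
 *	...
 *	(when done, return the reservation)
 *	mutex_enter(&freemem_lock);
 *	availrmem += npages;
 *	mutex_exit(&freemem_lock);
 */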
5726 
5727 /*
5728  * Search the memory segments to locate the desired page.  Within a
5729  * segment, pages increase linearly with one page structure per
5730  * physical page frame (size PAGESIZE).  The search begins
5731  * with the segment that was accessed last, to take advantage of locality.
5732  * If the hint misses, we fall back to the memseg hash and then a linear scan.
5733  */
5734 
5735 
5736 /*
5737  * Some data structures for pfn to pp lookup.
5738  */
5739 ulong_t mhash_per_slot;
5740 struct memseg *memseg_hash[N_MEM_SLOTS];
5741 
5742 page_t *
5743 page_numtopp_nolock(pfn_t pfnum)
5744 {
5745         struct memseg *seg;
5746         page_t *pp;
5747         vm_cpu_data_t *vc;
5748 
5749         /*
5750          * We need to disable kernel preemption while referencing the
5751          * cpu_vm_data field in order to prevent us from being switched to
5752          * another cpu and trying to reference it after it has been freed.
5753          * This will keep us on cpu and prevent it from being removed while
5754          * we are still on it.
5755          *
5756          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5757          * which is being reused by DR, which will flush those references
5758          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5759          */
5760         kpreempt_disable();
5761         vc = CPU->cpu_vm_data;
5762         ASSERT(vc != NULL);
5763 
5764         MEMSEG_STAT_INCR(nsearch);
5765 
5766         /* Try last winner first */
5767         if (((seg = vc->vc_pnum_memseg) != NULL) &&
5768             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5769                 MEMSEG_STAT_INCR(nlastwon);
5770                 pp = seg->pages + (pfnum - seg->pages_base);
5771                 if (pp->p_pagenum == pfnum) {
5772                         kpreempt_enable();
5773                         return ((page_t *)pp);
5774                 }
5775         }
5776 
5777         /* Else Try hash */
5778         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5779             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5780                 MEMSEG_STAT_INCR(nhashwon);
5781                 vc->vc_pnum_memseg = seg;
5782                 pp = seg->pages + (pfnum - seg->pages_base);
5783                 if (pp->p_pagenum == pfnum) {
5784                         kpreempt_enable();
5785                         return ((page_t *)pp);
5786                 }
5787         }
5788 
5789         /* Else Brute force */
5790         for (seg = memsegs; seg != NULL; seg = seg->next) {
5791                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5792                         vc->vc_pnum_memseg = seg;
5793                         pp = seg->pages + (pfnum - seg->pages_base);
5794                         if (pp->p_pagenum == pfnum) {
5795                                 kpreempt_enable();
5796                                 return ((page_t *)pp);
5797                         }
5798                 }
5799         }
5800         vc->vc_pnum_memseg = NULL;
5801         kpreempt_enable();
5802         MEMSEG_STAT_INCR(nnotfound);
5803         return ((page_t *)NULL);
5804 
5805 }
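
/*
 * Illustrative sketch (not part of the build): a hypothetical caller that
 * translates a pfn to its page_t with page_numtopp_nolock() and verifies the
 * translation.  The NULL check matters because the pfn may fall in a hole
 * that no memseg covers.
 *
 *	page_t *pp = page_numtopp_nolock(pfnum);
 *	if (pp != NULL)
 *		ASSERT(page_pptonum(pp) == pfnum);
 */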
5806 
5807 struct memseg *
5808 page_numtomemseg_nolock(pfn_t pfnum)
5809 {
5810         struct memseg *seg;
5811         page_t *pp;
5812 
5813         /*
5814          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5815          * which is being reused by DR, which will flush those references
5816          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5817          */
5818         kpreempt_disable();
5819         /* Try hash */
5820         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5821             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5822                 pp = seg->pages + (pfnum - seg->pages_base);
5823                 if (pp->p_pagenum == pfnum) {
5824                         kpreempt_enable();
5825                         return (seg);
5826                 }
5827         }
5828 
5829         /* Else Brute force */
5830         for (seg = memsegs; seg != NULL; seg = seg->next) {
5831                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5832                         pp = seg->pages + (pfnum - seg->pages_base);
5833                         if (pp->p_pagenum == pfnum) {
5834                                 kpreempt_enable();
5835                                 return (seg);
5836                         }
5837                 }
5838         }
5839         kpreempt_enable();
5840         return ((struct memseg *)NULL);
5841 }
5842 
5843 /*
5844  * Given a page and a count return the page struct that is
5845  * n structs away from the current one in the global page
5846  * list.
5847  *
5848  * This function wraps to the first page upon
5849  * reaching the end of the memseg list.
5850  */
5851 page_t *
5852 page_nextn(page_t *pp, ulong_t n)
5853 {
5854         struct memseg *seg;
5855         page_t *ppn;
5856         vm_cpu_data_t *vc;
5857 
5858         /*
5859          * We need to disable kernel preemption while referencing the
5860          * cpu_vm_data field in order to prevent us from being switched to
5861          * another cpu and trying to reference it after it has been freed.
5862          * This will keep us on cpu and prevent it from being removed while
5863          * we are still on it.
5864          *
5865          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5866          * which is being reused by DR, which will flush those references
5867          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5868          */
5869         kpreempt_disable();
5870         vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5871 
5872         ASSERT(vc != NULL);
5873 
5874         if (((seg = vc->vc_pnext_memseg) == NULL) ||
5875             (seg->pages_base == seg->pages_end) ||
5876             !(pp >= seg->pages && pp < seg->epages)) {
5877 
5878                 for (seg = memsegs; seg; seg = seg->next) {
5879                         if (pp >= seg->pages && pp < seg->epages)
5880                                 break;
5881                 }
5882 
5883                 if (seg == NULL) {
5884                         /* Memory delete got in, return something valid. */
5885                         /* TODO: fix me. */
5886                         seg = memsegs;
5887                         pp = seg->pages;
5888                 }
5889         }
5890 
5891         /* check for wraparound - possible if n is large */
5892         while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5893                 n -= seg->epages - pp;
5894                 seg = seg->next;
5895                 if (seg == NULL)
5896                         seg = memsegs;
5897                 pp = seg->pages;
5898         }
5899         vc->vc_pnext_memseg = seg;
5900         kpreempt_enable();
5901         return (ppn);
5902 }
5903 
5904 /*
5905  * Initialize for a loop using page_next_scan_large().
5906  */
5907 page_t *
5908 page_next_scan_init(void **cookie)
5909 {
5910         ASSERT(cookie != NULL);
5911         *cookie = (void *)memsegs;
5912         return ((page_t *)memsegs->pages);
5913 }
5914 
5915 /*
5916  * Return the next page in a scan of page_t's, assuming we want
5917  * to skip over sub-pages within larger page sizes.
5918  *
5919  * The cookie is used to keep track of the current memseg.
5920  */
5921 page_t *
5922 page_next_scan_large(
5923         page_t          *pp,
5924         ulong_t         *n,
5925         void            **cookie)
5926 {
5927         struct memseg   *seg = (struct memseg *)*cookie;
5928         page_t          *new_pp;
5929         ulong_t         cnt;
5930         pfn_t           pfn;
5931 
5932 
5933         /*
5934          * get the count of page_t's to skip based on the page size
5935          */
5936         ASSERT(pp != NULL);
5937         if (pp->p_szc == 0) {
5938                 cnt = 1;
5939         } else {
5940                 pfn = page_pptonum(pp);
5941                 cnt = page_get_pagecnt(pp->p_szc);
5942                 cnt -= pfn & (cnt - 1);
5943         }
5944         *n += cnt;
5945         new_pp = pp + cnt;
5946 
5947         /*
5948          * Catch if we went past the end of the current memory segment. If so,
5949          * just move to the next segment with pages.
5950          */
5951         if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5952                 do {
5953                         seg = seg->next;
5954                         if (seg == NULL)
5955                                 seg = memsegs;
5956                 } while (seg->pages_base == seg->pages_end);
5957                 new_pp = seg->pages;
5958                 *cookie = (void *)seg;
5959         }
5960 
5961         return (new_pp);
5962 }
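
/*
 * Illustrative sketch (not part of the build): the loop shape the two
 * routines above are meant for, with a hypothetical per-page visit() hook.
 * The counter that page_next_scan_large() advances is compared against
 * total_pages so the scan terminates even though the walk wraps around.
 *
 *	void	*cookie;
 *	ulong_t	n = 0;
 *	page_t	*pp = page_next_scan_init(&cookie);
 *
 *	while (n < total_pages) {
 *		visit(pp);
 *		pp = page_next_scan_large(pp, &n, &cookie);
 *	}
 */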
5963 
5964 
5965 /*
5966  * Returns next page in list. Note: this function wraps
5967  * to the first page in the list upon reaching the end
5968  * of the list. Callers should be aware of this fact.
5969  */
5970 
5971 /* We should change this to be a #define. */
5972 
5973 page_t *
5974 page_next(page_t *pp)
5975 {
5976         return (page_nextn(pp, 1));
5977 }
5978 
5979 page_t *
5980 page_first()
5981 {
5982         return ((page_t *)memsegs->pages);
5983 }
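
/*
 * Illustrative sketch (not part of the build): visiting every page once with
 * page_first()/page_next().  Because page_next() wraps back to the start of
 * the list, the loop is bounded by a count rather than by a NULL return;
 * examine() is a hypothetical per-page hook.
 *
 *	page_t	*pp = page_first();
 *	pgcnt_t	i;
 *
 *	for (i = 0; i < total_pages; i++) {
 *		examine(pp);
 *		pp = page_next(pp);
 *	}
 */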
5984 
5985 
5986 /*
5987  * This routine is called at boot with the initial memory configuration
5988  * and when memory is added or removed.
5989  */
5990 void
5991 build_pfn_hash()
5992 {
5993         pfn_t cur;
5994         pgcnt_t index;
5995         struct memseg *pseg;
5996         int     i;
5997 
5998         /*
5999          * Clear memseg_hash array.
6000          * Since memory add/delete is designed to operate concurrently
6001          * with normal operation, the hash rebuild must be able to run
6002          * concurrently with page_numtopp_nolock(). To support this
6003          * functionality, assignments to memseg_hash array members must
6004          * be done atomically.
6005          *
6006          * NOTE: bzero() does not currently guarantee this for kernel
6007          * threads, and cannot be used here.
6008          */
6009         for (i = 0; i < N_MEM_SLOTS; i++)
6010                 memseg_hash[i] = NULL;
6011 
6012         hat_kpm_mseghash_clear(N_MEM_SLOTS);
6013 
6014         /*
6015          * Physmax is the last valid pfn.
6016          */
6017         mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6018         for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6019                 index = MEMSEG_PFN_HASH(pseg->pages_base);
6020                 cur = pseg->pages_base;
6021                 do {
6022                         if (index >= N_MEM_SLOTS)
6023                                 index = MEMSEG_PFN_HASH(cur);
6024 
6025                         if (memseg_hash[index] == NULL ||
6026                             memseg_hash[index]->pages_base > pseg->pages_base) {
6027                                 memseg_hash[index] = pseg;
6028                                 hat_kpm_mseghash_update(index, pseg);
6029                         }
6030                         cur += mhash_per_slot;
6031                         index++;
6032                 } while (cur < pseg->pages_end);
6033         }
6034 }
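
/*
 * A rough sketch of the resulting lookup, stated as an assumption about what
 * MEMSEG_PFN_HASH() boils down to rather than its exact definition: the pfn
 * range [0, physmax] is divided into N_MEM_SLOTS slots of mhash_per_slot
 * pfns each, so
 *
 *	slot = pfnum / mhash_per_slot;		(bounded by N_MEM_SLOTS)
 *	seg  = memseg_hash[slot];
 *
 * and the loop above stores in each slot the memseg with the lowest
 * pages_base among those overlapping that slot, which is why a hash hit in
 * page_numtopp_nolock() must still range-check the pfn against the memseg.
 */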
6035 
6036 /*
6037  * Return the pagenum for the pp
6038  */
6039 pfn_t
6040 page_pptonum(page_t *pp)
6041 {
6042         return (pp->p_pagenum);
6043 }
6044 
6045 /*
6046  * Interface to the referenced, modified, etc. bits
6047  * in the PSM part of the page struct
6048  * when no locking is desired.
6049  */
6050 void
6051 page_set_props(page_t *pp, uint_t flags)
6052 {
6053         ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6054         pp->p_nrm |= (uchar_t)flags;
6055 }
6056 
6057 void
6058 page_clr_all_props(page_t *pp)
6059 {
6060         pp->p_nrm = 0;
6061 }
6062 
6063 /*
6064  * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6065  */
6066 int
6067 page_clear_lck_cow(page_t *pp, int adjust)
6068 {
6069         int     f_amount;
6070 
6071         ASSERT(PAGE_EXCL(pp));
6072 
6073         /*
6074          * The page_struct_lock need not be acquired here since
6075          * we require the caller hold the page exclusively locked.
6076          */
6077         f_amount = 0;
6078         if (pp->p_lckcnt) {
6079                 f_amount = 1;
6080                 pp->p_lckcnt = 0;
6081         }
6082         if (pp->p_cowcnt) {
6083                 f_amount += pp->p_cowcnt;
6084                 pp->p_cowcnt = 0;
6085         }
6086 
6087         if (adjust && f_amount) {
6088                 mutex_enter(&freemem_lock);
6089                 availrmem += f_amount;
6090                 mutex_exit(&freemem_lock);
6091         }
6092 
6093         return (f_amount);
6094 }
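
/*
 * Worked example of the accounting above: with p_lckcnt == 3 and
 * p_cowcnt == 2, f_amount is 1 + 2 == 3.  A nonzero p_lckcnt contributes at
 * most one page to the total regardless of its value, while each COW
 * reservation had charged availrmem individually, so all of them are
 * returned.
 */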
6095 
6096 /*
6097  * The following function is called from free_vp_pages()
6098  * for an inexact estimate of a newly free'd page...
6099  */
6100 ulong_t
6101 page_share_cnt(page_t *pp)
6102 {
6103         return (hat_page_getshare(pp));
6104 }
6105 
6106 int
6107 page_isshared(page_t *pp)
6108 {
6109         return (hat_page_checkshare(pp, 1));
6110 }
6111 
6112 int
6113 page_isfree(page_t *pp)
6114 {
6115         return (PP_ISFREE(pp));
6116 }
6117 
6118 int
6119 page_isref(page_t *pp)
6120 {
6121         return (hat_page_getattr(pp, P_REF));
6122 }
6123 
6124 int
6125 page_ismod(page_t *pp)
6126 {
6127         return (hat_page_getattr(pp, P_MOD));
6128 }
6129 
6130 /*
6131  * The following code all currently relates to the page capture logic:
6132  *
6133  * This logic is used for cases where there is a desire to claim a certain
6134  * physical page in the system for the caller.  As it may not be possible
6135  * to capture the page immediately, the p_toxic bits are used in the page
6136  * structure to indicate that someone wants to capture this page.  When the
6137  * page gets unlocked, the toxic flag will be noted and an attempt to capture
6138  * the page will be made.  If it is successful, the original caller's callback
6139  * will be called with the page to do with it what they please.
6140  *
6141  * There is also an async thread which occasionally wakes up to attempt to
6142  * capture pages which have the capture bit set.  All of the pages which
6143  * need to be captured asynchronously have been inserted into the
6144  * page_capture_hash and thus this thread walks that hash list.  Items in the
6145  * hash have an expiration time so this thread handles that as well by removing
6146  * the item from the hash if it has expired.
6147  *
6148  * Some important things to note are:
6149  * - if the PR_CAPTURE bit is set on a page, then the page is in the
6150  *   page_capture_hash.  The page_capture_hash_head.pchh_mutex is needed
6151  *   to set and clear this bit, and only while that lock is held can an
6152  *   entry be added to or removed from the hash.
6153  * - the PR_CAPTURE bit can only be set and cleared while holding the
6154  *   page_capture_hash_head.pchh_mutex
6155  * - the t_flag field of the thread struct is used with the T_CAPTURING
6156  *   flag to prevent recursion while dealing with large pages.
6157  * - pages which need to be retired never expire on the page_capture_hash.
6158  */
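
/*
 * A condensed sketch of that lifecycle from a consumer's point of view (the
 * names my_index, my_duration, my_flags, my_datap and my_cb are placeholders,
 * not what any real consumer passes):
 *
 *	page_capture_register_callback(my_index, my_duration, my_cb);
 *		...
 *	ret = page_trycapture(pp, 0, my_flags, my_datap);
 *		- on success, my_cb() was handed the captured page;
 *		- otherwise the request may sit in page_capture_hash with
 *		  PR_CAPTURE set, to be retried at page_unlock() time or by
 *		  the async page_capture_thread;
 *		...
 *	page_capture_unregister_callback(my_index);
 */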
6159 
6160 static void page_capture_thread(void);
6161 static kthread_t *pc_thread_id;
6162 kcondvar_t pc_cv;
6163 static kmutex_t pc_thread_mutex;
6164 static clock_t pc_thread_shortwait;
6165 static clock_t pc_thread_longwait;
6166 static int pc_thread_retry;
6167 
6168 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6169 
6170 /* Note that this is a circular linked list */
6171 typedef struct page_capture_hash_bucket {
6172         page_t *pp;
6173         uchar_t szc;
6174         uchar_t pri;
6175         uint_t flags;
6176         clock_t expires;        /* lbolt at which this request expires. */
6177         void *datap;            /* Cached data passed in for callback */
6178         struct page_capture_hash_bucket *next;
6179         struct page_capture_hash_bucket *prev;
6180 } page_capture_hash_bucket_t;
6181 
6182 #define PC_PRI_HI       0       /* capture now */
6183 #define PC_PRI_LO       1       /* capture later */
6184 #define PC_NUM_PRI      2
6185 
6186 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6187 
6188 
6189 /*
6190  * Each hash bucket will have its own mutex and two lists which are:
6191  * active (0):  represents requests which have not been processed by
6192  *              the page_capture async thread yet.
6193  * walked (1):  represents requests which have been processed by the
6194  *              page_capture async thread within its given walk of this bucket.
6195  *
6196  * These are all needed so that we can synchronize all async page_capture
6197  * events.  When the async thread moves to a new bucket, it will append the
6198  * walked list to the active list and walk each item one at a time, moving it
6199  * from the active list to the walked list.  Thus if there is an async request
6200  * outstanding for a given page, it will always be in one of the two lists.
6201  * New requests will always be added to the active list.
6202  * If we were not able to capture a page before the request expired, we'd free
6203  * up the request structure which would indicate to page_capture that there is
6204  * no longer a need for the given page, and clear the PR_CAPTURE flag if
6205  * possible.
6206  */
6207 typedef struct page_capture_hash_head {
6208         kmutex_t pchh_mutex;
6209         uint_t num_pages[PC_NUM_PRI];
6210         page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6211 } page_capture_hash_head_t;
6212 
6213 #ifdef DEBUG
6214 #define NUM_PAGE_CAPTURE_BUCKETS 4
6215 #else
6216 #define NUM_PAGE_CAPTURE_BUCKETS 64
6217 #endif
6218 
6219 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6220 
6221 /* for now use a very simple hash based upon the size of a page struct */
6222 #define PAGE_CAPTURE_HASH(pp)   \
6223         ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
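
/*
 * Illustrative note (an assumption about the shift, not a statement of the
 * exact page_t size): shifting the page_t address right by 7 treats page
 * structs as roughly 128 bytes apart, so neighboring page_t's tend to spread
 * across neighboring buckets.  With the non-DEBUG bucket count of 64, for
 * example (addresses made up):
 *
 *	pp == ...0x1200:  (0x1200 >> 7) & 63 == 36
 *	pp == ...0x1280:  (0x1280 >> 7) & 63 == 37
 */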
6224 
6225 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6226 
6227 /*
6228  * A callback function is required for page capture requests.
6229  */
6230 void
6231 page_capture_register_callback(uint_t index, clock_t duration,
6232     int (*cb_func)(page_t *, void *, uint_t))
6233 {
6234         ASSERT(pc_cb[index].cb_active == 0);
6235         ASSERT(cb_func != NULL);
6236         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6237         pc_cb[index].duration = duration;
6238         pc_cb[index].cb_func = cb_func;
6239         pc_cb[index].cb_active = 1;
6240         rw_exit(&pc_cb[index].cb_rwlock);
6241 }
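
/*
 * Illustrative sketch (not part of the build): how a consumer might register
 * and later tear down a callback.  MY_INDEX, the duration and my_capture_cb
 * are hypothetical; real consumers use their own assigned slot in pc_cb[].
 *
 *	static int
 *	my_capture_cb(page_t *pp, void *datap, uint_t flags)
 *	{
 *		... take over pp, which arrives exclusively locked ...
 *		return (0);
 *	}
 *
 *	page_capture_register_callback(MY_INDEX, 60 * hz, my_capture_cb);
 *	...
 *	page_capture_unregister_callback(MY_INDEX);
 */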
6242 
6243 void
6244 page_capture_unregister_callback(uint_t index)
6245 {
6246         int i, j;
6247         struct page_capture_hash_bucket *bp1;
6248         struct page_capture_hash_bucket *bp2;
6249         struct page_capture_hash_bucket *head = NULL;
6250         uint_t flags = (1 << index);
6251 
6252         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6253         ASSERT(pc_cb[index].cb_active == 1);
6254         pc_cb[index].duration = 0;      /* Paranoia */
6255         pc_cb[index].cb_func = NULL;    /* Paranoia */
6256         pc_cb[index].cb_active = 0;
6257         rw_exit(&pc_cb[index].cb_rwlock);
6258 
6259         /*
6260          * Just move all the entries to a private list which we can walk
6261          * through without the need to hold any locks.
6262          * No more requests can get added to the hash lists for this consumer
6263          * as the cb_active field for the callback has been cleared.
6264          */
6265         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6266                 mutex_enter(&page_capture_hash[i].pchh_mutex);
6267                 for (j = 0; j < 2; j++) {
6268                         bp1 = page_capture_hash[i].lists[j].next;
6269                         /* walk through all but first (sentinel) element */
6270                         while (bp1 != &page_capture_hash[i].lists[j]) {
6271                                 bp2 = bp1;
6272                                 if (bp2->flags & flags) {
6273                                         bp1 = bp2->next;
6274                                         bp1->prev = bp2->prev;
6275                                         bp2->prev->next = bp1;
6276                                         bp2->next = head;
6277                                         head = bp2;
6278                                         /*
6279                                          * Clear the PR_CAPTURE bit as we
6280                                          * hold appropriate locks here.
6281                                          */
6282                                         page_clrtoxic(head->pp, PR_CAPTURE);
6283                                         page_capture_hash[i].
6284                                             num_pages[bp2->pri]--;
6285                                         continue;
6286                                 }
6287                                 bp1 = bp1->next;
6288                         }
6289                 }
6290                 mutex_exit(&page_capture_hash[i].pchh_mutex);
6291         }
6292 
6293         while (head != NULL) {
6294                 bp1 = head;
6295                 head = head->next;
6296                 kmem_free(bp1, sizeof (*bp1));
6297         }
6298 }
6299 
6300 
6301 /*
6302  * Find pp in the active list and move it to the walked list if it
6303  * exists.
6304  * Note that most often pp should be at the front of the active list
6305  * as it is the entry currently in use, and thus no other sort of
6306  * optimization is done here, as this is a linked list data structure.
6307  * Returns 1 on successful move or 0 if page could not be found.
6308  */
6309 static int
6310 page_capture_move_to_walked(page_t *pp)
6311 {
6312         page_capture_hash_bucket_t *bp;
6313         int index;
6314 
6315         index = PAGE_CAPTURE_HASH(pp);
6316 
6317         mutex_enter(&page_capture_hash[index].pchh_mutex);
6318         bp = page_capture_hash[index].lists[0].next;
6319         while (bp != &page_capture_hash[index].lists[0]) {
6320                 if (bp->pp == pp) {
6321                         /* Remove from old list */
6322                         bp->next->prev = bp->prev;
6323                         bp->prev->next = bp->next;
6324 
6325                         /* Add to new list */
6326                         bp->next = page_capture_hash[index].lists[1].next;
6327                         bp->prev = &page_capture_hash[index].lists[1];
6328                         page_capture_hash[index].lists[1].next = bp;
6329                         bp->next->prev = bp;
6330 
6331                         /*
6332                          * There is a small probability of a page on a free
6333                          * list being retired while being allocated
6334                          * and before P_RAF is set on it. Such a page may
6335                          * end up marked as a high priority request instead
6336                          * of a low priority request.
6337                          * If a P_RAF page is not marked as a low priority
6338                          * request, change it to a low priority request.
6339                          */
6340                         page_capture_hash[index].num_pages[bp->pri]--;
6341                         bp->pri = PAGE_CAPTURE_PRIO(pp);
6342                         page_capture_hash[index].num_pages[bp->pri]++;
6343                         mutex_exit(&page_capture_hash[index].pchh_mutex);
6344                         return (1);
6345                 }
6346                 bp = bp->next;
6347         }
6348         mutex_exit(&page_capture_hash[index].pchh_mutex);
6349         return (0);
6350 }
6351 
6352 /*
6353  * Add a new entry to the page capture hash.  The only case where a new
6354  * entry is not added is when the page capture consumer is no longer registered.
6355  * In this case, we'll silently not add the page to the hash.  We know that
6356  * page retire will always be registered for the case where we are currently
6357  * unretiring a page and thus there are no conflicts.
6358  */
6359 static void
6360 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6361 {
6362         page_capture_hash_bucket_t *bp1;
6363         page_capture_hash_bucket_t *bp2;
6364         int index;
6365         int cb_index;
6366         int i;
6367         uchar_t pri;
6368 #ifdef DEBUG
6369         page_capture_hash_bucket_t *tp1;
6370         int l;
6371 #endif
6372 
6373         ASSERT(!(flags & CAPTURE_ASYNC));
6374 
6375         bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6376 
6377         bp1->pp = pp;
6378         bp1->szc = szc;
6379         bp1->flags = flags;
6380         bp1->datap = datap;
6381 
6382         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6383                 if ((flags >> cb_index) & 1) {
6384                         break;
6385                 }
6386         }
6387 
6388         ASSERT(cb_index != PC_NUM_CALLBACKS);
6389 
6390         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6391         if (pc_cb[cb_index].cb_active) {
6392                 if (pc_cb[cb_index].duration == -1) {
6393                         bp1->expires = (clock_t)-1;
6394                 } else {
6395                         bp1->expires = ddi_get_lbolt() +
6396                             pc_cb[cb_index].duration;
6397                 }
6398         } else {
6399                 /* There's no callback registered so don't add to the hash */
6400                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6401                 kmem_free(bp1, sizeof (*bp1));
6402                 return;
6403         }
6404 
6405         index = PAGE_CAPTURE_HASH(pp);
6406 
6407         /*
6408          * Only allow capture flag to be modified under this mutex.
6409          * Prevents multiple entries for the same page from getting added.
6410          */
6411         mutex_enter(&page_capture_hash[index].pchh_mutex);
6412 
6413         /*
6414          * if not already on the hash, set capture bit and add to the hash
6415          */
6416         if (!(pp->p_toxic & PR_CAPTURE)) {
6417 #ifdef DEBUG
6418                 /* Check for duplicate entries */
6419                 for (l = 0; l < 2; l++) {
6420                         tp1 = page_capture_hash[index].lists[l].next;
6421                         while (tp1 != &page_capture_hash[index].lists[l]) {
6422                                 if (tp1->pp == pp) {
6423                                         panic("page pp 0x%p already on hash "
6424                                             "at 0x%p\n",
6425                                             (void *)pp, (void *)tp1);
6426                                 }
6427                                 tp1 = tp1->next;
6428                         }
6429                 }
6430 
6431 #endif
6432                 page_settoxic(pp, PR_CAPTURE);
6433                 pri = PAGE_CAPTURE_PRIO(pp);
6434                 bp1->pri = pri;
6435                 bp1->next = page_capture_hash[index].lists[0].next;
6436                 bp1->prev = &page_capture_hash[index].lists[0];
6437                 bp1->next->prev = bp1;
6438                 page_capture_hash[index].lists[0].next = bp1;
6439                 page_capture_hash[index].num_pages[pri]++;
6440                 if (flags & CAPTURE_RETIRE) {
6441                         page_retire_incr_pend_count(datap);
6442                 }
6443                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6444                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6445                 cv_signal(&pc_cv);
6446                 return;
6447         }
6448 
6449         /*
6450          * A page retire request will replace any other request.
6451          * A second physmem request which is for a different process than
6452          * the currently registered one will be dropped as there is
6453          * no way to hold the private data for both calls.
6454          * In the future, once there are more callers, this will have to
6455          * be worked out better as there needs to be private storage for
6456          * at least each type of caller (maybe have datap be an array of
6457          * *void's so that we can index based upon callers index).
6458          */
6459 
6460         /* walk hash list to update expire time */
6461         for (i = 0; i < 2; i++) {
6462                 bp2 = page_capture_hash[index].lists[i].next;
6463                 while (bp2 != &page_capture_hash[index].lists[i]) {
6464                         if (bp2->pp == pp) {
6465                                 if (flags & CAPTURE_RETIRE) {
6466                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6467                                                 page_retire_incr_pend_count(
6468                                                     datap);
6469                                                 bp2->flags = flags;
6470                                                 bp2->expires = bp1->expires;
6471                                                 bp2->datap = datap;
6472                                         }
6473                                 } else {
6474                                         ASSERT(flags & CAPTURE_PHYSMEM);
6475                                         if (!(bp2->flags & CAPTURE_RETIRE) &&
6476                                             (datap == bp2->datap)) {
6477                                                 bp2->expires = bp1->expires;
6478                                         }
6479                                 }
6480                                 mutex_exit(&page_capture_hash[index].
6481                                     pchh_mutex);
6482                                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6483                                 kmem_free(bp1, sizeof (*bp1));
6484                                 return;
6485                         }
6486                         bp2 = bp2->next;
6487                 }
6488         }
6489 
6490         /*
6491          * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6492          * and thus it either has to be set or not set and can't change
6493          * while holding the mutex above.
6494          */
6495         panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6496             (void *)pp);
6497 }
6498 
6499 /*
6500  * We have a page in our hands, let's try to make it ours by turning
6501  * it into a clean page like it had just come off the freelists.
6502  *
6503  * Returns 0 on success, with the page still EXCL locked.
6504  * On failure, the page will be unlocked, and returns EAGAIN
6505  */
6506 static int
6507 page_capture_clean_page(page_t *pp)
6508 {
6509         page_t *newpp;
6510         int skip_unlock = 0;
6511         spgcnt_t count;
6512         page_t *tpp;
6513         int ret = 0;
6514         int extra;
6515 
6516         ASSERT(PAGE_EXCL(pp));
6517         ASSERT(!PP_RETIRED(pp));
6518         ASSERT(curthread->t_flag & T_CAPTURING);
6519 
6520         if (PP_ISFREE(pp)) {
6521                 if (!page_reclaim(pp, NULL)) {
6522                         skip_unlock = 1;
6523                         ret = EAGAIN;
6524                         goto cleanup;
6525                 }
6526                 ASSERT(pp->p_szc == 0);
6527                 if (pp->p_vnode != NULL) {
6528                         /*
6529                          * Since this page came from the
6530                          * cachelist, we must destroy the
6531                          * old vnode association.
6532                          */
6533                         page_hashout(pp, NULL);
6534                 }
6535                 goto cleanup;
6536         }
6537 
6538         /*
6539          * If we know page_relocate will fail, skip it
6540          * It could still fail due to a UE on another page but we
6541          * can't do anything about that.
6542          */
6543         if (pp->p_toxic & PR_UE) {
6544                 goto skip_relocate;
6545         }
6546 
6547         /*
6548          * It's possible for a page to have no vnode, as fsflush comes
6549          * through and cleans up these pages.  It's ugly, but that's how it is.
6550          */
6551         if (pp->p_vnode == NULL) {
6552                 goto skip_relocate;
6553         }
6554 
6555         /*
6556          * Page was not free, so let's try to relocate it.
6557          * page_relocate only works with root pages, so if this is not a root
6558          * page, we need to demote it to try and relocate it.
6559          * Unfortunately this is the best we can do right now.
6560          */
6561         newpp = NULL;
6562         if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6563                 if (page_try_demote_pages(pp) == 0) {
6564                         ret = EAGAIN;
6565                         goto cleanup;
6566                 }
6567         }
6568         ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6569         if (ret == 0) {
6570                 page_t *npp;
6571                 /* unlock the new page(s) */
6572                 while (count-- > 0) {
6573                         ASSERT(newpp != NULL);
6574                         npp = newpp;
6575                         page_sub(&newpp, npp);
6576                         page_unlock(npp);
6577                 }
6578                 ASSERT(newpp == NULL);
6579                 /*
6580                  * Check to see if the page we have is too large.
6581                  * If so, demote it freeing up the extra pages.
6582                  */
6583                 if (pp->p_szc > 0) {
6584                         /* For now demote extra pages to szc == 0 */
6585                         extra = page_get_pagecnt(pp->p_szc) - 1;
6586                         while (extra > 0) {
6587                                 tpp = pp->p_next;
6588                                 page_sub(&pp, tpp);
6589                                 tpp->p_szc = 0;
6590                                 page_free(tpp, 1);
6591                                 extra--;
6592                         }
6593                         /* Make sure to set our page to szc 0 as well */
6594                         ASSERT(pp->p_next == pp && pp->p_prev == pp);
6595                         pp->p_szc = 0;
6596                 }
6597                 goto cleanup;
6598         } else if (ret == EIO) {
6599                 ret = EAGAIN;
6600                 goto cleanup;
6601         } else {
6602                 /*
6603                  * Need to reset the return value as we failed to relocate the page
6604                  * but that does not mean that some of the next steps will not
6605                  * work.
6606                  */
6607                 ret = 0;
6608         }
6609 
6610 skip_relocate:
6611 
6612         if (pp->p_szc > 0) {
6613                 if (page_try_demote_pages(pp) == 0) {
6614                         ret = EAGAIN;
6615                         goto cleanup;
6616                 }
6617         }
6618 
6619         ASSERT(pp->p_szc == 0);
6620 
6621         if (hat_ismod(pp)) {
6622                 ret = EAGAIN;
6623                 goto cleanup;
6624         }
6625         if (PP_ISKAS(pp)) {
6626                 ret = EAGAIN;
6627                 goto cleanup;
6628         }
6629         if (pp->p_lckcnt || pp->p_cowcnt) {
6630                 ret = EAGAIN;
6631                 goto cleanup;
6632         }
6633 
6634         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6635         ASSERT(!hat_page_is_mapped(pp));
6636 
6637         if (hat_ismod(pp)) {
6638                 /*
6639                  * This is a semi-odd case as the page is now modified but not
6640                  * mapped as we just unloaded the mappings above.
6641                  */
6642                 ret = EAGAIN;
6643                 goto cleanup;
6644         }
6645         if (pp->p_vnode != NULL) {
6646                 page_hashout(pp, NULL);
6647         }
6648 
6649         /*
6650          * At this point, the page should be in a clean state and
6651          * we can do whatever we want with it.
6652          */
6653 
6654 cleanup:
6655         if (ret != 0) {
6656                 if (!skip_unlock) {
6657                         page_unlock(pp);
6658                 }
6659         } else {
6660                 ASSERT(pp->p_szc == 0);
6661                 ASSERT(PAGE_EXCL(pp));
6662 
6663                 pp->p_next = pp;
6664                 pp->p_prev = pp;
6665         }
6666         return (ret);
6667 }
6668 
6669 /*
6670  * Various callers of page_trycapture() can have different restrictions upon
6671  * what memory they have access to.
6672  * Returns 0 on success, with the following error codes on failure:
6673  *      EPERM - The requested page is long term locked, and thus repeated
6674  *              requests to capture this page will likely fail.
6675  *      ENOMEM - There was not enough free memory in the system to safely
6676  *              map the requested page.
6677  *      ENOENT - The requested page was inside the kernel cage, and the
6678  *              PHYSMEM_CAGE flag was not set.
6679  */
6680 int
6681 page_capture_pre_checks(page_t *pp, uint_t flags)
6682 {
6683         ASSERT(pp != NULL);
6684 
6685 #if defined(__sparc)
6686         if (pp->p_vnode == &promvp) {
6687                 return (EPERM);
6688         }
6689 
6690         if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6691             (flags & CAPTURE_PHYSMEM)) {
6692                 return (ENOENT);
6693         }
6694 
6695         if (PP_ISNORELOCKERNEL(pp)) {
6696                 return (EPERM);
6697         }
6698 #else
6699         if (PP_ISKAS(pp)) {
6700                 return (EPERM);
6701         }
6702 #endif /* __sparc */
6703 
6704         /* only physmem currently has the restrictions checked below */
6705         if (!(flags & CAPTURE_PHYSMEM)) {
6706                 return (0);
6707         }
6708 
6709         if (availrmem < swapfs_minfree) {
6710                 /*
6711                  * We won't try to capture this page as we are
6712                  * running low on memory.
6713                  */
6714                 return (ENOMEM);
6715         }
6716         return (0);
6717 }
6718 
6719 /*
6720  * Once we have a page in our mitts, go ahead and complete the capture
6721  * operation.
6722  * Returns 1 on failure where the page is no longer needed
6723  * Returns 0 on success
6724  * Returns -1 if there was a transient failure.
6725  * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6726  */
6727 int
6728 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6729 {
6730         int cb_index;
6731         int ret = 0;
6732         page_capture_hash_bucket_t *bp1;
6733         page_capture_hash_bucket_t *bp2;
6734         int index;
6735         int found = 0;
6736         int i;
6737 
6738         ASSERT(PAGE_EXCL(pp));
6739         ASSERT(curthread->t_flag & T_CAPTURING);
6740 
6741         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6742                 if ((flags >> cb_index) & 1) {
6743                         break;
6744                 }
6745         }
6746         ASSERT(cb_index < PC_NUM_CALLBACKS);
6747 
6748         /*
6749          * Remove the entry from the page_capture hash, but don't free it yet
6750          * as we may need to put it back.
6751          * Since we own the page at this point in time, we should find it
6752          * in the hash if this is an ASYNC call.  If we don't, it's likely
6753          * that the page_capture_async() thread decided that this request
6754          * had expired, in which case we just continue on.
6755          */
6756         if (flags & CAPTURE_ASYNC) {
6757 
6758                 index = PAGE_CAPTURE_HASH(pp);
6759 
6760                 mutex_enter(&page_capture_hash[index].pchh_mutex);
6761                 for (i = 0; i < 2 && !found; i++) {
6762                         bp1 = page_capture_hash[index].lists[i].next;
6763                         while (bp1 != &page_capture_hash[index].lists[i]) {
6764                                 if (bp1->pp == pp) {
6765                                         bp1->next->prev = bp1->prev;
6766                                         bp1->prev->next = bp1->next;
6767                                         page_capture_hash[index].
6768                                             num_pages[bp1->pri]--;
6769                                         page_clrtoxic(pp, PR_CAPTURE);
6770                                         found = 1;
6771                                         break;
6772                                 }
6773                                 bp1 = bp1->next;
6774                         }
6775                 }
6776                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6777         }
6778 
6779         /* Synchronize with the unregister func. */
6780         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6781         if (!pc_cb[cb_index].cb_active) {
6782                 page_free(pp, 1);
6783                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6784                 if (found) {
6785                         kmem_free(bp1, sizeof (*bp1));
6786                 }
6787                 return (1);
6788         }
6789 
6790         /*
6791          * We need to remove the entry from the page capture hash and turn off
6792          * the PR_CAPTURE bit before calling the callback.  We'll need to cache
6793          * the entry here, and then, based upon the return value, clean up
6794          * appropriately or re-add it to the hash, making sure that someone else
6795          * hasn't already done so.
6796          * It should be rare for the callback to fail and thus it's ok for
6797          * the failure path to be a bit complicated as the success path is
6798          * cleaner and the locking rules are easier to follow.
6799          */
6800 
6801         ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6802 
6803         rw_exit(&pc_cb[cb_index].cb_rwlock);
6804 
6805         /*
6806          * If this was an ASYNC request, we need to cleanup the hash if the
6807          * callback was successful or if the request was no longer valid.
6808          * For non-ASYNC requests, we return failure to map and the caller
6809          * will take care of adding the request to the hash.
6810          * Note also that the callback itself is responsible for the page
6811          * at this point in time in terms of locking ...  The most common
6812          * case for the failure path should just be a page_free.
6813          */
6814         if (ret >= 0) {
6815                 if (found) {
6816                         if (bp1->flags & CAPTURE_RETIRE) {
6817                                 page_retire_decr_pend_count(datap);
6818                         }
6819                         kmem_free(bp1, sizeof (*bp1));
6820                 }
6821                 return (ret);
6822         }
6823         if (!found) {
6824                 return (ret);
6825         }
6826 
6827         ASSERT(flags & CAPTURE_ASYNC);
6828 
6829         /*
6830          * Check for expiration time first as we can just free it up if it's
6831          * expired.
6832          */
6833         if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6834                 kmem_free(bp1, sizeof (*bp1));
6835                 return (ret);
6836         }
6837 
6838         /*
6839          * The callback failed and there used to be an entry in the hash for
6840          * this page, so we need to add it back to the hash.
6841          */
6842         mutex_enter(&page_capture_hash[index].pchh_mutex);
6843         if (!(pp->p_toxic & PR_CAPTURE)) {
6844                 /* just add bp1 back to head of walked list */
6845                 page_settoxic(pp, PR_CAPTURE);
6846                 bp1->next = page_capture_hash[index].lists[1].next;
6847                 bp1->prev = &page_capture_hash[index].lists[1];
6848                 bp1->next->prev = bp1;
6849                 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6850                 page_capture_hash[index].lists[1].next = bp1;
6851                 page_capture_hash[index].num_pages[bp1->pri]++;
6852                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6853                 return (ret);
6854         }
6855 
6856         /*
6857          * Otherwise there was a new capture request added to the list.
6858          * We need to make sure that our original data is represented if
6859          * appropriate.
6860          */
6861         for (i = 0; i < 2; i++) {
6862                 bp2 = page_capture_hash[index].lists[i].next;
6863                 while (bp2 != &page_capture_hash[index].lists[i]) {
6864                         if (bp2->pp == pp) {
6865                                 if (bp1->flags & CAPTURE_RETIRE) {
6866                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6867                                                 bp2->szc = bp1->szc;
6868                                                 bp2->flags = bp1->flags;
6869                                                 bp2->expires = bp1->expires;
6870                                                 bp2->datap = bp1->datap;
6871                                         }
6872                                 } else {
6873                                         ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6874                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6875                                                 bp2->szc = bp1->szc;
6876                                                 bp2->flags = bp1->flags;
6877                                                 bp2->expires = bp1->expires;
6878                                                 bp2->datap = bp1->datap;
6879                                         }
6880                                 }
6881                                 page_capture_hash[index].num_pages[bp2->pri]--;
6882                                 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6883                                 page_capture_hash[index].num_pages[bp2->pri]++;
6884                                 mutex_exit(&page_capture_hash[index].
6885                                     pchh_mutex);
6886                                 kmem_free(bp1, sizeof (*bp1));
6887                                 return (ret);
6888                         }
6889                         bp2 = bp2->next;
6890                 }
6891         }
6892         panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6893         /*NOTREACHED*/
6894 }
6895 
6896 /*
6897  * Try to capture the given page for the caller specified in the flags
6898  * parameter.  The page will either be captured and handed over to the
6899  * appropriate callback, or will be queued up in the page capture hash
6900  * to be captured asynchronously.
6901  * If the current request is due to an async capture, the page must be
6902  * exclusively locked before calling this function.
6903  * Currently szc must be 0 but in the future this should be expandable to
6904  * other page sizes.
6905  * Returns 0 on success, with the following error codes on failure:
6906  *      EPERM - The requested page is long term locked, and thus repeated
6907  *              requests to capture this page will likely fail.
6908  *      ENOMEM - There was not enough free memory in the system to safely
6909  *              map the requested page.
6910  *      ENOENT - The requested page was inside the kernel cage, and the
6911  *              CAPTURE_GET_CAGE flag was not set.
6912  *      EAGAIN - The requested page could not be captured at this point in
6913  *              time but future requests will likely work.
6914  *      EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6915  *              was not set.
6916  */
6917 int
6918 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6919 {
6920         int ret;
6921         int cb_index;
6922 
6923         if (flags & CAPTURE_ASYNC) {
6924                 ASSERT(PAGE_EXCL(pp));
6925                 goto async;
6926         }
6927 
6928         /* Make sure there's enough availrmem ... */
6929         ret = page_capture_pre_checks(pp, flags);
6930         if (ret != 0) {
6931                 return (ret);
6932         }
6933 
6934         if (!page_trylock(pp, SE_EXCL)) {
6935                 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6936                         if ((flags >> cb_index) & 1) {
6937                                 break;
6938                         }
6939                 }
6940                 ASSERT(cb_index < PC_NUM_CALLBACKS);
6941                 ret = EAGAIN;
6942                 /* Special case for retired pages */
6943                 if (PP_RETIRED(pp)) {
6944                         if (flags & CAPTURE_GET_RETIRED) {
6945                                 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6946                                         /*
6947                                          * Need to set capture bit and add to
6948                                          * hash so that the page will be
6949                                          * retired when freed.
6950                                          */
6951                                         page_capture_add_hash(pp, szc,
6952                                             CAPTURE_RETIRE, NULL);
6953                                         ret = 0;
6954                                         goto own_page;
6955                                 }
6956                         } else {
6957                                 return (EBUSY);
6958                         }
6959                 }
6960                 page_capture_add_hash(pp, szc, flags, datap);
6961                 return (ret);
6962         }
6963 
6964 async:
6965         ASSERT(PAGE_EXCL(pp));
6966 
6967         /* Need to check for physmem async requests that availrmem is sane */
6968         if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6969             (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6970             (availrmem < swapfs_minfree)) {
6971                 page_unlock(pp);
6972                 return (ENOMEM);
6973         }
6974 
6975         ret = page_capture_clean_page(pp);
6976 
6977         if (ret != 0) {
6978                 /* We failed to get the page, so lets add it to the hash */
6979                 if (!(flags & CAPTURE_ASYNC)) {
6980                         page_capture_add_hash(pp, szc, flags, datap);
6981                 }
6982                 return (ret);
6983         }
6984 
6985 own_page:
6986         ASSERT(PAGE_EXCL(pp));
6987         ASSERT(pp->p_szc == 0);
6988 
6989         /* Call the callback */
6990         ret = page_capture_take_action(pp, flags, datap);
6991 
6992         if (ret == 0) {
6993                 return (0);
6994         }
6995 
6996         /*
6997          * Note that in the failure cases from page_capture_take_action, the
6998          * EXCL lock will have already been dropped.
6999          */
7000         if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7001                 page_capture_add_hash(pp, szc, flags, datap);
7002         }
7003         return (EAGAIN);
7004 }
7005 
7006 int
7007 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7008 {
7009         int ret;
7010 
7011         curthread->t_flag |= T_CAPTURING;
7012         ret = page_itrycapture(pp, szc, flags, datap);
7013         curthread->t_flag &= ~T_CAPTURING; /* we know it's set; clear it */
7014         return (ret);
7015 }
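
/*
 * Illustrative sketch (not part of the build): a synchronous caller driving
 * page_trycapture() and reacting to the documented return values; the flag
 * and datap values are placeholders.
 *
 *	ret = page_trycapture(pp, 0, CAPTURE_PHYSMEM, my_datap);
 *
 *	ret == 0      -> the registered callback consumed the page
 *	ret == EAGAIN -> transient; the request is typically queued for retry
 *	otherwise     -> EPERM/ENOMEM/ENOENT/EBUSY, give up on this page
 */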
7016 
7017 /*
7018  * When unlocking a page which has the PR_CAPTURE bit set, this routine
7019  * gets called to try and capture the page.
7020  */
7021 void
7022 page_unlock_capture(page_t *pp)
7023 {
7024         page_capture_hash_bucket_t *bp;
7025         int index;
7026         int i;
7027         uint_t szc;
7028         uint_t flags = 0;
7029         void *datap;
7030         kmutex_t *mp;
7031         extern vnode_t retired_pages;
7032 
7033         /*
7034          * We need to protect against a possible deadlock here where we own
7035          * the vnode page hash mutex and want to acquire it again as there
7036          * are locations in the code where we unlock a page while holding
7037          * the mutex which can lead to the page being captured and eventually
7038          * end up here.  As we may be hashing out the old page and hashing into
7039          * the retire vnode, we need to make sure we don't own them.
7040          * Other callbacks that do hash operations also need to make sure that
7041          * before they hash in to a vnode they do not currently own the
7042          * vphm mutex otherwise there will be a panic.
7043          */
7044         if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7045                 page_unlock_nocapture(pp);
7046                 return;
7047         }
7048         if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7049                 page_unlock_nocapture(pp);
7050                 return;
7051         }
7052 
7053         index = PAGE_CAPTURE_HASH(pp);
7054 
7055         mp = &page_capture_hash[index].pchh_mutex;
7056         mutex_enter(mp);
7057         for (i = 0; i < 2; i++) {
7058                 bp = page_capture_hash[index].lists[i].next;
7059                 while (bp != &page_capture_hash[index].lists[i]) {
7060                         if (bp->pp == pp) {
7061                                 szc = bp->szc;
7062                                 flags = bp->flags | CAPTURE_ASYNC;
7063                                 datap = bp->datap;
7064                                 mutex_exit(mp);
7065                                 (void) page_trycapture(pp, szc, flags, datap);
7066                                 return;
7067                         }
7068                         bp = bp->next;
7069                 }
7070         }
7071 
7072         /* Failed to find page in hash so clear flags and unlock it. */
7073         page_clrtoxic(pp, PR_CAPTURE);
7074         page_unlock(pp);
7075 
7076         mutex_exit(mp);
7077 }
7078 
7079 void
7080 page_capture_init()
7081 {
7082         int i;
7083         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7084                 page_capture_hash[i].lists[0].next =
7085                     &page_capture_hash[i].lists[0];
7086                 page_capture_hash[i].lists[0].prev =
7087                     &page_capture_hash[i].lists[0];
7088                 page_capture_hash[i].lists[1].next =
7089                     &page_capture_hash[i].lists[1];
7090                 page_capture_hash[i].lists[1].prev =
7091                     &page_capture_hash[i].lists[1];
7092         }
7093 
7094         pc_thread_shortwait = 23 * hz;
7095         pc_thread_longwait = 1201 * hz;
7096         pc_thread_retry = 3;
7097         mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7098         cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7099         pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7100             TS_RUN, minclsyspri);
7101 }
7102 
7103 /*
7104  * It is necessary to scrub any failing pages prior to reboot in order to
7105  * prevent a latent error trap from occurring on the next boot.
7106  */
7107 void
7108 page_retire_mdboot()
7109 {
7110         page_t *pp;
7111         int i, j;
7112         page_capture_hash_bucket_t *bp;
7113         uchar_t pri;
7114 
7115         /* walk lists looking for pages to scrub */
7116         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7117                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7118                         if (page_capture_hash[i].num_pages[pri] != 0) {
7119                                 break;
7120                         }
7121                 }
7122                 if (pri == PC_NUM_PRI)
7123                         continue;
7124 
7125                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7126 
7127                 for (j = 0; j < 2; j++) {
7128                         bp = page_capture_hash[i].lists[j].next;
7129                         while (bp != &page_capture_hash[i].lists[j]) {
7130                                 pp = bp->pp;
7131                                 if (PP_TOXIC(pp)) {
7132                                         if (page_trylock(pp, SE_EXCL)) {
7133                                                 PP_CLRFREE(pp);
7134                                                 pagescrub(pp, 0, PAGESIZE);
7135                                                 page_unlock(pp);
7136                                         }
7137                                 }
7138                                 bp = bp->next;
7139                         }
7140                 }
7141                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7142         }
7143 }
7144 
7145 /*
7146  * Walk the page_capture_hash trying to capture pages and also cleanup old
7147  * entries which have expired.
7148  */
7149 void
7150 page_capture_async()
7151 {
7152         page_t *pp;
7153         int i;
7154         int ret;
7155         page_capture_hash_bucket_t *bp1, *bp2;
7156         uint_t szc;
7157         uint_t flags;
7158         void *datap;
7159         uchar_t pri;
7160 
7161         /* If there are outstanding pages to be captured, get to work */
7162         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7163                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7164                         if (page_capture_hash[i].num_pages[pri] != 0)
7165                                 break;
7166                 }
7167                 if (pri == PC_NUM_PRI)
7168                         continue;
7169 
7170                 /* Append list 1 to list 0 and then walk through list 0 */
7171                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7172                 bp1 = &page_capture_hash[i].lists[1];
7173                 bp2 = bp1->next;
7174                 if (bp1 != bp2) {
7175                         bp1->prev->next = page_capture_hash[i].lists[0].next;
7176                         bp2->prev = &page_capture_hash[i].lists[0];
7177                         page_capture_hash[i].lists[0].next->prev = bp1->prev;
7178                         page_capture_hash[i].lists[0].next = bp2;
7179                         bp1->next = bp1;
7180                         bp1->prev = bp1;
7181                 }
7182 
7183                 /* list[1] will be empty now */
7184 
7185                 bp1 = page_capture_hash[i].lists[0].next;
7186                 while (bp1 != &page_capture_hash[i].lists[0]) {
7187                         /* Check expiration time */
7188                         if ((ddi_get_lbolt() > bp1->expires &&
7189                             bp1->expires != -1) ||
7190                             page_deleted(bp1->pp)) {
7191                                 page_capture_hash[i].lists[0].next = bp1->next;
7192                                 bp1->next->prev =
7193                                     &page_capture_hash[i].lists[0];
7194                                 page_capture_hash[i].num_pages[bp1->pri]--;
7195 
7196                                 /*
7197                                  * We can safely remove the PR_CAPTURE bit
7198                                  * without holding the EXCL lock on the page
7199                                  * as the PR_CAPTURE bit requires that the
7200                                  * page_capture_hash[].pchh_mutex be held
7201                                  * to modify it.
7202                                  */
7203                                 page_clrtoxic(bp1->pp, PR_CAPTURE);
7204                                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7205                                 kmem_free(bp1, sizeof (*bp1));
7206                                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7207                                 bp1 = page_capture_hash[i].lists[0].next;
7208                                 continue;
7209                         }
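                             /*
                              * Snapshot the fields we need, then drop the
                              * hash lock for the capture attempt.  The list
                              * can change while the lock is dropped, which
                              * is why the walk restarts from the head once
                              * the lock is retaken below.
                              */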
7210                         pp = bp1->pp;
7211                         szc = bp1->szc;
7212                         flags = bp1->flags;
7213                         datap = bp1->datap;
7214                         mutex_exit(&page_capture_hash[i].pchh_mutex);
7215                         if (page_trylock(pp, SE_EXCL)) {
7216                                 ret = page_trycapture(pp, szc,
7217                                     flags | CAPTURE_ASYNC, datap);
7218                         } else {
7219                                 ret = 1;        /* move to walked hash */
7220                         }
7221 
7222                         if (ret != 0) {
7223                                 /* Move to walked hash */
7224                                 (void) page_capture_move_to_walked(pp);
7225                         }
7226                         mutex_enter(&page_capture_hash[i].pchh_mutex);
7227                         bp1 = page_capture_hash[i].lists[0].next;
7228                 }
7229 
7230                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7231         }
7232 }
7233 
7234 /*
7235  * This function is called by the page_capture_thread, and is needed
7236  * in order to initiate aio cleanup, so that pages used in aio
7237  * will be unlocked and subsequently retired by page_capture_thread.
7238  */
7239 static int
7240 do_aio_cleanup(void)
7241 {
7242         proc_t *procp;
7243         int (*aio_cleanup_dr_delete_memory)(proc_t *);
7244         int cleaned = 0;
7245 
7246         if (modload("sys", "kaio") == -1) {
7247                 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7248                 return (0);
7249         }
7250         /*
7251          * We use the aio_cleanup_dr_delete_memory function to
7252          * initiate the actual clean up; this function will wake
7253          * up the per-process aio_cleanup_thread.
7254          */
7255         aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7256             modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7257         if (aio_cleanup_dr_delete_memory == NULL) {
7258                 cmn_err(CE_WARN,
7259                     "aio_cleanup_dr_delete_memory not found in kaio");
7260                 return (0);
7261         }
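             /*
              * Walk every active process; pidlock keeps the practive list
              * stable while we iterate, and p_lock is held across the
              * p_aio check.
              */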
7262         mutex_enter(&pidlock);
7263         for (procp = practive; (procp != NULL); procp = procp->p_next) {
7264                 mutex_enter(&procp->p_lock);
7265                 if (procp->p_aio != NULL) {
7266                         /* cleanup proc's outstanding kaio */
7267                         cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7268                 }
7269                 mutex_exit(&procp->p_lock);
7270         }
7271         mutex_exit(&pidlock);
7272         return (cleaned);
7273 }
7274 
7275 /*
7276  * helper function for page_capture_thread
7277  */
7278 static void
7279 page_capture_handle_outstanding(void)
7280 {
7281         int ntry;
7282 
7283         /* Reap pages before attempting to capture pages */
7284         kmem_reap();
7285 
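             /*
              * If some of the pending retirements are not kernel (kas)
              * pages and the HAT supports dynamic ISM unload, purge the
              * seg_pcache so that SE_SHARED locks on ISM pages do not
              * block the captures.
              */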
7286         if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7287             hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7288                 /*
7289                  * Note: Purging is done only on platforms that support
7290                  * ISM hat_pageunload() - mainly SPARC. On x86/x64
7291                  * platforms ISM pages are SE_SHARED locked until destroyed.
7292                  */
7293 
7294                 /* disable and purge seg_pcache */
7295                 (void) seg_p_disable();
7296                 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7297                         if (!page_retire_pend_count())
7298                                 break;
7299                         if (do_aio_cleanup()) {
7300                                 /*
7301                                  * allow the processes' aio cleanup
7302                                  * threads to run
7303                                  */
7304                                 delay(pc_thread_shortwait);
7305                         }
7306                         page_capture_async();
7307                 }
7308                 /* reenable seg_pcache */
7309                 seg_p_enable();
7310 
7311                 /* completed what can be done; return */
7312                 return;
7313         }
7314 
7315         /*
7316          * For kernel pages, or when HAT_DYNAMIC_ISM_UNMAP is not
7317          * supported, just reap and then attempt to capture.
7318          */
7319         seg_preap();
7320         page_capture_async();
7321 }
7322 
7323 /*
7324  * The page_capture_thread loops forever, looking to see if there are
7325  * pages still waiting to be captured.
7326  */
7327 static void
7328 page_capture_thread(void)
7329 {
7330         callb_cpr_t c;
7331         int i;
7332         int high_pri_pages;
7333         int low_pri_pages;
7334         clock_t timeout;
7335 
7336         CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7337 
7338         mutex_enter(&pc_thread_mutex);
7339         for (;;) {
7340                 high_pri_pages = 0;
7341                 low_pri_pages = 0;
7342                 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7343                         high_pri_pages +=
7344                             page_capture_hash[i].num_pages[PC_PRI_HI];
7345                         low_pri_pages +=
7346                             page_capture_hash[i].num_pages[PC_PRI_LO];
7347                 }
7348 
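                     /*
                      * High priority pages pending: handle them aggressively
                      * and poll again after a short wait.  Only low priority
                      * pages pending: make a single async pass.  Nothing
                      * pending: sleep for the long interval or until woken.
                      */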
7349                 timeout = pc_thread_longwait;
7350                 if (high_pri_pages != 0) {
7351                         timeout = pc_thread_shortwait;
7352                         page_capture_handle_outstanding();
7353                 } else if (low_pri_pages != 0) {
7354                         page_capture_async();
7355                 }
7356                 CALLB_CPR_SAFE_BEGIN(&c);
7357                 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7358                     timeout, TR_CLOCK_TICK);
7359                 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7360         }
7361         /*NOTREACHED*/
7362 }
7363 /*
7364  * Attempt to locate a single pcf bucket that has enough pages to satisfy
7365  * the request.  The initial check is done without the lock to avoid
7366  * unneeded contention.  The function returns 1 if enough pages were found,
7367  * else 0 if no single bucket could satisfy the request.
7368  */
7369 static int
7370 pcf_decrement_bucket(pgcnt_t npages)
7371 {
7372         struct pcf      *p;
7373         struct pcf      *q;
7374         int i;
7375 
7376         p = &pcf[PCF_INDEX()];
7377         q = &pcf[pcf_fanout];
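             /*
              * Start at a bucket chosen by PCF_INDEX() (typically keyed to
              * the current CPU) to spread contention, and wrap through all
              * pcf_fanout buckets.  Each count is peeked without the lock
              * and rechecked under pcf_lock before it is decremented.
              */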
7378         for (i = 0; i < pcf_fanout; i++) {
7379                 if (p->pcf_count > npages) {
7380                         /*
7381                          * a good one to try.
7382                          */
7383                         mutex_enter(&p->pcf_lock);
7384                         if (p->pcf_count > npages) {
7385                                 p->pcf_count -= (uint_t)npages;
7386                                 /*
7387                                  * freemem is not protected by any lock.
7388                                  * Thus, we cannot have any assertion
7389                                  * containing freemem here.
7390                                  */
7391                                 freemem -= npages;
7392                                 mutex_exit(&p->pcf_lock);
7393                                 return (1);
7394                         }
7395                         mutex_exit(&p->pcf_lock);
7396                 }
7397                 p++;
7398                 if (p >= q) {
7399                         p = pcf;
7400                 }
7401         }
7402         return (0);
7403 }
7404 
7405 /*
7406  * Arguments:
7407  *      pcftotal_ret:   If the value is not NULL and we have walked all the
7408  *                      buckets but did not find enough pages then it will
7409  *                      be set to the total number of pages in all the pcf
7410  *                      buckets.
7411  *      npages:         The number of pages we have been requested to
7412  *                      find.
7413  *      unlock:         If set to 0 we will leave the buckets locked if the
7414  *                      requested number of pages is not found.
7415  *
7416  * Go and try to satisfy the page request from any number of buckets.
7417  * This can be a very expensive operation as we have to lock the buckets
7418  * we are checking (and keep them locked), starting at bucket 0.
7419  *
7420  * The function returns 1 if enough pages were found, else 0 if it could not
7421  * find enough pages in the buckets.
7422  *
7423  */
7424 static int
7425 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7426 {
7427         struct pcf      *p;
7428         pgcnt_t pcftotal;
7429         int i;
7430 
7431         p = pcf;
7432         /* try to collect pages from several pcf bins */
7433         for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7434                 mutex_enter(&p->pcf_lock);
7435                 pcftotal += p->pcf_count;
7436                 if (pcftotal >= npages) {
7437                         /*
7438                          * Wow!  There are enough pages lying around
7439                          * to satisfy the request.  Do the accounting,
7440                          * drop the locks we acquired, and go back.
7441                          *
7442                          * freemem is not protected by any lock. So,
7443                          * we cannot have any assertion containing
7444                          * freemem.
7445                          */
7446                         freemem -= npages;
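                             /*
                              * Walk back toward pcf[0], draining each locked
                              * bucket until the request is satisfied and
                              * dropping each bucket's lock along the way.
                              */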
7447                         while (p >= pcf) {
7448                                 if (p->pcf_count <= npages) {
7449                                         npages -= p->pcf_count;
7450                                         p->pcf_count = 0;
7451                                 } else {
7452                                         p->pcf_count -= (uint_t)npages;
7453                                         npages = 0;
7454                                 }
7455                                 mutex_exit(&p->pcf_lock);
7456                                 p--;
7457                         }
7458                         ASSERT(npages == 0);
7459                         return (1);
7460                 }
7461                 p++;
7462         }
7463         if (unlock) {
7464                 /* failed to collect pages - release the locks */
7465                 while (--p >= pcf) {
7466                         mutex_exit(&p->pcf_lock);
7467                 }
7468         }
7469         if (pcftotal_ret != NULL)
7470                 *pcftotal_ret = pcftotal;
7471         return (0);
7472 }