1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
  24  * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
  25  * Copyright 2018 Joyent, Inc.
  26  */
  27 
  28 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T */
  29 /* All Rights Reserved */
  30 
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40 
  41 /*
  42  * VM - physical page management.
  43  */
  44 
  45 #include <sys/types.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/errno.h>
  50 #include <sys/time.h>
  51 #include <sys/vnode.h>
  52 #include <sys/vm.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/swap.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/tuneable.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/cpuvar.h>
  59 #include <sys/callb.h>
  60 #include <sys/debug.h>
  61 #include <sys/tnf_probe.h>
  62 #include <sys/condvar_impl.h>
  63 #include <sys/mem_config.h>
  64 #include <sys/mem_cage.h>
  65 #include <sys/kmem.h>
  66 #include <sys/atomic.h>
  67 #include <sys/strlog.h>
  68 #include <sys/mman.h>
  69 #include <sys/ontrap.h>
  70 #include <sys/lgrp.h>
  71 #include <sys/vfs.h>
  72 
  73 #include <vm/hat.h>
  74 #include <vm/anon.h>
  75 #include <vm/page.h>
  76 #include <vm/seg.h>
  77 #include <vm/pvn.h>
  78 #include <vm/seg_kmem.h>
  79 #include <vm/vm_dep.h>
  80 #include <sys/vm_usage.h>
  81 #include <fs/fs_subr.h>
  82 #include <sys/ddi.h>
  83 #include <sys/modctl.h>
  84 
  85 static pgcnt_t max_page_get;    /* max page_get request size in pages */
  86 pgcnt_t total_pages = 0;        /* total number of pages (used by /proc) */
  87 uint64_t n_throttle = 0;        /* num times page create throttled */
  88 
  89 /*
   90  * freemem_lock protects the freemem variables, chiefly
   91  * availrmem.  This lock also protects the globals which track the
   92  * availrmem changes for accurate kernel footprint calculation.
  93  * See below for an explanation of these
  94  * globals.
  95  */
  96 kmutex_t freemem_lock;
  97 pgcnt_t availrmem;
  98 pgcnt_t availrmem_initial;
  99 
 100 /*
 101  * These globals track availrmem changes to get a more accurate
  102  * estimate of the kernel size. Historically pp_kernel is used for
 103  * kernel size and is based on availrmem. But availrmem is adjusted for
 104  * locked pages in the system not just for kernel locked pages.
 105  * These new counters will track the pages locked through segvn and
 106  * by explicit user locking.
 107  *
 108  * pages_locked : How many pages are locked because of user specified
 109  * locking through mlock or plock.
 110  *
  111  * pages_useclaim, pages_claimed : These two variables track the
 112  * claim adjustments because of the protection changes on a segvn segment.
 113  *
 114  * All these globals are protected by the same lock which protects availrmem.
 115  */
 116 pgcnt_t pages_locked = 0;
 117 pgcnt_t pages_useclaim = 0;
 118 pgcnt_t pages_claimed = 0;
 119 
 120 
 121 /*
 122  * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 123  */
 124 static kmutex_t new_freemem_lock;
 125 static uint_t   freemem_wait;   /* someone waiting for freemem */
 126 static kcondvar_t freemem_cv;
 127 
 128 /*
 129  * The logical page free list is maintained as two lists, the 'free'
 130  * and the 'cache' lists.
 131  * The free list contains those pages that should be reused first.
 132  *
 133  * The implementation of the lists is machine dependent.
 134  * page_get_freelist(), page_get_cachelist(),
 135  * page_list_sub(), and page_list_add()
 136  * form the interface to the machine dependent implementation.
 137  *
 138  * Pages with p_free set are on the cache list.
  139  * Pages with p_free and p_age set are on the free list.
 140  *
 141  * A page may be locked while on either list.
 142  */
 143 
 144 /*
 145  * free list accounting stuff.
 146  *
 147  *
 148  * Spread out the value for the number of pages on the
 149  * page free and page cache lists.  If there is just one
 150  * value, then it must be under just one lock.
 151  * The lock contention and cache traffic are a real bother.
 152  *
 153  * When we acquire and then drop a single pcf lock
 154  * we can start in the middle of the array of pcf structures.
 155  * If we acquire more than one pcf lock at a time, we need to
 156  * start at the front to avoid deadlocking.
 157  *
 158  * pcf_count holds the number of pages in each pool.
 159  *
 160  * pcf_block is set when page_create_get_something() has asked the
 161  * PSM page freelist and page cachelist routines without specifying
 162  * a color and nothing came back.  This is used to block anything
 163  * else from moving pages from one list to the other while the
  164  * lists are searched again.  If a page is freed while pcf_block is
  165  * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
  166  * of clearing pcf_block, doing the wakeups, etc.
 167  */
 168 
 169 #define MAX_PCF_FANOUT NCPU
 170 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
 171 static uint_t pcf_fanout_mask = 0;
 172 
 173 struct pcf {
 174         kmutex_t        pcf_lock;       /* protects the structure */
 175         uint_t          pcf_count;      /* page count */
 176         uint_t          pcf_wait;       /* number of waiters */
 177         uint_t          pcf_block;      /* pcgs flag to page_free() */
 178         uint_t          pcf_reserve;    /* pages freed after pcf_block set */
 179         uint_t          pcf_fill[10];   /* to line up on the caches */
 180 };
 181 
 182 /*
  183  * The PCF_INDEX hash needs to be dynamic (every so often the hash changes
  184  * which bucket a cpu hashes to).  This is done to prevent a drain condition,
  185  * which occurs when pcf_count decrements always happen on cpu A while the
  186  * pcf_count increments always happen on cpu B.  An
 187  * example of this shows up with device interrupts.  The dma buffer is allocated
 188  * by the cpu requesting the IO thus the pcf_count is decremented based on that.
 189  * When the memory is returned by the interrupt thread, the pcf_count will be
 190  * incremented based on the cpu servicing the interrupt.
 191  */
 192 static struct pcf pcf[MAX_PCF_FANOUT];
 193 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
 194         (randtick() >> 24)) & (pcf_fanout_mask))
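
/*
 * Added note (illustrative, not from the original source): with
 * pcf_fanout == 8 (pcf_fanout_mask == 7), a cpu with cpu_seqid 5 maps to
 * bucket (5 + (randtick() >> 24)) & 7.  Because only the high-order bits of
 * randtick() are used, the cpu-to-bucket mapping drifts only occasionally
 * rather than changing on every call, which keeps the hash "dynamic"
 * without defeating per-cpu locality.
 */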
 195 
 196 static int pcf_decrement_bucket(pgcnt_t);
 197 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
 198 
 199 kmutex_t        pcgs_lock;              /* serializes page_create_get_ */
 200 kmutex_t        pcgs_cagelock;          /* serializes NOSLEEP cage allocs */
 201 kmutex_t        pcgs_wait_lock;         /* used for delay in pcgs */
 202 static kcondvar_t       pcgs_cv;        /* cv for delay in pcgs */
 203 
 204 #ifdef VM_STATS
 205 
 206 /*
 207  * No locks, but so what, they are only statistics.
 208  */
 209 
 210 static struct page_tcnt {
 211         int     pc_free_cache;          /* free's into cache list */
 212         int     pc_free_dontneed;       /* free's with dontneed */
 213         int     pc_free_pageout;        /* free's from pageout */
 214         int     pc_free_free;           /* free's into free list */
 215         int     pc_free_pages;          /* free's into large page free list */
 216         int     pc_destroy_pages;       /* large page destroy's */
 217         int     pc_get_cache;           /* get's from cache list */
 218         int     pc_get_free;            /* get's from free list */
 219         int     pc_reclaim;             /* reclaim's */
 220         int     pc_abortfree;           /* abort's of free pages */
 221         int     pc_find_hit;            /* find's that find page */
 222         int     pc_find_miss;           /* find's that don't find page */
 223         int     pc_destroy_free;        /* # of free pages destroyed */
 224 #define PC_HASH_CNT     (4*PAGE_HASHAVELEN)
 225         int     pc_find_hashlen[PC_HASH_CNT+1];
 226         int     pc_addclaim_pages;
 227         int     pc_subclaim_pages;
 228         int     pc_free_replacement_page[2];
 229         int     pc_try_demote_pages[6];
 230         int     pc_demote_pages[2];
 231 } pagecnt;
 232 
 233 uint_t  hashin_count;
 234 uint_t  hashin_not_held;
 235 uint_t  hashin_already;
 236 
 237 uint_t  hashout_count;
 238 uint_t  hashout_not_held;
 239 
 240 uint_t  page_create_count;
 241 uint_t  page_create_not_enough;
 242 uint_t  page_create_not_enough_again;
 243 uint_t  page_create_zero;
 244 uint_t  page_create_hashout;
 245 uint_t  page_create_page_lock_failed;
 246 uint_t  page_create_trylock_failed;
 247 uint_t  page_create_found_one;
 248 uint_t  page_create_hashin_failed;
 249 uint_t  page_create_dropped_phm;
 250 
 251 uint_t  page_create_new;
 252 uint_t  page_create_exists;
 253 uint_t  page_create_putbacks;
 254 uint_t  page_create_overshoot;
 255 
 256 uint_t  page_reclaim_zero;
 257 uint_t  page_reclaim_zero_locked;
 258 
 259 uint_t  page_rename_exists;
 260 uint_t  page_rename_count;
 261 
 262 uint_t  page_lookup_cnt[20];
 263 uint_t  page_lookup_nowait_cnt[10];
 264 uint_t  page_find_cnt;
 265 uint_t  page_exists_cnt;
 266 uint_t  page_exists_forreal_cnt;
 267 uint_t  page_lookup_dev_cnt;
 268 uint_t  get_cachelist_cnt;
 269 uint_t  page_create_cnt[10];
 270 uint_t  alloc_pages[9];
 271 uint_t  page_exphcontg[19];
 272 uint_t  page_create_large_cnt[10];
 273 
 274 #endif
 275 
 276 static inline page_t *
 277 page_hash_search(ulong_t index, vnode_t *vnode, u_offset_t off)
 278 {
 279         uint_t mylen = 0;
 280         page_t *page;
 281 
 282         for (page = page_hash[index]; page; page = page->p_hash, mylen++)
 283                 if (page->p_vnode == vnode && page->p_offset == off)
 284                         break;
 285 
 286 #ifdef  VM_STATS
 287         if (page != NULL)
 288                 pagecnt.pc_find_hit++;
 289         else
 290                 pagecnt.pc_find_miss++;
 291 
 292         pagecnt.pc_find_hashlen[MIN(mylen, PC_HASH_CNT)]++;
 293 #endif
 294 
 295         return (page);
 296 }
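
/*
 * Added note: page_hash_search() itself takes no locks.  Callers that need a
 * reliable answer either hold the PAGE_HASH_MUTEX for the bucket or must
 * re-verify p_vnode and p_offset after locking the returned page, since the
 * identity can change between an unlocked search and the lock acquisition
 * (see the reconfirmation comment in page_lookup_create()).  Hint-only
 * callers such as page_exists() do neither.
 */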
 297 
 298 
 299 #ifdef DEBUG
 300 #define MEMSEG_SEARCH_STATS
 301 #endif
 302 
 303 #ifdef MEMSEG_SEARCH_STATS
 304 struct memseg_stats {
 305     uint_t nsearch;
 306     uint_t nlastwon;
 307     uint_t nhashwon;
 308     uint_t nnotfound;
 309 } memseg_stats;
 310 
 311 #define MEMSEG_STAT_INCR(v) \
 312         atomic_inc_32(&memseg_stats.v)
 313 #else
 314 #define MEMSEG_STAT_INCR(x)
 315 #endif
 316 
 317 struct memseg *memsegs;         /* list of memory segments */
 318 
 319 /*
  320  * /etc/system tunable to control the large page allocation heuristic.
 321  *
 322  * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
 323  * for large page allocation requests.  If a large page is not readily
  324  * available on the local freelists we will go through additional effort
 325  * to create a large page, potentially moving smaller pages around to coalesce
 326  * larger pages in the local lgroup.
 327  * Default value of LPAP_DEFAULT will go to remote freelists if large pages
 328  * are not readily available in the local lgroup.
 329  */
 330 enum lpap {
 331         LPAP_DEFAULT,   /* default large page allocation policy */
 332         LPAP_LOCAL      /* local large page allocation policy */
 333 };
 334 
 335 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
 336 
 337 static void page_init_mem_config(void);
 338 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
 339 static void page_do_hashout(page_t *);
 340 static void page_capture_init();
 341 int page_capture_take_action(page_t *, uint_t, void *);
 342 
 343 static void page_demote_vp_pages(page_t *);
 344 
 345 
 346 void
 347 pcf_init(void)
 348 {
 349         if (boot_ncpus != -1) {
 350                 pcf_fanout = boot_ncpus;
 351         } else {
 352                 pcf_fanout = max_ncpus;
 353         }
 354 #ifdef sun4v
 355         /*
 356          * Force at least 4 buckets if possible for sun4v.
 357          */
 358         pcf_fanout = MAX(pcf_fanout, 4);
 359 #endif /* sun4v */
 360 
 361         /*
  362          * Clamp to MAX_PCF_FANOUT, then round up to a power of 2.
 363          */
 364         pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
 365         if (!ISP2(pcf_fanout)) {
 366                 pcf_fanout = 1 << highbit(pcf_fanout);
 367 
 368                 if (pcf_fanout > MAX_PCF_FANOUT) {
 369                         pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
 370                 }
 371         }
 372         pcf_fanout_mask = pcf_fanout - 1;
 373 }
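
/*
 * Worked example (illustrative, assuming NCPU >= 8): when booted with
 * boot_ncpus == 6, pcf_fanout starts at 6, stays under MAX_PCF_FANOUT, and
 * is rounded up to the next power of two, 8, so pcf_fanout_mask == 7.
 */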
 374 
 375 /*
 376  * vm subsystem related initialization
 377  */
 378 void
 379 vm_init(void)
 380 {
 381         boolean_t callb_vm_cpr(void *, int);
 382 
 383         (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
 384         page_init_mem_config();
 385         page_retire_init();
 386         vm_usage_init();
 387         page_capture_init();
 388 }
 389 
 390 /*
 391  * This function is called at startup and when memory is added or deleted.
 392  */
 393 void
 394 init_pages_pp_maximum()
 395 {
 396         static pgcnt_t p_min;
 397         static pgcnt_t pages_pp_maximum_startup;
 398         static pgcnt_t avrmem_delta;
 399         static int init_done;
 400         static int user_set;    /* true if set in /etc/system */
 401 
 402         if (init_done == 0) {
 403 
 404                 /* If the user specified a value, save it */
 405                 if (pages_pp_maximum != 0) {
 406                         user_set = 1;
 407                         pages_pp_maximum_startup = pages_pp_maximum;
 408                 }
 409 
 410                 /*
  411                  * The first time through, pages_pp_maximum is based
  412                  * on the value of availrmem just after the start-up
 413                  * allocations. To preserve this relationship at run
 414                  * time, use a delta from availrmem_initial.
 415                  */
 416                 ASSERT(availrmem_initial >= availrmem);
 417                 avrmem_delta = availrmem_initial - availrmem;
 418 
 419                 /* The allowable floor of pages_pp_maximum */
 420                 p_min = tune.t_minarmem + 100;
 421 
 422                 /* Make sure we don't come through here again. */
 423                 init_done = 1;
 424         }
 425         /*
 426          * Determine pages_pp_maximum, the number of currently available
 427          * pages (availrmem) that can't be `locked'. If not set by
 428          * the user, we set it to 4% of the currently available memory
 429          * plus 4MB.
 430          * But we also insist that it be greater than tune.t_minarmem;
 431          * otherwise a process could lock down a lot of memory, get swapped
 432          * out, and never have enough to get swapped back in.
 433          */
 434         if (user_set)
 435                 pages_pp_maximum = pages_pp_maximum_startup;
 436         else
 437                 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
 438                     + btop(4 * 1024 * 1024);
 439 
 440         if (pages_pp_maximum <= p_min) {
 441                 pages_pp_maximum = p_min;
 442         }
 443 }
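
/*
 * Worked example (illustrative, assuming 4 KB pages and no /etc/system
 * override): with roughly 1,000,000 pages available after the startup
 * allocations, pages_pp_maximum = 1,000,000 / 25 + btop(4MB)
 * = 40,000 + 1,024 = 41,024 pages, i.e. about 4% of memory plus 4MB.
 */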
 444 
 445 /*
  446  * In the past, we limited the maximum number of pages that could be allocated
  447  * to essentially 1/2 of the total pages on the system. However, this is too
  448  * conservative for some cases, for example hosting a large virtual machine
  449  * that needs to use a significant portion of the system's memory. In practice,
 450  * allowing more than 1/2 of the total pages is fine, but becomes problematic
 451  * as we approach or exceed 75% of the pages on the system. Thus, we limit the
 452  * maximum to 23/32 of the total pages, which is ~72%.
 453  */
 454 void
 455 set_max_page_get(pgcnt_t target_total_pages)
 456 {
 457         max_page_get = (target_total_pages >> 5) * 23;
 458         ASSERT3U(max_page_get, >, 0);
 459 }
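
/*
 * Worked example (illustrative): with target_total_pages == 1,048,576
 * (4 GB of 4 KB pages), max_page_get = (1,048,576 >> 5) * 23 = 753,664,
 * which is exactly 23/32, or about 71.9%, of the pages on the system.
 */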
 460 
 461 pgcnt_t
 462 get_max_page_get()
 463 {
 464         return (max_page_get);
 465 }
 466 
 467 static pgcnt_t pending_delete;
 468 
 469 /*ARGSUSED*/
 470 static void
 471 page_mem_config_post_add(
 472         void *arg,
 473         pgcnt_t delta_pages)
 474 {
 475         set_max_page_get(total_pages - pending_delete);
 476         init_pages_pp_maximum();
 477 }
 478 
 479 /*ARGSUSED*/
 480 static int
 481 page_mem_config_pre_del(
 482         void *arg,
 483         pgcnt_t delta_pages)
 484 {
 485         pgcnt_t nv;
 486 
 487         nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
 488         set_max_page_get(total_pages - nv);
 489         return (0);
 490 }
 491 
 492 /*ARGSUSED*/
 493 static void
 494 page_mem_config_post_del(
 495         void *arg,
 496         pgcnt_t delta_pages,
 497         int cancelled)
 498 {
 499         pgcnt_t nv;
 500 
 501         nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
 502         set_max_page_get(total_pages - nv);
 503         if (!cancelled)
 504                 init_pages_pp_maximum();
 505 }
 506 
 507 static kphysm_setup_vector_t page_mem_config_vec = {
 508         KPHYSM_SETUP_VECTOR_VERSION,
 509         page_mem_config_post_add,
 510         page_mem_config_pre_del,
 511         page_mem_config_post_del,
 512 };
 513 
 514 static void
 515 page_init_mem_config(void)
 516 {
 517         int ret;
 518 
 519         ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
 520         ASSERT(ret == 0);
 521 }
 522 
 523 /*
 524  * Evenly spread out the PCF counters for large free pages
 525  */
 526 static void
 527 page_free_large_ctr(pgcnt_t npages)
 528 {
 529         static struct pcf       *p = pcf;
 530         pgcnt_t                 lump;
 531 
 532         freemem += npages;
 533 
 534         lump = roundup(npages, pcf_fanout) / pcf_fanout;
 535 
 536         while (npages > 0) {
 537 
 538                 ASSERT(!p->pcf_block);
 539 
 540                 if (lump < npages) {
 541                         p->pcf_count += (uint_t)lump;
 542                         npages -= lump;
 543                 } else {
 544                         p->pcf_count += (uint_t)npages;
 545                         npages = 0;
 546                 }
 547 
 548                 ASSERT(!p->pcf_wait);
 549 
 550                 if (++p > &pcf[pcf_fanout - 1])
 551                         p = pcf;
 552         }
 553 
 554         ASSERT(npages == 0);
 555 }
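
/*
 * Added note (illustrative): with pcf_fanout == 8, freeing a 100-page chunk
 * gives lump = roundup(100, 8) / 8 = 13, so seven buckets receive 13 pages
 * and the last receives the remaining 9.  Because the starting pointer is
 * static, successive calls begin at different buckets and no single bucket
 * accumulates all of the large-page frees.
 */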
 556 
 557 /*
 558  * Add a physical chunk of memory to the system free lists during startup.
 559  * Platform specific startup() allocates the memory for the page structs.
 560  *
 561  * num  - number of page structures
 562  * base - page number (pfn) to be associated with the first page.
 563  *
  564  * Since we are doing this during startup (i.e., single threaded), we will
 565  * use shortcut routines to avoid any locking overhead while putting all
 566  * these pages on the freelists.
 567  *
 568  * NOTE: Any changes performed to page_free(), must also be performed to
 569  *       add_physmem() since this is how we initialize all page_t's at
 570  *       boot time.
 571  */
 572 void
 573 add_physmem(
 574         page_t  *pp,
 575         pgcnt_t num,
 576         pfn_t   pnum)
 577 {
 578         page_t  *root = NULL;
 579         uint_t  szc = page_num_pagesizes() - 1;
 580         pgcnt_t large = page_get_pagecnt(szc);
 581         pgcnt_t cnt = 0;
 582 
 583         TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
 584             "add_physmem:pp %p num %lu", pp, num);
 585 
 586         /*
  587          * Limit the max page_get request to a fraction (23/32)
  588          * of the page structs we have; see set_max_page_get().
 589          */
 590         total_pages += num;
 591         set_max_page_get(total_pages);
 592 
 593         PLCNT_MODIFY_MAX(pnum, (long)num);
 594 
 595         /*
 596          * The physical space for the pages array
 597          * representing ram pages has already been
 598          * allocated.  Here we initialize each lock
 599          * in the page structure, and put each on
 600          * the free list
 601          */
 602         for (; num; pp++, pnum++, num--) {
 603 
 604                 /*
 605                  * this needs to fill in the page number
 606                  * and do any other arch specific initialization
 607                  */
 608                 add_physmem_cb(pp, pnum);
 609 
 610                 pp->p_lckcnt = 0;
 611                 pp->p_cowcnt = 0;
 612                 pp->p_slckcnt = 0;
 613 
 614                 /*
 615                  * Initialize the page lock as unlocked, since nobody
 616                  * can see or access this page yet.
 617                  */
 618                 pp->p_selock = 0;
 619 
 620                 /*
 621                  * Initialize IO lock
 622                  */
 623                 page_iolock_init(pp);
 624 
 625                 /*
 626                  * initialize other fields in the page_t
 627                  */
 628                 PP_SETFREE(pp);
 629                 page_clr_all_props(pp);
 630                 PP_SETAGED(pp);
 631                 pp->p_offset = (u_offset_t)-1;
 632                 pp->p_next = pp;
 633                 pp->p_prev = pp;
 634 
 635                 /*
 636                  * Simple case: System doesn't support large pages.
 637                  */
 638                 if (szc == 0) {
 639                         pp->p_szc = 0;
 640                         page_free_at_startup(pp);
 641                         continue;
 642                 }
 643 
 644                 /*
 645                  * Handle unaligned pages, we collect them up onto
 646                  * the root page until we have a full large page.
 647                  */
 648                 if (!IS_P2ALIGNED(pnum, large)) {
 649 
 650                         /*
 651                          * If not in a large page,
 652                          * just free as small page.
 653                          */
 654                         if (root == NULL) {
 655                                 pp->p_szc = 0;
 656                                 page_free_at_startup(pp);
 657                                 continue;
 658                         }
 659 
 660                         /*
 661                          * Link a constituent page into the large page.
 662                          */
 663                         pp->p_szc = szc;
 664                         page_list_concat(&root, &pp);
 665 
 666                         /*
 667                          * When large page is fully formed, free it.
 668                          */
 669                         if (++cnt == large) {
 670                                 page_free_large_ctr(cnt);
 671                                 page_list_add_pages(root, PG_LIST_ISINIT);
 672                                 root = NULL;
 673                                 cnt = 0;
 674                         }
 675                         continue;
 676                 }
 677 
 678                 /*
 679                  * At this point we have a page number which
 680                  * is aligned. We assert that we aren't already
 681                  * in a different large page.
 682                  */
 683                 ASSERT(IS_P2ALIGNED(pnum, large));
 684                 ASSERT(root == NULL && cnt == 0);
 685 
 686                 /*
 687                  * If insufficient number of pages left to form
 688                  * a large page, just free the small page.
 689                  */
 690                 if (num < large) {
 691                         pp->p_szc = 0;
 692                         page_free_at_startup(pp);
 693                         continue;
 694                 }
 695 
 696                 /*
 697                  * Otherwise start a new large page.
 698                  */
 699                 pp->p_szc = szc;
 700                 cnt++;
 701                 root = pp;
 702         }
 703         ASSERT(root == NULL && cnt == 0);
 704 }
 705 
 706 /*
 707  * Find a page representing the specified [vp, offset].
  708  * If we find the page but it is in transit coming in,
 709  * it will have an "exclusive" lock and we wait for
 710  * the i/o to complete.  A page found on the free list
 711  * is always reclaimed and then locked.  On success, the page
 712  * is locked, its data is valid and it isn't on the free
 713  * list, while a NULL is returned if the page doesn't exist.
 714  */
 715 page_t *
 716 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
 717 {
 718         return (page_lookup_create(vp, off, se, NULL, NULL, 0));
 719 }
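
/*
 * Usage sketch (illustrative only; error handling and i/o elided).  A file
 * system getpage path typically looks roughly like:
 *
 *        pp = page_lookup(vp, off, SE_SHARED);
 *        if (pp == NULL) {
 *                pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, addr);
 *                ...read the contents in, then page_io_unlock(pp)...
 *        }
 */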
 720 
 721 /*
 722  * Find a page representing the specified [vp, offset].
 723  * We either return the one we found or, if passed in,
 724  * create one with identity of [vp, offset] of the
 725  * pre-allocated page. If we find existing page but it is
  726  * in transit coming in, it will have an "exclusive" lock
 727  * and we wait for the i/o to complete.  A page found on
 728  * the free list is always reclaimed and then locked.
 729  * On success, the page is locked, its data is valid and
 730  * it isn't on the free list, while a NULL is returned
  731  * if the page doesn't exist and newpp is NULL.
 732  */
 733 page_t *
 734 page_lookup_create(
 735         vnode_t *vp,
 736         u_offset_t off,
 737         se_t se,
 738         page_t *newpp,
 739         spgcnt_t *nrelocp,
 740         int flags)
 741 {
 742         page_t          *pp;
 743         kmutex_t        *phm;
 744         ulong_t         index;
 745         uint_t          hash_locked;
 746         uint_t          es;
 747 
 748         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
 749         VM_STAT_ADD(page_lookup_cnt[0]);
 750         ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
 751 
 752         /*
 753          * Acquire the appropriate page hash lock since
 754          * we have to search the hash list.  Pages that
 755          * hash to this list can't change identity while
 756          * this lock is held.
 757          */
 758         hash_locked = 0;
 759         index = PAGE_HASH_FUNC(vp, off);
 760         phm = NULL;
 761 top:
 762         pp = page_hash_search(index, vp, off);
 763         if (pp != NULL) {
 764                 VM_STAT_ADD(page_lookup_cnt[1]);
 765                 es = (newpp != NULL) ? 1 : 0;
 766                 es |= flags;
 767                 if (!hash_locked) {
 768                         VM_STAT_ADD(page_lookup_cnt[2]);
 769                         if (!page_try_reclaim_lock(pp, se, es)) {
 770                                 /*
 771                                  * On a miss, acquire the phm.  Then
 772                                  * next time, page_lock() will be called,
 773                                  * causing a wait if the page is busy.
  774                  * Just looping with page_trylock() would
 775                                  * get pretty boring.
 776                                  */
 777                                 VM_STAT_ADD(page_lookup_cnt[3]);
 778                                 phm = PAGE_HASH_MUTEX(index);
 779                                 mutex_enter(phm);
 780                                 hash_locked = 1;
 781                                 goto top;
 782                         }
 783                 } else {
 784                         VM_STAT_ADD(page_lookup_cnt[4]);
 785                         if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
 786                                 VM_STAT_ADD(page_lookup_cnt[5]);
 787                                 goto top;
 788                         }
 789                 }
 790 
 791                 /*
 792                  * Since `pp' is locked it can not change identity now.
 793                  * Reconfirm we locked the correct page.
 794                  *
 795                  * Both the p_vnode and p_offset *must* be cast volatile
 796                  * to force a reload of their values: The page_hash_search
 797                  * function will have stuffed p_vnode and p_offset into
 798                  * registers before calling page_trylock(); another thread,
 799                  * actually holding the hash lock, could have changed the
 800                  * page's identity in memory, but our registers would not
 801                  * be changed, fooling the reconfirmation.  If the hash
 802                  * lock was held during the search, the casting would
 803                  * not be needed.
 804                  */
 805                 VM_STAT_ADD(page_lookup_cnt[6]);
 806                 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
 807                     ((volatile u_offset_t)(pp->p_offset) != off)) {
 808                         VM_STAT_ADD(page_lookup_cnt[7]);
 809                         if (hash_locked) {
 810                                 panic("page_lookup_create: lost page %p",
 811                                     (void *)pp);
 812                                 /*NOTREACHED*/
 813                         }
 814                         page_unlock(pp);
 815                         phm = PAGE_HASH_MUTEX(index);
 816                         mutex_enter(phm);
 817                         hash_locked = 1;
 818                         goto top;
 819                 }
 820 
 821                 /*
 822                  * If page_trylock() was called, then pp may still be on
 823                  * the cachelist (can't be on the free list, it would not
 824                  * have been found in the search).  If it is on the
 825                  * cachelist it must be pulled now. To pull the page from
 826                  * the cachelist, it must be exclusively locked.
 827                  *
 828                  * The other big difference between page_trylock() and
 829                  * page_lock(), is that page_lock() will pull the
 830                  * page from whatever free list (the cache list in this
 831                  * case) the page is on.  If page_trylock() was used
 832                  * above, then we have to do the reclaim ourselves.
 833                  */
 834                 if ((!hash_locked) && (PP_ISFREE(pp))) {
 835                         ASSERT(PP_ISAGED(pp) == 0);
 836                         VM_STAT_ADD(page_lookup_cnt[8]);
 837 
 838                         /*
  839                          * page_reclaim will ensure that we
 840                          * have this page exclusively
 841                          */
 842 
 843                         if (!page_reclaim(pp, NULL)) {
 844                                 /*
 845                                  * Page_reclaim dropped whatever lock
 846                                  * we held.
 847                                  */
 848                                 VM_STAT_ADD(page_lookup_cnt[9]);
 849                                 phm = PAGE_HASH_MUTEX(index);
 850                                 mutex_enter(phm);
 851                                 hash_locked = 1;
 852                                 goto top;
 853                         } else if (se == SE_SHARED && newpp == NULL) {
 854                                 VM_STAT_ADD(page_lookup_cnt[10]);
 855                                 page_downgrade(pp);
 856                         }
 857                 }
 858 
 859                 if (hash_locked) {
 860                         mutex_exit(phm);
 861                 }
 862 
 863                 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
 864                     PAGE_EXCL(pp) && nrelocp != NULL) {
 865                         ASSERT(nrelocp != NULL);
 866                         (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
 867                             NULL);
 868                         if (*nrelocp > 0) {
 869                                 VM_STAT_COND_ADD(*nrelocp == 1,
 870                                     page_lookup_cnt[11]);
 871                                 VM_STAT_COND_ADD(*nrelocp > 1,
 872                                     page_lookup_cnt[12]);
 873                                 pp = newpp;
 874                                 se = SE_EXCL;
 875                         } else {
 876                                 if (se == SE_SHARED) {
 877                                         page_downgrade(pp);
 878                                 }
 879                                 VM_STAT_ADD(page_lookup_cnt[13]);
 880                         }
 881                 } else if (newpp != NULL && nrelocp != NULL) {
 882                         if (PAGE_EXCL(pp) && se == SE_SHARED) {
 883                                 page_downgrade(pp);
 884                         }
 885                         VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
 886                             page_lookup_cnt[14]);
 887                         VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
 888                             page_lookup_cnt[15]);
 889                         VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
 890                             page_lookup_cnt[16]);
 891                 } else if (newpp != NULL && PAGE_EXCL(pp)) {
 892                         se = SE_EXCL;
 893                 }
 894         } else if (!hash_locked) {
 895                 VM_STAT_ADD(page_lookup_cnt[17]);
 896                 phm = PAGE_HASH_MUTEX(index);
 897                 mutex_enter(phm);
 898                 hash_locked = 1;
 899                 goto top;
 900         } else if (newpp != NULL) {
 901                 /*
 902                  * If we have a preallocated page then
 903                  * insert it now and basically behave like
 904                  * page_create.
 905                  */
 906                 VM_STAT_ADD(page_lookup_cnt[18]);
 907                 /*
 908                  * Since we hold the page hash mutex and
 909                  * just searched for this page, page_hashin
 910                  * had better not fail.  If it does, that
 911                  * means some thread did not follow the
 912                  * page hash mutex rules.  Panic now and
 913                  * get it over with.  As usual, go down
 914                  * holding all the locks.
 915                  */
 916                 ASSERT(MUTEX_HELD(phm));
 917                 if (!page_hashin(newpp, vp, off, phm)) {
 918                         ASSERT(MUTEX_HELD(phm));
 919                         panic("page_lookup_create: hashin failed %p %p %llx %p",
 920                             (void *)newpp, (void *)vp, off, (void *)phm);
 921                         /*NOTREACHED*/
 922                 }
 923                 ASSERT(MUTEX_HELD(phm));
 924                 mutex_exit(phm);
 925                 phm = NULL;
 926                 page_set_props(newpp, P_REF);
 927                 page_io_lock(newpp);
 928                 pp = newpp;
 929                 se = SE_EXCL;
 930         } else {
 931                 VM_STAT_ADD(page_lookup_cnt[19]);
 932                 mutex_exit(phm);
 933         }
 934 
 935         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
 936 
 937         ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
 938 
 939         return (pp);
 940 }
 941 
 942 /*
 943  * Search the hash list for the page representing the
 944  * specified [vp, offset] and return it locked.  Skip
 945  * free pages and pages that cannot be locked as requested.
 946  * Used while attempting to kluster pages.
 947  */
 948 page_t *
 949 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
 950 {
 951         page_t          *pp;
 952         kmutex_t        *phm;
 953         ulong_t         index;
 954         uint_t          locked;
 955 
 956         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
 957         VM_STAT_ADD(page_lookup_nowait_cnt[0]);
 958 
 959         index = PAGE_HASH_FUNC(vp, off);
 960         pp = page_hash_search(index, vp, off);
 961         locked = 0;
 962         if (pp == NULL) {
 963 top:
 964                 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
 965                 locked = 1;
 966                 phm = PAGE_HASH_MUTEX(index);
 967                 mutex_enter(phm);
 968                 pp = page_hash_search(index, vp, off);
 969         }
 970 
 971         if (pp == NULL || PP_ISFREE(pp)) {
 972                 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
 973                 pp = NULL;
 974         } else {
 975                 if (!page_trylock(pp, se)) {
 976                         VM_STAT_ADD(page_lookup_nowait_cnt[3]);
 977                         pp = NULL;
 978                 } else {
 979                         VM_STAT_ADD(page_lookup_nowait_cnt[4]);
 980                         /*
 981                          * See the comment in page_lookup()
 982                          */
 983                         if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
 984                             ((u_offset_t)(pp->p_offset) != off)) {
 985                                 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
 986                                 if (locked) {
 987                                         panic("page_lookup_nowait %p",
 988                                             (void *)pp);
 989                                         /*NOTREACHED*/
 990                                 }
 991                                 page_unlock(pp);
 992                                 goto top;
 993                         }
 994                         if (PP_ISFREE(pp)) {
 995                                 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
 996                                 page_unlock(pp);
 997                                 pp = NULL;
 998                         }
 999                 }
1000         }
1001         if (locked) {
1002                 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
1003                 mutex_exit(phm);
1004         }
1005 
1006         ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
1007 
1008         return (pp);
1009 }
1010 
1011 /*
1012  * Search the hash list for a page with the specified [vp, off]
1013  * that is known to exist and is already locked.  This routine
1014  * is typically used by segment SOFTUNLOCK routines.
1015  */
1016 page_t *
1017 page_find(vnode_t *vp, u_offset_t off)
1018 {
1019         page_t          *pp;
1020         kmutex_t        *phm;
1021         ulong_t         index;
1022 
1023         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1024         VM_STAT_ADD(page_find_cnt);
1025 
1026         index = PAGE_HASH_FUNC(vp, off);
1027         phm = PAGE_HASH_MUTEX(index);
1028 
1029         mutex_enter(phm);
1030         pp = page_hash_search(index, vp, off);
1031         mutex_exit(phm);
1032 
1033         ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1034         return (pp);
1035 }
1036 
1037 /*
1038  * Determine whether a page with the specified [vp, off]
1039  * currently exists in the system.  Obviously this should
1040  * only be considered as a hint since nothing prevents the
1041  * page from disappearing or appearing immediately after
1042  * the return from this routine. Subsequently, we don't
1043  * even bother to lock the list.
1044  */
1045 page_t *
1046 page_exists(vnode_t *vp, u_offset_t off)
1047 {
1048         ulong_t         index;
1049 
1050         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1051         VM_STAT_ADD(page_exists_cnt);
1052 
1053         index = PAGE_HASH_FUNC(vp, off);
1054 
1055         return (page_hash_search(index, vp, off));
1056 }
1057 
1058 /*
1059  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
 1060  * page_size(szc)) range.  If they exist and ppa is not NULL, fill the ppa array
1061  * with these pages locked SHARED. If necessary reclaim pages from
1062  * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1063  *
 1064  * If we fail to lock the pages we still return 1 if they exist and are
 1065  * contiguous, but then the return value is only a hint and the ppa array
 1066  * won't be filled; callers should pre-set ppa[0] to NULL to detect this.
1067  *
1068  * Returns 0 if pages don't exist or not physically contiguous.
1069  *
 1070  * This routine doesn't work for anonymous (swapfs) pages.
1071  */
1072 int
1073 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1074 {
1075         pgcnt_t pages;
1076         pfn_t pfn;
1077         page_t *rootpp;
1078         pgcnt_t i;
1079         pgcnt_t j;
1080         u_offset_t save_off = off;
1081         ulong_t index;
1082         kmutex_t *phm;
1083         page_t *pp;
1084         uint_t pszc;
1085         int loopcnt = 0;
1086 
1087         ASSERT(szc != 0);
1088         ASSERT(vp != NULL);
1089         ASSERT(!IS_SWAPFSVP(vp));
1090         ASSERT(!VN_ISKAS(vp));
1091 
1092 again:
1093         if (++loopcnt > 3) {
1094                 VM_STAT_ADD(page_exphcontg[0]);
1095                 return (0);
1096         }
1097 
1098         index = PAGE_HASH_FUNC(vp, off);
1099         phm = PAGE_HASH_MUTEX(index);
1100 
1101         mutex_enter(phm);
1102         pp = page_hash_search(index, vp, off);
1103         mutex_exit(phm);
1104 
1105         VM_STAT_ADD(page_exphcontg[1]);
1106 
1107         if (pp == NULL) {
1108                 VM_STAT_ADD(page_exphcontg[2]);
1109                 return (0);
1110         }
1111 
1112         pages = page_get_pagecnt(szc);
1113         rootpp = pp;
1114         pfn = rootpp->p_pagenum;
1115 
1116         if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1117                 VM_STAT_ADD(page_exphcontg[3]);
1118                 if (!page_trylock(pp, SE_SHARED)) {
1119                         VM_STAT_ADD(page_exphcontg[4]);
1120                         return (1);
1121                 }
1122                 /*
1123                  * Also check whether p_pagenum was modified by DR.
1124                  */
1125                 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1126                     pp->p_offset != off || pp->p_pagenum != pfn) {
1127                         VM_STAT_ADD(page_exphcontg[5]);
1128                         page_unlock(pp);
1129                         off = save_off;
1130                         goto again;
1131                 }
1132                 /*
 1133                  * Since szc was non-zero and the vnode and offset matched
 1134                  * after we locked the page, it can't become free on us.
1135                  */
1136                 ASSERT(!PP_ISFREE(pp));
1137                 if (!IS_P2ALIGNED(pfn, pages)) {
1138                         page_unlock(pp);
1139                         return (0);
1140                 }
1141                 ppa[0] = pp;
1142                 pp++;
1143                 off += PAGESIZE;
1144                 pfn++;
1145                 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1146                         if (!page_trylock(pp, SE_SHARED)) {
1147                                 VM_STAT_ADD(page_exphcontg[6]);
1148                                 pp--;
1149                                 while (i-- > 0) {
1150                                         page_unlock(pp);
1151                                         pp--;
1152                                 }
1153                                 ppa[0] = NULL;
1154                                 return (1);
1155                         }
1156                         if (pp->p_szc != pszc) {
1157                                 VM_STAT_ADD(page_exphcontg[7]);
1158                                 page_unlock(pp);
1159                                 pp--;
1160                                 while (i-- > 0) {
1161                                         page_unlock(pp);
1162                                         pp--;
1163                                 }
1164                                 ppa[0] = NULL;
1165                                 off = save_off;
1166                                 goto again;
1167                         }
1168                         /*
 1169                          * szc is the same as for the previously locked pages
 1170                          * with the right identity.  Since this page had the
 1171                          * correct szc after we locked it, it can't get freed
 1172                          * or destroyed and so must have the expected identity.
1173                          */
1174                         ASSERT(!PP_ISFREE(pp));
1175                         if (pp->p_vnode != vp ||
1176                             pp->p_offset != off) {
1177                                 panic("page_exists_physcontig: "
1178                                     "large page identity doesn't match");
1179                         }
1180                         ppa[i] = pp;
1181                         ASSERT(pp->p_pagenum == pfn);
1182                 }
1183                 VM_STAT_ADD(page_exphcontg[8]);
1184                 ppa[pages] = NULL;
1185                 return (1);
1186         } else if (pszc >= szc) {
1187                 VM_STAT_ADD(page_exphcontg[9]);
1188                 if (!IS_P2ALIGNED(pfn, pages)) {
1189                         return (0);
1190                 }
1191                 return (1);
1192         }
1193 
1194         if (!IS_P2ALIGNED(pfn, pages)) {
1195                 VM_STAT_ADD(page_exphcontg[10]);
1196                 return (0);
1197         }
1198 
1199         if (page_numtomemseg_nolock(pfn) !=
1200             page_numtomemseg_nolock(pfn + pages - 1)) {
1201                 VM_STAT_ADD(page_exphcontg[11]);
1202                 return (0);
1203         }
1204 
1205         /*
 1206          * We loop over the pages 4 times to promote the page size.
1207          * We're extra cautious to promote page size atomically with respect
1208          * to everybody else.  But we can probably optimize into 1 loop if
1209          * this becomes an issue.
1210          */
1211 
1212         for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1213                 if (!page_trylock(pp, SE_EXCL)) {
1214                         VM_STAT_ADD(page_exphcontg[12]);
1215                         break;
1216                 }
1217                 /*
1218                  * Check whether p_pagenum was modified by DR.
1219                  */
1220                 if (pp->p_pagenum != pfn) {
1221                         page_unlock(pp);
1222                         break;
1223                 }
1224                 if (pp->p_vnode != vp ||
1225                     pp->p_offset != off) {
1226                         VM_STAT_ADD(page_exphcontg[13]);
1227                         page_unlock(pp);
1228                         break;
1229                 }
1230                 if (pp->p_szc >= szc) {
1231                         ASSERT(i == 0);
1232                         page_unlock(pp);
1233                         off = save_off;
1234                         goto again;
1235                 }
1236         }
1237 
1238         if (i != pages) {
1239                 VM_STAT_ADD(page_exphcontg[14]);
1240                 --pp;
1241                 while (i-- > 0) {
1242                         page_unlock(pp);
1243                         --pp;
1244                 }
1245                 return (0);
1246         }
1247 
1248         pp = rootpp;
1249         for (i = 0; i < pages; i++, pp++) {
1250                 if (PP_ISFREE(pp)) {
1251                         VM_STAT_ADD(page_exphcontg[15]);
1252                         ASSERT(!PP_ISAGED(pp));
1253                         ASSERT(pp->p_szc == 0);
1254                         if (!page_reclaim(pp, NULL)) {
1255                                 break;
1256                         }
1257                 } else {
1258                         ASSERT(pp->p_szc < szc);
1259                         VM_STAT_ADD(page_exphcontg[16]);
1260                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1261                 }
1262         }
1263         if (i < pages) {
1264                 VM_STAT_ADD(page_exphcontg[17]);
1265                 /*
1266                  * page_reclaim failed because we were out of memory.
 1267                  * Drop the rest of the locks and return because this page
 1268                  * must already be reallocated anyway.
1269                  */
1270                 pp = rootpp;
1271                 for (j = 0; j < pages; j++, pp++) {
1272                         if (j != i) {
1273                                 page_unlock(pp);
1274                         }
1275                 }
1276                 return (0);
1277         }
1278 
1279         off = save_off;
1280         pp = rootpp;
1281         for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1282                 ASSERT(PAGE_EXCL(pp));
1283                 ASSERT(!PP_ISFREE(pp));
1284                 ASSERT(!hat_page_is_mapped(pp));
1285                 ASSERT(pp->p_vnode == vp);
1286                 ASSERT(pp->p_offset == off);
1287                 pp->p_szc = szc;
1288         }
1289         pp = rootpp;
1290         for (i = 0; i < pages; i++, pp++) {
1291                 if (ppa == NULL) {
1292                         page_unlock(pp);
1293                 } else {
1294                         ppa[i] = pp;
1295                         page_downgrade(ppa[i]);
1296                 }
1297         }
1298         if (ppa != NULL) {
1299                 ppa[pages] = NULL;
1300         }
1301         VM_STAT_ADD(page_exphcontg[18]);
1302         ASSERT(vp->v_pages != NULL);
1303         return (1);
1304 }
1305 
1306 /*
1307  * Determine whether a page with the specified [vp, off]
1308  * currently exists in the system and if so return its
1309  * size code. Obviously this should only be considered as
1310  * a hint since nothing prevents the page from disappearing
1311  * or appearing immediately after the return from this routine.
1312  */
1313 int
1314 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1315 {
1316         page_t          *pp;
1317         kmutex_t        *phm;
1318         ulong_t         index;
1319         int             rc = 0;
1320 
1321         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1322         ASSERT(szc != NULL);
1323         VM_STAT_ADD(page_exists_forreal_cnt);
1324 
1325         index = PAGE_HASH_FUNC(vp, off);
1326         phm = PAGE_HASH_MUTEX(index);
1327 
1328         mutex_enter(phm);
1329         pp = page_hash_search(index, vp, off);
1330         if (pp != NULL) {
1331                 *szc = pp->p_szc;
1332                 rc = 1;
1333         }
1334         mutex_exit(phm);
1335         return (rc);
1336 }
1337 
1338 /* wakeup threads waiting for pages in page_create_get_something() */
1339 void
1340 wakeup_pcgs(void)
1341 {
1342         if (!CV_HAS_WAITERS(&pcgs_cv))
1343                 return;
1344         cv_broadcast(&pcgs_cv);
1345 }
1346 
1347 /*
1348  * 'freemem' is used all over the kernel as an indication of how many
1349  * pages are free (either on the cache list or on the free page list)
1350  * in the system.  In very few places is a really accurate 'freemem'
 1351  * needed.  To avoid contention on the lock protecting a
 1352  * single freemem, it was spread out into NCPU buckets.  set_freemem()
1353  * sets freemem to the total of all NCPU buckets.  It is called from
1354  * clock() on each TICK.
1355  */
1356 void
1357 set_freemem(void)
1358 {
1359         struct pcf      *p;
1360         ulong_t         t;
1361         uint_t          i;
1362 
1363         t = 0;
1364         p = pcf;
1365         for (i = 0;  i < pcf_fanout; i++) {
1366                 t += p->pcf_count;
1367                 p++;
1368         }
1369         freemem = t;
1370 
1371         /*
1372          * Don't worry about grabbing mutex.  It's not that
1373          * critical if we miss a tick or two.  This is
1374          * where we wakeup possible delayers in
1375          * page_create_get_something().
1376          */
1377         wakeup_pcgs();
1378 }
1379 
1380 ulong_t
1381 get_freemem()
1382 {
1383         struct pcf      *p;
1384         ulong_t         t;
1385         uint_t          i;
1386 
1387         t = 0;
1388         p = pcf;
1389         for (i = 0; i < pcf_fanout; i++) {
1390                 t += p->pcf_count;
1391                 p++;
1392         }
1393         /*
1394          * We just calculated it, might as well set it.
1395          */
1396         freemem = t;
1397         return (t);
1398 }
1399 
1400 /*
1401  * Acquire all of the page cache & free (pcf) locks.
1402  */
1403 void
1404 pcf_acquire_all()
1405 {
1406         struct pcf      *p;
1407         uint_t          i;
1408 
1409         p = pcf;
1410         for (i = 0; i < pcf_fanout; i++) {
1411                 mutex_enter(&p->pcf_lock);
1412                 p++;
1413         }
1414 }
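
/*
 * Added note: pcf_acquire_all() always starts at pcf[0], which satisfies the
 * ordering rule described above the pcf[] array: whenever more than one pcf
 * lock is taken at a time, acquisition must begin at the front of the array
 * to avoid deadlock.
 */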
1415 
1416 /*
1417  * Release all the pcf_locks.
1418  */
1419 void
1420 pcf_release_all()
1421 {
1422         struct pcf      *p;
1423         uint_t          i;
1424 
1425         p = pcf;
1426         for (i = 0; i < pcf_fanout; i++) {
1427                 mutex_exit(&p->pcf_lock);
1428                 p++;
1429         }
1430 }
1431 
1432 /*
1433  * Inform the VM system that we need some pages freed up.
1434  * Calls must be symmetric, e.g.:
1435  *
1436  *      page_needfree(100);
1437  *      wait a bit;
1438  *      page_needfree(-100);
1439  */
1440 void
1441 page_needfree(spgcnt_t npages)
1442 {
1443         mutex_enter(&new_freemem_lock);
1444         needfree += npages;
1445         mutex_exit(&new_freemem_lock);
1446 }
1447 
1448 /*
1449  * Throttle for page_create(): try to prevent freemem from dropping
1450  * below throttlefree.  We can't provide a 100% guarantee because
1451  * KM_NOSLEEP allocations, page_reclaim(), and various other things
1452  * nibble away at the freelist.  However, we can block all PG_WAIT
1453  * allocations until memory becomes available.  The motivation is
1454  * that several things can fall apart when there's no free memory:
1455  *
1456  * (1) If pageout() needs memory to push a page, the system deadlocks.
1457  *
1458  * (2) By (broken) specification, timeout(9F) can neither fail nor
1459  *     block, so it has no choice but to panic the system if it
1460  *     cannot allocate a callout structure.
1461  *
1462  * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1463  *     it panics if it cannot allocate a callback structure.
1464  *
1465  * (4) Untold numbers of third-party drivers have not yet been hardened
1466  *     against KM_NOSLEEP and/or allocb() failures; they simply assume
1467  *     success and panic the system with a data fault on failure.
1468  *     (The long-term solution to this particular problem is to ship
1469  *     hostile fault-injecting DEBUG kernels with the DDK.)
1470  *
1471  * It is theoretically impossible to guarantee success of non-blocking
1472  * allocations, but in practice, this throttle is very hard to break.
1473  */
1474 static int
1475 page_create_throttle(pgcnt_t npages, int flags)
1476 {
1477         ulong_t fm;
1478         uint_t  i;
1479         pgcnt_t tf;     /* effective value of throttlefree */
1480 
1481         atomic_inc_64(&n_throttle);
1482 
1483         /*
1484          * Normal priority allocations.
1485          */
1486         if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1487                 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1488                 return (freemem >= npages + throttlefree);
1489         }
1490 
1491         /*
1492          * Never deny pages when:
1493          * - it's a thread that cannot block [NOMEMWAIT()]
1494          * - the allocation cannot block and must not fail
1495          * - the allocation cannot block and has a pageout dispensation
1496          */
1497         if (NOMEMWAIT() ||
1498             ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1499             ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1500                 return (1);
1501 
1502         /*
1503          * If the allocation can't block, we look favorably upon it
1504          * unless we're below pageout_reserve.  In that case we fail
1505          * the allocation because we want to make sure there are a few
1506          * pages available for pageout.
1507          */
1508         if ((flags & PG_WAIT) == 0)
1509                 return (freemem >= npages + pageout_reserve);
1510 
1511         /* Calculate the effective throttlefree value */
1512         tf = throttlefree -
1513             ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1514 
1515         WAKE_PAGEOUT_SCANNER();
1516 
1517         for (;;) {
1518                 fm = 0;
1519                 pcf_acquire_all();
1520                 mutex_enter(&new_freemem_lock);
1521                 for (i = 0; i < pcf_fanout; i++) {
1522                         fm += pcf[i].pcf_count;
1523                         pcf[i].pcf_wait++;
1524                         mutex_exit(&pcf[i].pcf_lock);
1525                 }
1526                 freemem = fm;
1527                 if (freemem >= npages + tf) {
1528                         mutex_exit(&new_freemem_lock);
1529                         break;
1530                 }
1531                 needfree += npages;
1532                 freemem_wait++;
1533                 cv_wait(&freemem_cv, &new_freemem_lock);
1534                 freemem_wait--;
1535                 needfree -= npages;
1536                 mutex_exit(&new_freemem_lock);
1537         }
1538         return (1);
1539 }
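
     /*
      * A sketch of the call pattern used by page_create_va() below: callers
      * only invoke the throttle once freemem has dipped near throttlefree,
      * and treat a zero return as allocation failure:
      *
      *      if (freemem <= throttlefree + npages)
      *              if (!page_create_throttle(npages, flags))
      *                      return (NULL);
      */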
1540 
1541 /*
1542  * page_create_wait() is called to either coalesce pages from the
1543  * different pcf buckets or to wait because there simply are not
1544  * enough pages to satisfy the caller's request.
1545  *
1546  * Sadly, this is called from platform/vm/vm_machdep.c
1547  */
1548 int
1549 page_create_wait(pgcnt_t npages, uint_t flags)
1550 {
1551         pgcnt_t         total;
1552         uint_t          i;
1553         struct pcf      *p;
1554 
1555         /*
1556          * Wait until there are enough free pages to satisfy our
1557          * entire request.
1558          * We set needfree += npages before prodding pageout, to make sure
1559          * it does real work when npages > lotsfree > freemem.
1560          */
1561         VM_STAT_ADD(page_create_not_enough);
1562 
1563         ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1564 checkagain:
1565         if ((flags & PG_NORELOC) &&
1566             kcage_freemem < kcage_throttlefree + npages)
1567                 (void) kcage_create_throttle(npages, flags);
1568 
1569         if (freemem < npages + throttlefree)
1570                 if (!page_create_throttle(npages, flags))
1571                         return (0);
1572 
1573         if (pcf_decrement_bucket(npages) ||
1574             pcf_decrement_multiple(&total, npages, 0))
1575                 return (1);
1576 
1577         /*
1578          * All of the pcf locks are held and there are not enough pages
1579          * to satisfy the request (total < npages).
1580          * Be sure to acquire the new_freemem_lock before dropping
1581          * the pcf locks.  This prevents dropping wakeups in page_free().
1582          * The order is always pcf_lock then new_freemem_lock.
1583          *
1584          * Since we hold all the pcf locks, it is a good time to set freemem.
1585          *
1586          * If the caller does not want to wait, return now.
1587          * Else turn the pageout daemon loose to find something
1588          * and wait till it does.
1589          *
1590          */
1591         freemem = total;
1592 
1593         if ((flags & PG_WAIT) == 0) {
1594                 pcf_release_all();
1595 
1596                 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1597                 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1598                 return (0);
1599         }
1600 
1601         ASSERT(proc_pageout != NULL);
1602         WAKE_PAGEOUT_SCANNER();
1603 
1604         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1605             "page_create_sleep_start: freemem %ld needfree %ld",
1606             freemem, needfree);
1607 
1608         /*
1609          * We are going to wait.
1610          * We currently hold all of the pcf_locks;
1611          * get the new_freemem_lock (it protects freemem_wait)
1612          * before dropping the pcf_locks.
1613          */
1614         mutex_enter(&new_freemem_lock);
1615 
1616         p = pcf;
1617         for (i = 0; i < pcf_fanout; i++) {
1618                 p->pcf_wait++;
1619                 mutex_exit(&p->pcf_lock);
1620                 p++;
1621         }
1622 
1623         needfree += npages;
1624         freemem_wait++;
1625 
1626         cv_wait(&freemem_cv, &new_freemem_lock);
1627 
1628         freemem_wait--;
1629         needfree -= npages;
1630 
1631         mutex_exit(&new_freemem_lock);
1632 
1633         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1634             "page_create_sleep_end: freemem %ld needfree %ld",
1635             freemem, needfree);
1636 
1637         VM_STAT_ADD(page_create_not_enough_again);
1638         goto checkagain;
1639 }
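
     /*
      * The accounting done here must eventually be balanced, either by
      * freeing the pages or by calling page_create_putback().  A sketch of
      * the pattern used by page_alloc_pages() below (hypothetical error
      * path shown):
      *
      *      (void) page_create_wait(npgs, PG_WAIT);
      *      ...
      *      if (the allocation could not be completed)
      *              page_create_putback(npgs);
      */
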
1640 /*
1641  * A routine to do the opposite of page_create_wait().
1642  */
1643 void
1644 page_create_putback(spgcnt_t npages)
1645 {
1646         struct pcf      *p;
1647         pgcnt_t         lump;
1648         uint_t          *which;
1649 
1650         /*
1651          * When a contiguous lump is broken up, we have to
1652          * deal with lots of pages (min 64) so lets spread
1653          * deal with lots of pages (min 64), so let's spread
1654          */
1655         lump = roundup(npages, pcf_fanout) / pcf_fanout;
1656         freemem += npages;
1657 
1658         for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1659                 which = &p->pcf_count;
1660 
1661                 mutex_enter(&p->pcf_lock);
1662 
1663                 if (p->pcf_block) {
1664                         which = &p->pcf_reserve;
1665                 }
1666 
1667                 if (lump < npages) {
1668                         *which += (uint_t)lump;
1669                         npages -= lump;
1670                 } else {
1671                         *which += (uint_t)npages;
1672                         npages = 0;
1673                 }
1674 
1675                 if (p->pcf_wait) {
1676                         mutex_enter(&new_freemem_lock);
1677                         /*
1678                          * Check to see if some other thread
1679                          * is actually waiting.  Another bucket
1680                          * may have woken it up by now.  If there
1681                          * are no waiters, then set our pcf_wait
1682                          * count to zero to avoid coming in here
1683                          * next time.
1684                          */
1685                         if (freemem_wait) {
1686                                 if (npages > 1) {
1687                                         cv_broadcast(&freemem_cv);
1688                                 } else {
1689                                         cv_signal(&freemem_cv);
1690                                 }
1691                                 p->pcf_wait--;
1692                         } else {
1693                                 p->pcf_wait = 0;
1694                         }
1695                         mutex_exit(&new_freemem_lock);
1696                 }
1697                 mutex_exit(&p->pcf_lock);
1698         }
1699         ASSERT(npages == 0);
1700 }
1701 
1702 /*
1703  * A helper routine for page_create_get_something.
1704  * The indenting got too deep down there.
1705  * Unblock the pcf counters.  Any pages freed after
1706  * pcf_block got set are moved to pcf_count and
1707  * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1708  */
1709 static void
1710 pcgs_unblock(void)
1711 {
1712         int             i;
1713         struct pcf      *p;
1714 
1715         /* Update freemem while we're here. */
1716         freemem = 0;
1717         p = pcf;
1718         for (i = 0; i < pcf_fanout; i++) {
1719                 mutex_enter(&p->pcf_lock);
1720                 ASSERT(p->pcf_count == 0);
1721                 p->pcf_count = p->pcf_reserve;
1722                 p->pcf_block = 0;
1723                 freemem += p->pcf_count;
1724                 if (p->pcf_wait) {
1725                         mutex_enter(&new_freemem_lock);
1726                         if (freemem_wait) {
1727                                 if (p->pcf_reserve > 1) {
1728                                         cv_broadcast(&freemem_cv);
1729                                         p->pcf_wait = 0;
1730                                 } else {
1731                                         cv_signal(&freemem_cv);
1732                                         p->pcf_wait--;
1733                                 }
1734                         } else {
1735                                 p->pcf_wait = 0;
1736                         }
1737                         mutex_exit(&new_freemem_lock);
1738                 }
1739                 p->pcf_reserve = 0;
1740                 mutex_exit(&p->pcf_lock);
1741                 p++;
1742         }
1743 }
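
     /*
      * A sketch of the block/unblock pairing as used by
      * page_create_get_something() below:
      *
      *      mutex_enter(&pcgs_lock);
      *      ... set pcf_block, move pcf_count to pcf_reserve ...
      *      ... retry the free and cache lists ...
      *      pcgs_unblock();
      *      mutex_exit(&pcgs_lock);
      */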
1744 
1745 /*
1746  * Called from page_create_va() when both the cache and free lists
1747  * have been checked once.
1748  *
1749  * Either returns a page or panics since the accounting was done
1750  * way before we got here.
1751  *
1752  * We don't come here often, so leave the accounting on permanently.
1753  */
1754 
1755 #define MAX_PCGS        100
1756 
1757 #ifdef  DEBUG
1758 #define PCGS_TRIES      100
1759 #else   /* DEBUG */
1760 #define PCGS_TRIES      10
1761 #endif  /* DEBUG */
1762 
1763 #ifdef  VM_STATS
1764 uint_t  pcgs_counts[PCGS_TRIES];
1765 uint_t  pcgs_too_many;
1766 uint_t  pcgs_entered;
1767 uint_t  pcgs_entered_noreloc;
1768 uint_t  pcgs_locked;
1769 uint_t  pcgs_cagelocked;
1770 #endif  /* VM_STATS */
1771 
1772 static page_t *
1773 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1774     caddr_t vaddr, uint_t flags)
1775 {
1776         uint_t          count;
1777         page_t          *pp;
1778         uint_t          locked, i;
1779         struct  pcf     *p;
1780         lgrp_t          *lgrp;
1781         int             cagelocked = 0;
1782 
1783         VM_STAT_ADD(pcgs_entered);
1784 
1785         /*
1786          * Tap any reserve freelists: if we fail now, we'll die
1787          * since the page(s) we're looking for have already been
1788          * accounted for.
1789          */
1790         flags |= PG_PANIC;
1791 
1792         if ((flags & PG_NORELOC) != 0) {
1793                 VM_STAT_ADD(pcgs_entered_noreloc);
1794                 /*
1795                  * Requests for free pages from critical threads
1796                  * such as pageout still won't throttle here, but
1797                  * we must try again, to give the cageout thread
1798                  * another chance to catch up. Since we already
1799                  * accounted for the pages, we had better get them
1800                  * this time.
1801                  *
1802                  * N.B. All non-critical threads acquire the pcgs_cagelock
1803                  * to serialize access to the freelists. This implements a
1804          * turnstile-type synchronization to avoid starvation of
1805                  * critical requests for PG_NORELOC memory by non-critical
1806                  * threads: all non-critical threads must acquire a 'ticket'
1807                  * before passing through, which entails making sure
1808                  * kcage_freemem won't fall below minfree prior to grabbing
1809                  * pages from the freelists.
1810                  */
1811                 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1812                         mutex_enter(&pcgs_cagelock);
1813                         cagelocked = 1;
1814                         VM_STAT_ADD(pcgs_cagelocked);
1815                 }
1816         }
1817 
1818         /*
1819          * Time to get serious.
1820          * We failed to get a `correctly colored' page from both the
1821          * free and cache lists.
1822          * We escalate in stages.
1823          *
1824          * First try both lists without worrying about color.
1825          *
1826          * Then, grab all page accounting locks (i.e. pcf[]) and
1827          * steal any pages that they have and set the pcf_block flag to
1828          * stop deletions from the lists.  This will help because
1829          * a page can get added to the free list while we are looking
1830          * at the cache list, then another page could be added to the cache
1831          * list allowing the page on the free list to be removed as we
1832          * move from looking at the cache list to the free list. This
1833          * could happen over and over. We would never find the page
1834          * we have accounted for.
1835          *
1836          * Noreloc pages are a subset of the global (relocatable) page pool.
1837          * They are not tracked separately in the pcf bins, so it is
1838          * impossible to know when doing pcf accounting if the available
1839          * page(s) are noreloc pages or not. When looking for a noreloc page
1840          * it is quite easy to end up here even if the global (relocatable)
1841          * page pool has plenty of free pages but the noreloc pool is empty.
1842          *
1843          * When the noreloc pool is empty (or low), additional noreloc pages
1844          * are created by converting pages from the global page pool. This
1845          * process will stall during pcf accounting if the pcf bins are
1846          * already locked. Such is the case when a noreloc allocation is
1847          * looping here in page_create_get_something waiting for more noreloc
1848          * pages to appear.
1849          *
1850          * Short of adding a new field to the pcf bins to accurately track
1851          * the number of free noreloc pages, we instead do not grab the
1852          * pcgs_lock, do not set the pcf blocks and do not timeout when
1853          * allocating a noreloc page. This allows noreloc allocations to
1854          * loop without blocking global page pool allocations.
1855          *
1856          * NOTE: the behaviour of page_create_get_something has not changed
1857          * for the case of global page pool allocations.
1858          */
1859 
1860         flags &= ~PG_MATCH_COLOR;
1861         locked = 0;
1862 #if defined(__i386) || defined(__amd64)
1863         flags = page_create_update_flags_x86(flags);
1864 #endif
1865 
1866         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1867 
1868         for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1869                 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1870                     flags, lgrp);
1871                 if (pp == NULL) {
1872                         pp = page_get_cachelist(vp, off, seg, vaddr,
1873                             flags, lgrp);
1874                 }
1875                 if (pp == NULL) {
1876                         /*
1877                          * Serialize.  Don't fight with other pcgs().
1878                          */
1879                         if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1880                                 mutex_enter(&pcgs_lock);
1881                                 VM_STAT_ADD(pcgs_locked);
1882                                 locked = 1;
1883                                 p = pcf;
1884                                 for (i = 0; i < pcf_fanout; i++) {
1885                                         mutex_enter(&p->pcf_lock);
1886                                         ASSERT(p->pcf_block == 0);
1887                                         p->pcf_block = 1;
1888                                         p->pcf_reserve = p->pcf_count;
1889                                         p->pcf_count = 0;
1890                                         mutex_exit(&p->pcf_lock);
1891                                         p++;
1892                                 }
1893                                 freemem = 0;
1894                         }
1895 
1896                         if (count) {
1897                                 /*
1898                                  * Since page_free() puts pages on
1899                                  * a list then accounts for it, we
1900                                  * just have to wait for page_free()
1901                                  * to unlock any page it was working
1902                                  * with. The page_lock()-page_reclaim()
1903                                  * path falls in the same boat.
1904                                  *
1905                                  * We don't need to check on the
1906                                  * PG_WAIT flag, we have already
1907                                  * accounted for the page we are
1908                                  * looking for in page_create_va().
1909                                  *
1910                                  * We just wait a moment to let any
1911                                  * locked pages on the lists free up,
1912                                  * then continue around and try again.
1913                                  *
1914                                  * Will be awakened by set_freemem().
1915                                  */
1916                                 mutex_enter(&pcgs_wait_lock);
1917                                 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1918                                 mutex_exit(&pcgs_wait_lock);
1919                         }
1920                 } else {
1921 #ifdef VM_STATS
1922                         if (count >= PCGS_TRIES) {
1923                                 VM_STAT_ADD(pcgs_too_many);
1924                         } else {
1925                                 VM_STAT_ADD(pcgs_counts[count]);
1926                         }
1927 #endif
1928                         if (locked) {
1929                                 pcgs_unblock();
1930                                 mutex_exit(&pcgs_lock);
1931                         }
1932                         if (cagelocked)
1933                                 mutex_exit(&pcgs_cagelock);
1934                         return (pp);
1935                 }
1936         }
1937         /*
1938          * we go down holding the pcf locks.
1939          */
1940         panic("no %spage found %d",
1941             ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1942         /*NOTREACHED*/
1943 }
1944 
1945 /*
1946  * Create enough pages for "bytes" worth of data starting at
1947  * "off" in "vp".
1948  *
1949  *      Where flags must be one of:
1950  *
1951  *              PG_EXCL:        Exclusive create (fail if any page already
1952  *                              exists in the page cache) which does not
1953  *                              wait for memory to become available.
1954  *
1955  *              PG_WAIT:        Non-exclusive create which can wait for
1956  *                              memory to become available.
1957  *
1958  *              PG_PHYSCONTIG:  Allocate physically contiguous pages.
1959  *                              (Not Supported)
1960  *
1961  * A doubly linked list of pages is returned to the caller.  Each page
1962  * on the list holds the "exclusive" lock (p_selock) and the i/o
1963  * lock (p_iolock).
1964  *
1965  * Unable to change the parameters to page_create() in a minor release,
1966  * we renamed page_create() to page_create_va(), changed all known calls
1967  * from page_create() to page_create_va(), and created this wrapper.
1968  *
1969  * Upon a major release, we should break compatibility by deleting this
1970  * wrapper, and replacing all the strings "page_create_va", with "page_create".
1971  *
1972  * NOTE: There is a copy of this interface as page_create_io() in
1973  *       i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1974  *       there.
1975  */
1976 page_t *
1977 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1978 {
1979         caddr_t random_vaddr;
1980         struct seg kseg;
1981 
1982 #ifdef DEBUG
1983         cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1984             (void *)caller());
1985 #endif
1986 
1987         random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1988             (uintptr_t)(off >> PAGESHIFT));
1989         kseg.s_as = &kas;
1990 
1991         return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1992 }
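
     /*
      * New code should call page_create_va() directly with a meaningful
      * seg/vaddr pair rather than going through this deprecated wrapper.
      * A hypothetical caller sketch (kvseg used here only for
      * illustration):
      *
      *      pp = page_create_va(vp, off, PAGESIZE, PG_WAIT | PG_EXCL,
      *          &kvseg, vaddr);
      */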
1993 
1994 #ifdef DEBUG
1995 uint32_t pg_alloc_pgs_mtbf = 0;
1996 #endif
1997 
1998 /*
1999  * Used for large page support. It will attempt to allocate
2000  * one or more large pages off the freelist.
2001  *
2002  * Returns non-zero on failure.
2003  */
2004 int
2005 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
2006     page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
2007 {
2008         pgcnt_t         npgs, curnpgs, totpgs;
2009         size_t          pgsz;
2010         page_t          *pplist = NULL, *pp;
2011         int             err = 0;
2012         lgrp_t          *lgrp;
2013 
2014         ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2015         ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2016 
2017         /*
2018          * Check if the system heavily prefers local large pages over remote
2019          * on systems with multiple lgroups.
2020          */
2021         if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2022                 pgflags = PG_LOCAL;
2023         }
2024 
2025         VM_STAT_ADD(alloc_pages[0]);
2026 
2027 #ifdef DEBUG
2028         if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2029                 return (ENOMEM);
2030         }
2031 #endif
2032 
2033         /*
2034          * Exactly one of basepp and ppa must be non-NULL;
2035          * the other must be NULL.
2036          */
2037         ASSERT(basepp != NULL || ppa != NULL);
2038         ASSERT(basepp == NULL || ppa == NULL);
2039 
2040 #if defined(__i386) || defined(__amd64)
2041         while (page_chk_freelist(szc) == 0) {
2042                 VM_STAT_ADD(alloc_pages[8]);
2043                 if (anypgsz == 0 || --szc == 0)
2044                         return (ENOMEM);
2045         }
2046 #endif
2047 
2048         pgsz = page_get_pagesize(szc);
2049         totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2050 
2051         ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2052 
2053         (void) page_create_wait(npgs, PG_WAIT);
2054 
2055         while (npgs && szc) {
2056                 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2057                 if (pgflags == PG_LOCAL) {
2058                         pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2059                             pgflags, lgrp);
2060                         if (pp == NULL) {
2061                                 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2062                                     0, lgrp);
2063                         }
2064                 } else {
2065                         pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2066                             0, lgrp);
2067                 }
2068                 if (pp != NULL) {
2069                         VM_STAT_ADD(alloc_pages[1]);
2070                         page_list_concat(&pplist, &pp);
2071                         ASSERT(npgs >= curnpgs);
2072                         npgs -= curnpgs;
2073                 } else if (anypgsz) {
2074                         VM_STAT_ADD(alloc_pages[2]);
2075                         szc--;
2076                         pgsz = page_get_pagesize(szc);
2077                         curnpgs = pgsz >> PAGESHIFT;
2078                 } else {
2079                         VM_STAT_ADD(alloc_pages[3]);
2080                         ASSERT(npgs == totpgs);
2081                         page_create_putback(npgs);
2082                         return (ENOMEM);
2083                 }
2084         }
2085         if (szc == 0) {
2086                 VM_STAT_ADD(alloc_pages[4]);
2087                 ASSERT(npgs != 0);
2088                 page_create_putback(npgs);
2089                 err = ENOMEM;
2090         } else if (basepp != NULL) {
2091                 ASSERT(npgs == 0);
2092                 ASSERT(ppa == NULL);
2093                 *basepp = pplist;
2094         }
2095 
2096         npgs = totpgs - npgs;
2097         pp = pplist;
2098 
2099         /*
2100          * Clear the free and age bits. Also, if we were passed a ppa,
2101          * fill it in with all the constituent pages from the large page.
2102          * But if we failed to allocate all the pages, just free what we got.
2103          */
2104         while (npgs != 0) {
2105                 ASSERT(PP_ISFREE(pp));
2106                 ASSERT(PP_ISAGED(pp));
2107                 if (ppa != NULL || err != 0) {
2108                         if (err == 0) {
2109                                 VM_STAT_ADD(alloc_pages[5]);
2110                                 PP_CLRFREE(pp);
2111                                 PP_CLRAGED(pp);
2112                                 page_sub(&pplist, pp);
2113                                 *ppa++ = pp;
2114                                 npgs--;
2115                         } else {
2116                                 VM_STAT_ADD(alloc_pages[6]);
2117                                 ASSERT(pp->p_szc != 0);
2118                                 curnpgs = page_get_pagecnt(pp->p_szc);
2119                                 page_list_break(&pp, &pplist, curnpgs);
2120                                 page_list_add_pages(pp, 0);
2121                                 page_create_putback(curnpgs);
2122                                 ASSERT(npgs >= curnpgs);
2123                                 npgs -= curnpgs;
2124                         }
2125                         pp = pplist;
2126                 } else {
2127                         VM_STAT_ADD(alloc_pages[7]);
2128                         PP_CLRFREE(pp);
2129                         PP_CLRAGED(pp);
2130                         pp = pp->p_next;
2131                         npgs--;
2132                 }
2133         }
2134         return (err);
2135 }
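
     /*
      * A hypothetical caller sketch: request one szc-sized large page and
      * receive its constituent pages through ppa[] (which must hold at
      * least page_get_pagecnt(szc) entries), allowing fallback to smaller
      * sizes because anypgsz is non-zero:
      *
      *      if (page_alloc_pages(vp, seg, addr, NULL, ppa, szc, 1, 0) != 0)
      *              ... fall back to base-size pages ...
      */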
2136 
2137 /*
2138  * Get a single large page off of the freelists, and set it up for use.
2139  * Number of bytes requested must be a supported page size.
2140  *
2141  * Note that this call may fail even if there is sufficient
2142  * memory available or PG_WAIT is set, so the caller must
2143  * be willing to fall back on page_create_va(), block and retry,
2144  * or fail the requester.
2145  */
2146 page_t *
2147 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2148     struct seg *seg, caddr_t vaddr, void *arg)
2149 {
2150         pgcnt_t         npages;
2151         page_t          *pp;
2152         page_t          *rootpp;
2153         lgrp_t          *lgrp;
2154         lgrp_id_t       *lgrpid = (lgrp_id_t *)arg;
2155 
2156         ASSERT(vp != NULL);
2157 
2158         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2159             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2160         /* but no others */
2161 
2162         ASSERT((flags & PG_EXCL) == PG_EXCL);
2163 
2164         npages = btop(bytes);
2165 
2166         if (!kcage_on || panicstr) {
2167                 /*
2168                  * Cage is OFF, or we are single threaded in
2169                  * panic, so make everything a RELOC request.
2170                  */
2171                 flags &= ~PG_NORELOC;
2172         }
2173 
2174         /*
2175          * Make sure there's adequate physical memory available.
2176          * Note: PG_WAIT is ignored here.
2177          */
2178         if (freemem <= throttlefree + npages) {
2179                 VM_STAT_ADD(page_create_large_cnt[1]);
2180                 return (NULL);
2181         }
2182 
2183         /*
2184          * If cage is on, dampen draw from cage when available
2185          * cage space is low.
2186          */
2187         if ((flags & (PG_NORELOC | PG_WAIT)) ==  (PG_NORELOC | PG_WAIT) &&
2188             kcage_freemem < kcage_throttlefree + npages) {
2189 
2190                 /*
2191                  * The cage is on, the caller wants PG_NORELOC
2192                  * pages and available cage memory is very low.
2193                  * Call kcage_create_throttle() to attempt to
2194                  * control demand on the cage.
2195                  */
2196                 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2197                         VM_STAT_ADD(page_create_large_cnt[2]);
2198                         return (NULL);
2199                 }
2200         }
2201 
2202         if (!pcf_decrement_bucket(npages) &&
2203             !pcf_decrement_multiple(NULL, npages, 1)) {
2204                 VM_STAT_ADD(page_create_large_cnt[4]);
2205                 return (NULL);
2206         }
2207 
2208         /*
2209          * This is where this function behaves fundamentally differently
2210          * than page_create_va(); since we're intending to map the page
2211          * with a single TTE, we have to get it as a physically contiguous
2212          * hardware pagesize chunk.  If we can't, we fail.
2213          */
2214         if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2215             LGRP_EXISTS(lgrp_table[*lgrpid]))
2216                 lgrp = lgrp_table[*lgrpid];
2217         else
2218                 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2219 
2220         if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2221             bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2222                 page_create_putback(npages);
2223                 VM_STAT_ADD(page_create_large_cnt[5]);
2224                 return (NULL);
2225         }
2226 
2227         /*
2228          * If we got the page with the wrong mtype, give it back; this is a
2229          * workaround for CR 6249718.  When CR 6249718 is fixed we will never
2230          * get inside the "if" and the workaround becomes just a nop.
2231          */
2232         if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2233                 page_list_add_pages(rootpp, 0);
2234                 page_create_putback(npages);
2235                 VM_STAT_ADD(page_create_large_cnt[6]);
2236                 return (NULL);
2237         }
2238 
2239         /*
2240          * If satisfying this request has left us with too little
2241          * memory, start the wheels turning to get some back.  The
2242          * first clause of the test prevents waking up the pageout
2243          * daemon in situations where it would decide that there's
2244          * nothing to do.
2245          */
2246         if (nscan < desscan && freemem < minfree) {
2247                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2248                     "pageout_cv_signal:freemem %ld", freemem);
2249                 WAKE_PAGEOUT_SCANNER();
2250         }
2251 
2252         pp = rootpp;
2253         while (npages--) {
2254                 ASSERT(PAGE_EXCL(pp));
2255                 ASSERT(pp->p_vnode == NULL);
2256                 ASSERT(!hat_page_is_mapped(pp));
2257                 PP_CLRFREE(pp);
2258                 PP_CLRAGED(pp);
2259                 if (!page_hashin(pp, vp, off, NULL))
2260                         panic("page_create_large: hashin failed: page %p",
2261                             (void *)pp);
2262                 page_io_lock(pp);
2263                 off += PAGESIZE;
2264                 pp = pp->p_next;
2265         }
2266 
2267         VM_STAT_ADD(page_create_large_cnt[0]);
2268         return (rootpp);
2269 }
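
     /*
      * A minimal caller sketch (hypothetical), honoring the fallback
      * requirement described above: if the large page cannot be had, fall
      * back to building the range out of PAGESIZE pages:
      *
      *      if ((pp = page_create_va_large(vp, off, pgsz, flags, seg,
      *          vaddr, NULL)) == NULL)
      *              pp = page_create_va(vp, off, pgsz, flags, seg, vaddr);
      */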
2270 
2271 page_t *
2272 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2273     struct seg *seg, caddr_t vaddr)
2274 {
2275         page_t          *plist = NULL;
2276         pgcnt_t         npages;
2277         pgcnt_t         found_on_free = 0;
2278         pgcnt_t         pages_req;
2279         page_t          *npp = NULL;
2280         struct pcf      *p;
2281         lgrp_t          *lgrp;
2282 
2283         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2284             "page_create_start:vp %p off %llx bytes %lu flags %x",
2285             vp, off, bytes, flags);
2286 
2287         ASSERT(bytes != 0 && vp != NULL);
2288 
2289         if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2290                 panic("page_create: invalid flags");
2291                 /*NOTREACHED*/
2292         }
2293         ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2294             PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2295             /* but no others */
2296 
2297         pages_req = npages = btopr(bytes);
2298         /*
2299          * Try to see whether the request is too large to *ever* be
2300          * satisfied, in order to prevent deadlock.  We arbitrarily
2301          * decide to limit maximum size requests to max_page_get.
2302          */
2303         if (npages >= max_page_get) {
2304                 if ((flags & PG_WAIT) == 0) {
2305                         TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2306                             "page_create_toobig:vp %p off %llx npages "
2307                             "%lu max_page_get %lu",
2308                             vp, off, npages, max_page_get);
2309                         return (NULL);
2310                 } else {
2311                         cmn_err(CE_WARN,
2312                             "Request for too much kernel memory "
2313                             "(%lu bytes), will hang forever", bytes);
2314                         for (;;)
2315                                 delay(1000000000);
2316                 }
2317         }
2318 
2319         if (!kcage_on || panicstr) {
2320                 /*
2321                  * Cage is OFF, or we are single threaded in
2322                  * panic, so make everything a RELOC request.
2323                  */
2324                 flags &= ~PG_NORELOC;
2325         }
2326 
2327         if (freemem <= throttlefree + npages)
2328                 if (!page_create_throttle(npages, flags))
2329                         return (NULL);
2330 
2331         /*
2332          * If cage is on, dampen draw from cage when available
2333          * cage space is low.
2334          */
2335         if ((flags & PG_NORELOC) &&
2336             kcage_freemem < kcage_throttlefree + npages) {
2337 
2338                 /*
2339                  * The cage is on, the caller wants PG_NORELOC
2340                  * pages and available cage memory is very low.
2341                  * Call kcage_create_throttle() to attempt to
2342                  * control demand on the cage.
2343                  */
2344                 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2345                         return (NULL);
2346         }
2347 
2348         VM_STAT_ADD(page_create_cnt[0]);
2349 
2350         if (!pcf_decrement_bucket(npages)) {
2351                 /*
2352                  * Have to look harder.  If npages is greater than
2353                  * one, then we might have to coalesce the counters.
2354                  *
2355                  * Go wait.  We come back having accounted
2356                  * for the memory.
2357                  */
2358                 VM_STAT_ADD(page_create_cnt[1]);
2359                 if (!page_create_wait(npages, flags)) {
2360                         VM_STAT_ADD(page_create_cnt[2]);
2361                         return (NULL);
2362                 }
2363         }
2364 
2365         TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2366             "page_create_success:vp %p off %llx", vp, off);
2367 
2368         /*
2369          * If satisfying this request has left us with too little
2370          * memory, start the wheels turning to get some back.  The
2371          * first clause of the test prevents waking up the pageout
2372          * daemon in situations where it would decide that there's
2373          * nothing to do.
2374          */
2375         if (nscan < desscan && freemem < minfree) {
2376                 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2377                     "pageout_cv_signal:freemem %ld", freemem);
2378                 WAKE_PAGEOUT_SCANNER();
2379         }
2380 
2381         /*
2382          * Loop around collecting the requested number of pages.
2383          * Most of the time, we have to `create' a new page. With
2384          * this in mind, pull the page off the free list before
2385          * getting the hash lock.  This will minimize the hash
2386          * lock hold time, nesting, and the like.  If it turns
2387          * out we don't need the page, we put it back at the end.
2388          */
2389         while (npages--) {
2390                 page_t          *pp;
2391                 kmutex_t        *phm = NULL;
2392                 ulong_t         index;
2393 
2394                 index = PAGE_HASH_FUNC(vp, off);
2395 top:
2396                 ASSERT(phm == NULL);
2397                 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2398                 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2399 
2400                 if (npp == NULL) {
2401                         /*
2402                          * Try to get a page from the freelist (ie,
2403                          * a page with no [vp, off] tag).  If that
2404                          * fails, use the cachelist.
2405                          *
2406                          * During the first attempt at both the free
2407                          * and cache lists we try for the correct color.
2408                          */
2409                         /*
2410                          * XXXX-how do we deal with virtually indexed
2411                          * caches and colors?
2412                          */
2413                         VM_STAT_ADD(page_create_cnt[4]);
2414                         /*
2415                          * Get the lgroup to allocate the next page of
2416                          * shared memory from, and use it to specify where
2417                          * to allocate the physical memory.
2418                          */
2419                         lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2420                         npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2421                             flags | PG_MATCH_COLOR, lgrp);
2422                         if (npp == NULL) {
2423                                 npp = page_get_cachelist(vp, off, seg,
2424                                     vaddr, flags | PG_MATCH_COLOR, lgrp);
2425                                 if (npp == NULL) {
2426                                         npp = page_create_get_something(vp,
2427                                             off, seg, vaddr,
2428                                             flags & ~PG_MATCH_COLOR);
2429                                 }
2430 
2431                                 if (PP_ISAGED(npp) == 0) {
2432                                         /*
2433                                          * Since this page came from the
2434                                          * cachelist, we must destroy the
2435                                          * old vnode association.
2436                                          */
2437                                         page_hashout(npp, NULL);
2438                                 }
2439                         }
2440                 }
2441 
2442                 /*
2443                  * We own this page!
2444                  */
2445                 ASSERT(PAGE_EXCL(npp));
2446                 ASSERT(npp->p_vnode == NULL);
2447                 ASSERT(!hat_page_is_mapped(npp));
2448                 PP_CLRFREE(npp);
2449                 PP_CLRAGED(npp);
2450 
2451                 /*
2452                  * Here we have a page in our hot little mitts and are
2453                  * just waiting to stuff it on the appropriate lists.
2454                  * Get the mutex and check to see if it really does
2455                  * not exist.
2456                  */
2457                 phm = PAGE_HASH_MUTEX(index);
2458                 mutex_enter(phm);
2459                 pp = page_hash_search(index, vp, off);
2460                 if (pp == NULL) {
2461                         VM_STAT_ADD(page_create_new);
2462                         pp = npp;
2463                         npp = NULL;
2464                         if (!page_hashin(pp, vp, off, phm)) {
2465                                 /*
2466                                  * Since we hold the page hash mutex and
2467                                  * just searched for this page, page_hashin
2468                                  * had better not fail.  If it does, that
2469                                  * means some thread did not follow the
2470                                  * page hash mutex rules.  Panic now and
2471                                  * get it over with.  As usual, go down
2472                                  * holding all the locks.
2473                                  */
2474                                 ASSERT(MUTEX_HELD(phm));
2475                                 panic("page_create: "
2476                                     "hashin failed %p %p %llx %p",
2477                                     (void *)pp, (void *)vp, off, (void *)phm);
2478                                 /*NOTREACHED*/
2479                         }
2480                         ASSERT(MUTEX_HELD(phm));
2481                         mutex_exit(phm);
2482                         phm = NULL;
2483 
2484                         /*
2485                          * Hat layer locking need not be done to set
2486                          * the following bits since the page is not hashed
2487                          * and was on the free list (i.e., had no mappings).
2488                          *
2489                          * Set the reference bit to protect
2490                          * against immediate pageout
2491                          *
2492                          * XXXmh modify freelist code to set reference
2493                          * bit so we don't have to do it here.
2494                          */
2495                         page_set_props(pp, P_REF);
2496                         found_on_free++;
2497                 } else {
2498                         VM_STAT_ADD(page_create_exists);
2499                         if (flags & PG_EXCL) {
2500                                 /*
2501                                  * Found an existing page, and the caller
2502                                  * wanted all new pages.  Undo all of the work
2503                                  * we have done.
2504                                  */
2505                                 mutex_exit(phm);
2506                                 phm = NULL;
2507                                 while (plist != NULL) {
2508                                         pp = plist;
2509                                         page_sub(&plist, pp);
2510                                         page_io_unlock(pp);
2511                                         /* large pages should not end up here */
2512                                         ASSERT(pp->p_szc == 0);
2513                                         /*LINTED: constant in conditional ctx*/
2514                                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
2515                                 }
2516                                 VM_STAT_ADD(page_create_found_one);
2517                                 goto fail;
2518                         }
2519                         ASSERT(flags & PG_WAIT);
2520                         if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2521                                 /*
2522                                  * Start all over again if we blocked trying
2523                                  * to lock the page.
2524                                  */
2525                                 mutex_exit(phm);
2526                                 VM_STAT_ADD(page_create_page_lock_failed);
2527                                 phm = NULL;
2528                                 goto top;
2529                         }
2530                         mutex_exit(phm);
2531                         phm = NULL;
2532 
2533                         if (PP_ISFREE(pp)) {
2534                                 ASSERT(PP_ISAGED(pp) == 0);
2535                                 VM_STAT_ADD(pagecnt.pc_get_cache);
2536                                 page_list_sub(pp, PG_CACHE_LIST);
2537                                 PP_CLRFREE(pp);
2538                                 found_on_free++;
2539                         }
2540                 }
2541 
2542                 /*
2543                  * Got a page!  It is locked.  Acquire the i/o
2544                  * lock since we are going to use the p_next and
2545                  * p_prev fields to link the requested pages together.
2546                  */
2547                 page_io_lock(pp);
2548                 page_add(&plist, pp);
2549                 plist = plist->p_next;
2550                 off += PAGESIZE;
2551                 vaddr += PAGESIZE;
2552         }
2553 
2554         ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2555 fail:
2556         if (npp != NULL) {
2557                 /*
2558                  * Did not need this page after all.
2559                  * Put it back on the free list.
2560                  */
2561                 VM_STAT_ADD(page_create_putbacks);
2562                 PP_SETFREE(npp);
2563                 PP_SETAGED(npp);
2564                 npp->p_offset = (u_offset_t)-1;
2565                 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2566                 page_unlock(npp);
2567 
2568         }
2569 
2570         ASSERT(pages_req >= found_on_free);
2571 
2572         {
2573                 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2574 
2575                 if (overshoot) {
2576                         VM_STAT_ADD(page_create_overshoot);
2577                         p = &pcf[PCF_INDEX()];
2578                         mutex_enter(&p->pcf_lock);
2579                         if (p->pcf_block) {
2580                                 p->pcf_reserve += overshoot;
2581                         } else {
2582                                 p->pcf_count += overshoot;
2583                                 if (p->pcf_wait) {
2584                                         mutex_enter(&new_freemem_lock);
2585                                         if (freemem_wait) {
2586                                                 cv_signal(&freemem_cv);
2587                                                 p->pcf_wait--;
2588                                         } else {
2589                                                 p->pcf_wait = 0;
2590                                         }
2591                                         mutex_exit(&new_freemem_lock);
2592                                 }
2593                         }
2594                         mutex_exit(&p->pcf_lock);
2595                         /* freemem is approximate, so this test OK */
2596                         if (!p->pcf_block)
2597                                 freemem += overshoot;
2598                 }
2599         }
2600 
2601         return (plist);
2602 }
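
     /*
      * A consumer sketch (hypothetical): each page on the returned list is
      * held with both the exclusive lock and the i/o lock, so a typical
      * caller peels pages off, initializes or starts i/o on them, and then
      * drops both locks:
      *
      *      while (plist != NULL) {
      *              pp = plist;
      *              page_sub(&plist, pp);
      *              ... fill in or start i/o on pp ...
      *              page_io_unlock(pp);
      *              page_unlock(pp);
      *      }
      */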
2603 
2604 /*
2605  * One or more constituent pages of this large page have been marked
2606  * toxic. Simply demote the large page to PAGESIZE pages and let
2607  * page_free() handle it. This routine should only be called by
2608  * the large page free routines (page_free_pages() and page_destroy_pages()).
2609  * All pages are locked SE_EXCL and have already been marked free.
2610  */
2611 static void
2612 page_free_toxic_pages(page_t *rootpp)
2613 {
2614         page_t  *tpp;
2615         pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2616         uint_t  szc = rootpp->p_szc;
2617 
2618         for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2619                 ASSERT(tpp->p_szc == szc);
2620                 ASSERT((PAGE_EXCL(tpp) &&
2621                     !page_iolock_assert(tpp)) || panicstr);
2622                 tpp->p_szc = 0;
2623         }
2624 
2625         while (rootpp != NULL) {
2626                 tpp = rootpp;
2627                 page_sub(&rootpp, tpp);
2628                 ASSERT(PP_ISFREE(tpp));
2629                 PP_CLRFREE(tpp);
2630                 page_free(tpp, 1);
2631         }
2632 }
2633 
2634 /*
2635  * Put page on the "free" list.
2636  * The free list is really two lists maintained by
2637  * the PSM of whatever machine we happen to be on.
2638  */
2639 void
2640 page_free(page_t *pp, int dontneed)
2641 {
2642         struct pcf      *p;
2643         uint_t          pcf_index;
2644 
2645         ASSERT((PAGE_EXCL(pp) &&
2646             !page_iolock_assert(pp)) || panicstr);
2647 
2648         if (PP_ISFREE(pp)) {
2649                 panic("page_free: page %p is free", (void *)pp);
2650         }
2651 
2652         if (pp->p_szc != 0) {
2653                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2654                     PP_ISKAS(pp)) {
2655                         panic("page_free: anon or kernel "
2656                             "or no vnode large page %p", (void *)pp);
2657                 }
2658                 page_demote_vp_pages(pp);
2659                 ASSERT(pp->p_szc == 0);
2660         }
2661 
2662         /*
2663          * The page_struct_lock need not be acquired to examine these
2664          * fields since the page has an "exclusive" lock.
2665          */
2666         if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2667             pp->p_slckcnt != 0) {
2668                 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2669                     "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2670                     pp->p_cowcnt, pp->p_slckcnt);
2671                 /*NOTREACHED*/
2672         }
2673 
2674         ASSERT(!hat_page_getshare(pp));
2675 
2676         PP_SETFREE(pp);
2677         ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2678             !hat_ismod(pp));
2679         page_clr_all_props(pp);
2680         ASSERT(!hat_page_getshare(pp));
2681 
2682         /*
2683          * Now we put the page on the appropriate free list.
2684          * A page with no identity goes on the free list proper, while
2685          * a page still associated with a vnode goes on the cache list
2686          * so that its identity (and contents) are preserved.
2687          */
2688         if (pp->p_vnode == NULL) {
2689                 /*
2690                  * Page has no identity, put it on the free list.
2691                  */
2692                 PP_SETAGED(pp);
2693                 pp->p_offset = (u_offset_t)-1;
2694                 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2695                 VM_STAT_ADD(pagecnt.pc_free_free);
2696                 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2697                     "page_free_free:pp %p", pp);
2698         } else {
2699                 PP_CLRAGED(pp);
2700 
2701                 if (!dontneed) {
2702                         /* move it to the tail of the list */
2703                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2704 
2705                         VM_STAT_ADD(pagecnt.pc_free_cache);
2706                         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2707                             "page_free_cache_tail:pp %p", pp);
2708                 } else {
2709                         page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2710 
2711                         VM_STAT_ADD(pagecnt.pc_free_dontneed);
2712                         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2713                             "page_free_cache_head:pp %p", pp);
2714                 }
2715         }
2716         page_unlock(pp);
2717 
2718         /*
2719          * Now do the `freemem' accounting.
2720          */
2721         pcf_index = PCF_INDEX();
2722         p = &pcf[pcf_index];
2723 
2724         mutex_enter(&p->pcf_lock);
2725         if (p->pcf_block) {
2726                 p->pcf_reserve += 1;
2727         } else {
2728                 p->pcf_count += 1;
2729                 if (p->pcf_wait) {
2730                         mutex_enter(&new_freemem_lock);
2731                         /*
2732                          * Check to see if some other thread
2733                          * is actually waiting.  Another bucket
2734                          * may have woken it up by now.  If there
2735                          * are no waiters, then set our pcf_wait
2736                          * count to zero to avoid coming in here
2737                          * next time.  Also, since only one page
2738                          * was put on the free list, just wake
2739                          * up one waiter.
2740                          */
2741                         if (freemem_wait) {
2742                                 cv_signal(&freemem_cv);
2743                                 p->pcf_wait--;
2744                         } else {
2745                                 p->pcf_wait = 0;
2746                         }
2747                         mutex_exit(&new_freemem_lock);
2748                 }
2749         }
2750         mutex_exit(&p->pcf_lock);
2751 
2752         /* freemem is approximate, so this test OK */
2753         if (!p->pcf_block)
2754                 freemem += 1;
2755 }
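
     /*
      * A minimal caller sketch (hypothetical): pp must be held SE_EXCL with
      * its i/o lock not held, exactly as asserted at the top of page_free():
      *
      *      ASSERT(PAGE_EXCL(pp) && !page_iolock_assert(pp));
      *      page_free(pp, 0);
      */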
2756 
2757 /*
2758  * Put page on the "free" list during initial startup.
2759  * This happens during initial single threaded execution.
2760  */
2761 void
2762 page_free_at_startup(page_t *pp)
2763 {
2764         struct pcf      *p;
2765         uint_t          pcf_index;
2766 
2767         page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2768         VM_STAT_ADD(pagecnt.pc_free_free);
2769 
2770         /*
2771          * Now do the `freemem' accounting.
2772          */
2773         pcf_index = PCF_INDEX();
2774         p = &pcf[pcf_index];
2775 
2776         ASSERT(p->pcf_block == 0);
2777         ASSERT(p->pcf_wait == 0);
2778         p->pcf_count += 1;
2779 
2780         /* freemem is approximate, so this is OK */
2781         freemem += 1;
2782 }
2783 
2784 void
2785 page_free_pages(page_t *pp)
2786 {
2787         page_t  *tpp, *rootpp = NULL;
2788         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2789         pgcnt_t i;
2790         uint_t  szc = pp->p_szc;
2791 
2792         VM_STAT_ADD(pagecnt.pc_free_pages);
2793         TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2794             "page_free_free:pp %p", pp);
2795 
2796         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2797         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2798                 panic("page_free_pages: not root page %p", (void *)pp);
2799                 /*NOTREACHED*/
2800         }
2801 
2802         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2803                 ASSERT((PAGE_EXCL(tpp) &&
2804                     !page_iolock_assert(tpp)) || panicstr);
2805                 if (PP_ISFREE(tpp)) {
2806                         panic("page_free_pages: page %p is free", (void *)tpp);
2807                         /*NOTREACHED*/
2808                 }
2809                 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2810                     tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2811                         panic("page_free_pages %p", (void *)tpp);
2812                         /*NOTREACHED*/
2813                 }
2814 
2815                 ASSERT(!hat_page_getshare(tpp));
2816                 ASSERT(tpp->p_vnode == NULL);
2817                 ASSERT(tpp->p_szc == szc);
2818 
2819                 PP_SETFREE(tpp);
2820                 page_clr_all_props(tpp);
2821                 PP_SETAGED(tpp);
2822                 tpp->p_offset = (u_offset_t)-1;
2823                 ASSERT(tpp->p_next == tpp);
2824                 ASSERT(tpp->p_prev == tpp);
2825                 page_list_concat(&rootpp, &tpp);
2826         }
2827         ASSERT(rootpp == pp);
2828 
2829         page_list_add_pages(rootpp, 0);
2830         page_create_putback(pgcnt);
2831 }
2832 
2833 int free_pages = 1;
2834 
2835 /*
2836  * This routine attempts to return pages to the cachelist via page_release().
2837  * It does not *have* to be successful in all cases, since the pageout scanner
2838  * will catch any pages it misses.  It does need to be fast and not introduce
2839  * too much overhead.
2840  *
2841  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2842  * don't lock and retry.  This is ok, since the page scanner will eventually
2843  * find any page we miss in free_vp_pages().
2844  */
2845 void
2846 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2847 {
2848         page_t *pp;
2849         u_offset_t eoff;
2850         extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2851 
2852         eoff = off + len;
2853 
2854         if (free_pages == 0)
2855                 return;
2856         if (swap_in_range(vp, off, len))
2857                 return;
2858 
2859         for (; off < eoff; off += PAGESIZE) {
2860 
2861                 /*
2862                  * find the page using a fast, but inexact search. It'll be OK
2863                  * if a few pages slip through the cracks here.
2864                  */
2865                 pp = page_exists(vp, off);
2866 
2867                 /*
2868                  * If we didn't find the page (it may not exist), or the
2869                  * page is free, still looks in use (shared), or we can't
2870                  * lock it, just give up.
2871                  */
2872                 if (pp == NULL ||
2873                     PP_ISFREE(pp) ||
2874                     page_share_cnt(pp) > 0 ||
2875                     !page_trylock(pp, SE_EXCL))
2876                         continue;
2877 
2878                 /*
2879                  * Once we have locked pp, verify that it's still the
2880                  * correct page and not already free
2881                  */
2882                 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2883                 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2884                         page_unlock(pp);
2885                         continue;
2886                 }
2887 
2888                 /*
2889                  * try to release the page...
2890                  */
2891                 (void) page_release(pp, 1);
2892         }
2893 }
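
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a caller that has finished with a range of a vnode and wants to hand the
 * pages back early.  Misses are fine; the pageout scanner will eventually
 * find anything free_vp_pages() skips.  The function name is hypothetical.
 */
static void
example_done_with_range(vnode_t *vp, u_offset_t off, size_t len)
{
        free_vp_pages(vp, off, len);
}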
2894 
2895 /*
2896  * Reclaim the given page from the free list.
2897  * If pp is part of a large page, only the given constituent page is reclaimed
2898  * and the large page it belonged to will be demoted.  This can only happen
2899  * if the page is not on the cachelist.
2900  *
2901  * Returns 1 on success or 0 on failure.
2902  *
2903  * The page is unlocked if it can't be reclaimed (when freemem == 0).
2904  * If `lock' is non-null, it will be dropped and re-acquired if
2905  * the routine must wait while freemem is 0.
2906  *
2907  * As it turns out, boot_getpages() does this.  It picks a page,
2908  * based on where OBP mapped in some address, gets its pfn, searches
2909  * the memsegs, locks the page, then pulls it off the free list!
2910  */
2911 int
2912 page_reclaim(page_t *pp, kmutex_t *lock)
2913 {
2914         struct pcf      *p;
2915         struct cpu      *cpup;
2916         int             enough;
2917         uint_t          i;
2918 
2919         ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2920         ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2921 
2922         /*
2923          * If `freemem' is 0, we cannot reclaim this page from the
2924          * freelist, so release every lock we might hold: the page,
2925          * and the `lock' before blocking.
2926          *
2927          * The only way `freemem' can become 0 while there are pages
2928          * marked free (have their p->p_free bit set) is when the
2929          * system is low on memory and doing a page_create().  Since
2930          * page_create() decreases `freemem' by the requested amount up
2931          * front, it must be guaranteed to get every page it starts
2932          * acquiring.  So, we need to release this page, and let
2933          * page_create() have it.
2934          *
2935          * Since `freemem' being zero is not supposed to happen, just
2936          * use the usual hash stuff as a starting point.  If that bucket
2937          * is empty, then assume the worst, and start at the beginning
2938          * of the pcf array.  If we always start at the beginning
2939          * when acquiring more than one pcf lock, there won't be any
2940          * deadlock problems.
2941          */
2942 
2943         /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2944 
2945         if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2946                 pcf_acquire_all();
2947                 goto page_reclaim_nomem;
2948         }
2949 
2950         enough = pcf_decrement_bucket(1);
2951 
2952         if (!enough) {
2953                 VM_STAT_ADD(page_reclaim_zero);
2954                 /*
2955                  * Check again. It's possible that some other thread
2956                  * could have been right behind us, and added one
2957                  * to a list somewhere.  Acquire each of the pcf locks
2958                  * until we find a page.
2959                  */
2960                 p = pcf;
2961                 for (i = 0; i < pcf_fanout; i++) {
2962                         mutex_enter(&p->pcf_lock);
2963                         if (p->pcf_count >= 1) {
2964                                 p->pcf_count -= 1;
2965                                 /*
2966                                  * freemem is not protected by any lock. Thus,
2967                                  * we cannot have any assertion containing
2968                                  * freemem here.
2969                                  */
2970                                 freemem -= 1;
2971                                 enough = 1;
2972                                 break;
2973                         }
2974                         p++;
2975                 }
2976 
2977                 if (!enough) {
2978 page_reclaim_nomem:
2979                         /*
2980                          * We really can't have page `pp'.
2981                          * Time for the no-memory dance with
2982                          * page_free().  This is just like
2983                          * page_create_wait().  Plus the added
2984                          * attraction of releasing whatever mutex
2985                          * we were called with, held in `lock'.
2986                          * Page_unlock() will wake up any thread
2987                          * waiting around for this page.
2988                          */
2989                         if (lock) {
2990                                 VM_STAT_ADD(page_reclaim_zero_locked);
2991                                 mutex_exit(lock);
2992                         }
2993                         page_unlock(pp);
2994 
2995                         /*
2996                          * get this before we drop all the pcf locks.
2997                          */
2998                         mutex_enter(&new_freemem_lock);
2999 
3000                         p = pcf;
3001                         for (i = 0; i < pcf_fanout; i++) {
3002                                 p->pcf_wait++;
3003                                 mutex_exit(&p->pcf_lock);
3004                                 p++;
3005                         }
3006 
3007                         freemem_wait++;
3008                         cv_wait(&freemem_cv, &new_freemem_lock);
3009                         freemem_wait--;
3010 
3011                         mutex_exit(&new_freemem_lock);
3012 
3013                         if (lock) {
3014                                 mutex_enter(lock);
3015                         }
3016                         return (0);
3017                 }
3018 
3019                 /*
3020                  * The pcf accounting has been done and none of the
3021                  * pcf_wait flags have been set, so drop the locks
3022                  * and continue on.
3023                  */
3024                 while (p >= pcf) {
3025                         mutex_exit(&p->pcf_lock);
3026                         p--;
3027                 }
3028         }
3029 
3030 
3031         VM_STAT_ADD(pagecnt.pc_reclaim);
3032 
3033         /*
3034          * page_list_sub will handle the case where pp is a large page.
3035          * It's possible that the page was promoted while on the freelist.
3036          */
3037         if (PP_ISAGED(pp)) {
3038                 page_list_sub(pp, PG_FREE_LIST);
3039                 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3040                     "page_reclaim_free:pp %p", pp);
3041         } else {
3042                 page_list_sub(pp, PG_CACHE_LIST);
3043                 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3044                     "page_reclaim_cache:pp %p", pp);
3045         }
3046 
3047         /*
3048          * clear the p_free & p_age bits since this page is no longer
3049          * on the free list.  Notice that there is a brief time where
3050          * a page is marked as free, but is not on the list.
3051          *
3052          * Set the reference bit to protect against immediate pageout.
3053          */
3054         PP_CLRFREE(pp);
3055         PP_CLRAGED(pp);
3056         page_set_props(pp, P_REF);
3057 
3058         CPU_STATS_ENTER_K();
3059         cpup = CPU;     /* get cpup now that CPU cannot change */
3060         CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3061         CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3062         CPU_STATS_EXIT_K();
3063         ASSERT(pp->p_szc == 0);
3064 
3065         return (1);
3066 }
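
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * the typical caller pattern for page_reclaim().  The caller holds the page
 * SE_EXCL and knows it is marked free; on failure page_reclaim() has already
 * unlocked the page, so the caller must look the page up again and retry.
 * The function name is hypothetical.
 */
static int
example_reclaim_locked_page(page_t *pp)
{
        ASSERT(PAGE_EXCL(pp));
        ASSERT(PP_ISFREE(pp));

        if (!page_reclaim(pp, (kmutex_t *)NULL)) {
                /* the page lock was dropped by page_reclaim(); start over */
                return (0);
        }
        /* pp is off the free list, still held SE_EXCL, with P_REF set */
        return (1);
}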
3067 
3068 /*
3069  * Destroy identity of the page and put it back on
3070  * the page free list.  Assumes that the caller has
3071  * acquired the "exclusive" lock on the page.
3072  */
3073 void
3074 page_destroy(page_t *pp, int dontfree)
3075 {
3076         ASSERT((PAGE_EXCL(pp) &&
3077             !page_iolock_assert(pp)) || panicstr);
3078         ASSERT(pp->p_slckcnt == 0 || panicstr);
3079 
3080         if (pp->p_szc != 0) {
3081                 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3082                     PP_ISKAS(pp)) {
3083                         panic("page_destroy: anon or kernel or no vnode "
3084                             "large page %p", (void *)pp);
3085                 }
3086                 page_demote_vp_pages(pp);
3087                 ASSERT(pp->p_szc == 0);
3088         }
3089 
3090         TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3091 
3092         /*
3093          * Unload translations, if any, then hash out the
3094          * page to erase its identity.
3095          */
3096         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3097         page_hashout(pp, NULL);
3098 
3099         if (!dontfree) {
3100                 /*
3101                  * Acquire the "freemem_lock" for availrmem.
3102                  * The page_struct_lock need not be acquired for lckcnt
3103                  * and cowcnt since the page has an "exclusive" lock.
3104                  * We are doing a modified version of page_pp_unlock here.
3105                  */
3106                 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3107                         mutex_enter(&freemem_lock);
3108                         if (pp->p_lckcnt != 0) {
3109                                 availrmem++;
3110                                 pages_locked--;
3111                                 pp->p_lckcnt = 0;
3112                         }
3113                         if (pp->p_cowcnt != 0) {
3114                                 availrmem += pp->p_cowcnt;
3115                                 pages_locked -= pp->p_cowcnt;
3116                                 pp->p_cowcnt = 0;
3117                         }
3118                         mutex_exit(&freemem_lock);
3119                 }
3120                 /*
3121                  * Put the page on the "free" list.
3122                  */
3123                 page_free(pp, 0);
3124         }
3125 }
3126 
3127 void
3128 page_destroy_pages(page_t *pp)
3129 {
3130 
3131         page_t  *tpp, *rootpp = NULL;
3132         pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3133         pgcnt_t i, pglcks = 0;
3134         uint_t  szc = pp->p_szc;
3135 
3136         ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3137 
3138         VM_STAT_ADD(pagecnt.pc_destroy_pages);
3139 
3140         TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3141 
3142         if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3143                 panic("page_destroy_pages: not root page %p", (void *)pp);
3144                 /*NOTREACHED*/
3145         }
3146 
3147         for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3148                 ASSERT((PAGE_EXCL(tpp) &&
3149                     !page_iolock_assert(tpp)) || panicstr);
3150                 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3151                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3152                 page_hashout(tpp, NULL);
3153                 ASSERT(tpp->p_offset == (u_offset_t)-1);
3154                 if (tpp->p_lckcnt != 0) {
3155                         pglcks++;
3156                         tpp->p_lckcnt = 0;
3157                 } else if (tpp->p_cowcnt != 0) {
3158                         pglcks += tpp->p_cowcnt;
3159                         tpp->p_cowcnt = 0;
3160                 }
3161                 ASSERT(!hat_page_getshare(tpp));
3162                 ASSERT(tpp->p_vnode == NULL);
3163                 ASSERT(tpp->p_szc == szc);
3164 
3165                 PP_SETFREE(tpp);
3166                 page_clr_all_props(tpp);
3167                 PP_SETAGED(tpp);
3168                 ASSERT(tpp->p_next == tpp);
3169                 ASSERT(tpp->p_prev == tpp);
3170                 page_list_concat(&rootpp, &tpp);
3171         }
3172 
3173         ASSERT(rootpp == pp);
3174         if (pglcks != 0) {
3175                 mutex_enter(&freemem_lock);
3176                 availrmem += pglcks;
3177                 mutex_exit(&freemem_lock);
3178         }
3179 
3180         page_list_add_pages(rootpp, 0);
3181         page_create_putback(pgcnt);
3182 }
3183 
3184 /*
3185  * Similar to page_destroy(), but destroys pages which are
3186  * locked and known to be on the page free list.  Since
3187  * the page is known to be free and locked, no one can access
3188  * it.
3189  *
3190  * Also, the number of free pages does not change.
3191  */
3192 void
3193 page_destroy_free(page_t *pp)
3194 {
3195         ASSERT(PAGE_EXCL(pp));
3196         ASSERT(PP_ISFREE(pp));
3197         ASSERT(pp->p_vnode);
3198         ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3199         ASSERT(!hat_page_is_mapped(pp));
3200         ASSERT(PP_ISAGED(pp) == 0);
3201         ASSERT(pp->p_szc == 0);
3202 
3203         VM_STAT_ADD(pagecnt.pc_destroy_free);
3204         page_list_sub(pp, PG_CACHE_LIST);
3205 
3206         page_hashout(pp, NULL);
3207         ASSERT(pp->p_vnode == NULL);
3208         ASSERT(pp->p_offset == (u_offset_t)-1);
3209         ASSERT(pp->p_hash == NULL);
3210 
3211         PP_SETAGED(pp);
3212         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3213         page_unlock(pp);
3214 
3215         mutex_enter(&new_freemem_lock);
3216         if (freemem_wait) {
3217                 cv_signal(&freemem_cv);
3218         }
3219         mutex_exit(&new_freemem_lock);
3220 }
3221 
3222 /*
3223  * Rename the page "opp" to have an identity specified
3224  * by [vp, off].  If a page already exists with this name
3225  * it is locked and destroyed.  Note that the page's
3226  * translations are not unloaded during the rename.
3227  *
3228  * This routine is used by the anon layer to "steal" the
3229  * original page and is not unlike destroying a page and
3230  * creating a new page using the same page frame.
3231  *
3232  * XXX -- Could deadlock if caller 1 tries to rename A to B while
3233  * caller 2 tries to rename B to A.
3234  */
3235 void
3236 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3237 {
3238         page_t          *pp;
3239         int             olckcnt = 0;
3240         int             ocowcnt = 0;
3241         kmutex_t        *phm;
3242         ulong_t         index;
3243 
3244         ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3245         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3246         ASSERT(PP_ISFREE(opp) == 0);
3247 
3248         VM_STAT_ADD(page_rename_count);
3249 
3250         TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3251             "page rename:pp %p vp %p off %llx", opp, vp, off);
3252 
3253         /*
3254          * CacheFS may call page_rename for a large NFS page
3255          * when both CacheFS and NFS mount points are used
3256          * by applications. Demote this large page before
3257          * renaming it, to ensure that there are no "partial"
3258          * large pages left lying around.
3259          */
3260         if (opp->p_szc != 0) {
3261                 vnode_t *ovp = opp->p_vnode;
3262                 ASSERT(ovp != NULL);
3263                 ASSERT(!IS_SWAPFSVP(ovp));
3264                 ASSERT(!VN_ISKAS(ovp));
3265                 page_demote_vp_pages(opp);
3266                 ASSERT(opp->p_szc == 0);
3267         }
3268 
3269         page_hashout(opp, NULL);
3270         PP_CLRAGED(opp);
3271 
3272         /*
3273          * Acquire the appropriate page hash lock, since
3274          * we're going to rename the page.
3275          */
3276         index = PAGE_HASH_FUNC(vp, off);
3277         phm = PAGE_HASH_MUTEX(index);
3278         mutex_enter(phm);
3279 top:
3280         /*
3281          * Look for an existing page with this name and destroy it if found.
3282          * By holding the page hash lock all the way to the page_hashin()
3283          * call, we are assured that no page can be created with this
3284          * identity.  In the case when the phm lock is dropped to undo any
3285          * hat layer mappings, the existing page is held with an "exclusive"
3286          * lock, again preventing another page from being created with
3287          * this identity.
3288          */
3289         pp = page_hash_search(index, vp, off);
3290         if (pp != NULL) {
3291                 VM_STAT_ADD(page_rename_exists);
3292 
3293                 /*
3294                  * As it turns out, this is one of only two places where
3295                  * page_lock() needs to hold the passed in lock in the
3296                  * successful case.  In all of the others, the lock could
3297                  * be dropped as soon as the attempt is made to lock
3298                  * the page.  It is tempting to add yet another argument,
3299                  * PL_KEEP or PL_DROP, to let page_lock know what to do.
3300                  */
3301                 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3302                         /*
3303                          * Went to sleep because the page could not
3304                          * be locked.  We were woken up when the page
3305                          * was unlocked, or when the page was destroyed.
3306                          * In either case, `phm' was dropped while we
3307                          * slept.  Hence we should not just roar through
3308                          * this loop.
3309                          */
3310                         goto top;
3311                 }
3312 
3313                 /*
3314                  * If an existing page is a large page, then demote
3315                  * it to ensure that no "partial" large pages are
3316                  * "created" after page_rename. An existing page
3317                  * can be a CacheFS page, and can't belong to swapfs.
3318                  */
3319                 if (hat_page_is_mapped(pp)) {
3320                         /*
3321                          * Unload translations.  Since we hold the
3322                          * exclusive lock on this page, the page
3323                          * can not be changed while we drop phm.
3324                          * This is also not a lock protocol violation,
3325                          * but rather the proper way to do things.
3326                          */
3327                         mutex_exit(phm);
3328                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3329                         if (pp->p_szc != 0) {
3330                                 ASSERT(!IS_SWAPFSVP(vp));
3331                                 ASSERT(!VN_ISKAS(vp));
3332                                 page_demote_vp_pages(pp);
3333                                 ASSERT(pp->p_szc == 0);
3334                         }
3335                         mutex_enter(phm);
3336                 } else if (pp->p_szc != 0) {
3337                         ASSERT(!IS_SWAPFSVP(vp));
3338                         ASSERT(!VN_ISKAS(vp));
3339                         mutex_exit(phm);
3340                         page_demote_vp_pages(pp);
3341                         ASSERT(pp->p_szc == 0);
3342                         mutex_enter(phm);
3343                 }
3344                 page_hashout(pp, phm);
3345         }
3346         /*
3347          * Hash in the page with the new identity.
3348          */
3349         if (!page_hashin(opp, vp, off, phm)) {
3350                 /*
3351                  * We were holding phm while we searched for [vp, off]
3352                  * and only dropped phm if we found and locked a page.
3353                  * If we can't create this page now, then something
3354                  * is really broken.
3355                  */
3356                 panic("page_rename: Can't hash in page: %p", (void *)pp);
3357                 /*NOTREACHED*/
3358         }
3359 
3360         ASSERT(MUTEX_HELD(phm));
3361         mutex_exit(phm);
3362 
3363         /*
3364          * Now that we have dropped phm, let's get around to finishing up
3365          * with pp.
3366          */
3367         if (pp != NULL) {
3368                 ASSERT(!hat_page_is_mapped(pp));
3369                 /* for now large pages should not end up here */
3370                 ASSERT(pp->p_szc == 0);
3371                 /*
3372                  * Save the locks for transfer to the new page and then
3373                  * clear them so page_free doesn't think they're important.
3374                  * The page_struct_lock need not be acquired for lckcnt and
3375                  * cowcnt since the page has an "exclusive" lock.
3376                  */
3377                 olckcnt = pp->p_lckcnt;
3378                 ocowcnt = pp->p_cowcnt;
3379                 pp->p_lckcnt = pp->p_cowcnt = 0;
3380 
3381                 /*
3382                  * Put the page on the "free" list after we drop
3383                  * the lock.  The less work under the lock the better.
3384                  */
3385                 /*LINTED: constant in conditional context*/
3386                 VN_DISPOSE(pp, B_FREE, 0, kcred);
3387         }
3388 
3389         /*
3390          * Transfer the lock count from the old page (if any).
3391          * The page_struct_lock need not be acquired for lckcnt and
3392          * cowcnt since the page has an "exclusive" lock.
3393          */
3394         opp->p_lckcnt += olckcnt;
3395         opp->p_cowcnt += ocowcnt;
3396 }
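
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * transferring an existing, exclusively locked page to a new name.  Any page
 * already named [nvp, noff] is locked and destroyed by page_rename() itself.
 * The function and parameter names are hypothetical.
 */
static void
example_steal_identity(page_t *opp, vnode_t *nvp, u_offset_t noff)
{
        ASSERT(PAGE_EXCL(opp));
        ASSERT(!PP_ISFREE(opp));

        page_rename(opp, nvp, noff);
        /* opp is still locked and now answers to [nvp, noff] */
}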
3397 
3398 /*
3399  * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3400  *
3401  * Pages are normally inserted at the start of a vnode's v_pages list.
3402  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3403  * This can happen when a modified page is relocated for DR.
3404  *
3405  * Returns 1 on success and 0 on failure.
3406  */
3407 static int
3408 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3409 {
3410         page_t          **listp;
3411         page_t          *tp;
3412         ulong_t         index;
3413 
3414         ASSERT(PAGE_EXCL(pp));
3415         ASSERT(vp != NULL);
3416         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3417 
3418         /*
3419          * Be sure to set these up before the page is inserted on the hash
3420          * list.  As soon as the page is placed on the list some other
3421          * thread might get confused and wonder how this page could
3422          * possibly hash to this list.
3423          */
3424         pp->p_vnode = vp;
3425         pp->p_offset = offset;
3426 
3427         /*
3428          * record if this page is on a swap vnode
3429          */
3430         if ((vp->v_flag & VISSWAP) != 0)
3431                 PP_SETSWAP(pp);
3432 
3433         index = PAGE_HASH_FUNC(vp, offset);
3434         ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3435         listp = &page_hash[index];
3436 
3437         /*
3438          * If this page is already hashed in, fail this attempt to add it.
3439          */
3440         for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3441                 if (tp->p_vnode == vp && tp->p_offset == offset) {
3442                         pp->p_vnode = NULL;
3443                         pp->p_offset = (u_offset_t)(-1);
3444                         return (0);
3445                 }
3446         }
3447         pp->p_hash = *listp;
3448         *listp = pp;
3449 
3450         /*
3451          * Add the page to the vnode's list of pages
3452          */
3453         if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3454                 listp = &vp->v_pages->p_vpprev->p_vpnext;
3455         else
3456                 listp = &vp->v_pages;
3457 
3458         page_vpadd(listp, pp);
3459 
3460         return (1);
3461 }
3462 
3463 /*
3464  * Add page `pp' to both the hash and vp chains for [vp, offset].
3465  *
3466  * Returns 1 on success and 0 on failure.
3467  * If hold is passed in, it is not dropped.
3468  */
3469 int
3470 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3471 {
3472         kmutex_t        *phm = NULL;
3473         kmutex_t        *vphm;
3474         int             rc;
3475 
3476         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3477         ASSERT(pp->p_fsdata == 0 || panicstr);
3478 
3479         TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3480             "page_hashin:pp %p vp %p offset %llx",
3481             pp, vp, offset);
3482 
3483         VM_STAT_ADD(hashin_count);
3484 
3485         if (hold != NULL)
3486                 phm = hold;
3487         else {
3488                 VM_STAT_ADD(hashin_not_held);
3489                 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3490                 mutex_enter(phm);
3491         }
3492 
3493         vphm = page_vnode_mutex(vp);
3494         mutex_enter(vphm);
3495         rc = page_do_hashin(pp, vp, offset);
3496         mutex_exit(vphm);
3497         if (hold == NULL)
3498                 mutex_exit(phm);
3499         if (rc == 0)
3500                 VM_STAT_ADD(hashin_already);
3501         return (rc);
3502 }
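
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * giving an exclusively locked page an identity without pre-holding the hash
 * mutex.  Passing a NULL `hold' lets page_hashin() acquire and drop the
 * appropriate PAGE_HASH_MUTEX itself.  The function name is hypothetical.
 */
static int
example_name_page(page_t *pp, vnode_t *vp, u_offset_t off)
{
        ASSERT(PAGE_EXCL(pp));

        if (!page_hashin(pp, vp, off, (kmutex_t *)NULL))
                return (0);     /* another page already has this identity */
        return (1);
}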
3503 
3504 /*
3505  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3506  * All mutexes must be held.
3507  */
3508 static void
3509 page_do_hashout(page_t *pp)
3510 {
3511         page_t  **hpp;
3512         page_t  *hp;
3513         vnode_t *vp = pp->p_vnode;
3514 
3515         ASSERT(vp != NULL);
3516         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3517 
3518         /*
3519          * First, take pp off of its hash chain.
3520          */
3521         hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3522 
3523         for (;;) {
3524                 hp = *hpp;
3525                 if (hp == pp)
3526                         break;
3527                 if (hp == NULL) {
3528                         panic("page_do_hashout");
3529                         /*NOTREACHED*/
3530                 }
3531                 hpp = &hp->p_hash;
3532         }
3533         *hpp = pp->p_hash;
3534 
3535         /*
3536          * Now remove it from its associated vnode.
3537          */
3538         if (vp->v_pages)
3539                 page_vpsub(&vp->v_pages, pp);
3540 
3541         pp->p_hash = NULL;
3542         page_clr_all_props(pp);
3543         PP_CLRSWAP(pp);
3544         pp->p_vnode = NULL;
3545         pp->p_offset = (u_offset_t)-1;
3546         pp->p_fsdata = 0;
3547 }
3548 
3549 /*
3550  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3551  *
3552  * When `phm' is non-NULL it contains the address of the mutex protecting the
3553  * hash list pp is on.  It is not dropped.
3554  */
3555 void
3556 page_hashout(page_t *pp, kmutex_t *phm)
3557 {
3558         vnode_t         *vp;
3559         ulong_t         index;
3560         kmutex_t        *nphm;
3561         kmutex_t        *vphm;
3562         kmutex_t        *sep;
3563 
3564         ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3565         ASSERT(pp->p_vnode != NULL);
3566         ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3567         ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3568 
3569         vp = pp->p_vnode;
3570 
3571         TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3572             "page_hashout:pp %p vp %p", pp, vp);
3573 
3574         /* Kernel probe */
3575         TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3576             tnf_opaque, vnode, vp,
3577             tnf_offset, offset, pp->p_offset);
3578 
3582         VM_STAT_ADD(hashout_count);
3583         index = PAGE_HASH_FUNC(vp, pp->p_offset);
3584         if (phm == NULL) {
3585                 VM_STAT_ADD(hashout_not_held);
3586                 nphm = PAGE_HASH_MUTEX(index);
3587                 mutex_enter(nphm);
3588         }
3589         ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3590 
3591 
3592         /*
3593          * grab page vnode mutex and remove it...
3594          */
3595         vphm = page_vnode_mutex(vp);
3596         mutex_enter(vphm);
3597 
3598         page_do_hashout(pp);
3599 
3600         mutex_exit(vphm);
3601         if (phm == NULL)
3602                 mutex_exit(nphm);
3603 
3604         /*
3605          * Wake up processes waiting for this page.  The page's
3606          * identity has been changed, and is probably not the
3607          * desired page any longer.
3608          */
3609         sep = page_se_mutex(pp);
3610         mutex_enter(sep);
3611         pp->p_selock &= ~SE_EWANTED;
3612         if (CV_HAS_WAITERS(&pp->p_cv))
3613                 cv_broadcast(&pp->p_cv);
3614         mutex_exit(sep);
3615 }
3616 
3617 /*
3618  * Add the page to the front of a linked list of pages
3619  * using the p_next & p_prev pointers for the list.
3620  * The caller is responsible for protecting the list pointers.
3621  */
3622 void
3623 page_add(page_t **ppp, page_t *pp)
3624 {
3625         ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3626 
3627         page_add_common(ppp, pp);
3628 }
3629 
3630 
3631 
3632 /*
3633  *  Common code for page_add() and mach_page_add()
3634  */
3635 void
3636 page_add_common(page_t **ppp, page_t *pp)
3637 {
3638         if (*ppp == NULL) {
3639                 pp->p_next = pp->p_prev = pp;
3640         } else {
3641                 pp->p_next = *ppp;
3642                 pp->p_prev = (*ppp)->p_prev;
3643                 (*ppp)->p_prev = pp;
3644                 pp->p_prev->p_next = pp;
3645         }
3646         *ppp = pp;
3647 }
3648 
3649 
3650 /*
3651  * Remove this page from a linked list of pages
3652  * using the p_next & p_prev pointers for the list.
3653  *
3654  * The caller is responsible for protecting the list pointers.
3655  */
3656 void
3657 page_sub(page_t **ppp, page_t *pp)
3658 {
3659         ASSERT((PP_ISFREE(pp)) ? 1 :
3660             (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3661 
3662         if (*ppp == NULL || pp == NULL) {
3663                 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3664                     (void *)pp, (void *)(*ppp));
3665                 /*NOTREACHED*/
3666         }
3667 
3668         page_sub_common(ppp, pp);
3669 }
3670 
3671 
3672 /*
3673  *  Common code for page_sub() and mach_page_sub()
3674  */
3675 void
3676 page_sub_common(page_t **ppp, page_t *pp)
3677 {
3678         if (*ppp == pp)
3679                 *ppp = pp->p_next;           /* go to next page */
3680 
3681         if (*ppp == pp)
3682                 *ppp = NULL;                    /* page list is gone */
3683         else {
3684                 pp->p_prev->p_next = pp->p_next;
3685                 pp->p_next->p_prev = pp->p_prev;
3686         }
3687         pp->p_prev = pp->p_next = pp;             /* make pp a list of one */
3688 }
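
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * building and draining a private list of exclusively locked pages with
 * page_add() and page_sub().  The list head is a local variable, so no extra
 * locking of the list pointers is needed.  The function names are
 * hypothetical.
 */
static void
example_stash_page(page_t **plist, page_t *pp)
{
        page_add(plist, pp);            /* push pp on the front of *plist */
}

static page_t *
example_take_one(page_t **plist)
{
        page_t *pp = *plist;

        if (pp != NULL)
                page_sub(plist, pp);    /* unlink pp; it becomes a list of one */
        return (pp);
}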
3689 
3690 
3691 /*
3692  * Break page list oppp into two lists with npages in the first list.
3693  * The tail is returned in nppp.
3694  */
3695 void
3696 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3697 {
3698         page_t *s1pp = *oppp;
3699         page_t *s2pp;
3700         page_t *e1pp, *e2pp;
3701         long n = 0;
3702 
3703         if (s1pp == NULL) {
3704                 *nppp = NULL;
3705                 return;
3706         }
3707         if (npages == 0) {
3708                 *nppp = s1pp;
3709                 *oppp = NULL;
3710                 return;
3711         }
3712         for (n = 0, s2pp = *oppp; n < npages; n++) {
3713                 s2pp = s2pp->p_next;
3714         }
3715         /* Fix head and tail of new lists */
3716         e1pp = s2pp->p_prev;
3717         e2pp = s1pp->p_prev;
3718         s1pp->p_prev = e1pp;
3719         e1pp->p_next = s1pp;
3720         s2pp->p_prev = e2pp;
3721         e2pp->p_next = s2pp;
3722 
3723         /* second list empty */
3724         if (s2pp == s1pp) {
3725                 *oppp = s1pp;
3726                 *nppp = NULL;
3727         } else {
3728                 *oppp = s1pp;
3729                 *nppp = s2pp;
3730         }
3731 }
3732 
3733 /*
3734  * Concatenate page list nppp onto the end of list ppp.
3735  */
3736 void
3737 page_list_concat(page_t **ppp, page_t **nppp)
3738 {
3739         page_t *s1pp, *s2pp, *e1pp, *e2pp;
3740 
3741         if (*nppp == NULL) {
3742                 return;
3743         }
3744         if (*ppp == NULL) {
3745                 *ppp = *nppp;
3746                 return;
3747         }
3748         s1pp = *ppp;
3749         e1pp =  s1pp->p_prev;
3750         s2pp = *nppp;
3751         e2pp = s2pp->p_prev;
3752         s1pp->p_prev = e2pp;
3753         e2pp->p_next = s1pp;
3754         e1pp->p_next = s2pp;
3755         s2pp->p_prev = e1pp;
3756 }
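
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * splitting the first npages off a private page list, working on them, and
 * then splicing the remainder back on.  It assumes the list holds at least
 * npages pages.  The function name is hypothetical.
 */
static void
example_split_and_rejoin(page_t **listp, pgcnt_t npages)
{
        page_t *tail = NULL;

        /* afterwards *listp holds npages pages and tail holds the rest */
        page_list_break(listp, &tail, npages);

        /* ... operate on the head portion ... */

        /* put the remainder back on the end of the list */
        page_list_concat(listp, &tail);
}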
3757 
3758 /*
3759  * return the next page in the page list
3760  */
3761 page_t *
3762 page_list_next(page_t *pp)
3763 {
3764         return (pp->p_next);
3765 }
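
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * walking a circular page list with page_list_next().  The function name is
 * hypothetical.
 */
static pgcnt_t
example_count_list(page_t *plist)
{
        page_t *pp = plist;
        pgcnt_t n = 0;

        if (plist == NULL)
                return (0);
        do {
                n++;
        } while ((pp = page_list_next(pp)) != plist);
        return (n);
}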
3766 
3767 
3768 /*
3769  * Add the page to the front of the linked list of pages
3770  * using p_vpnext/p_vpprev pointers for the list.
3771  *
3772  * The caller is responsible for protecting the lists.
3773  */
3774 void
3775 page_vpadd(page_t **ppp, page_t *pp)
3776 {
3777         if (*ppp == NULL) {
3778                 pp->p_vpnext = pp->p_vpprev = pp;
3779         } else {
3780                 pp->p_vpnext = *ppp;
3781                 pp->p_vpprev = (*ppp)->p_vpprev;
3782                 (*ppp)->p_vpprev = pp;
3783                 pp->p_vpprev->p_vpnext = pp;
3784         }
3785         *ppp = pp;
3786 }
3787 
3788 /*
3789  * Remove this page from the linked list of pages
3790  * using p_vpnext/p_vpprev pointers for the list.
3791  *
3792  * The caller is responsible for protecting the lists.
3793  */
3794 void
3795 page_vpsub(page_t **ppp, page_t *pp)
3796 {
3797         if (*ppp == NULL || pp == NULL) {
3798                 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3799                     (void *)pp, (void *)(*ppp));
3800                 /*NOTREACHED*/
3801         }
3802 
3803         if (*ppp == pp)
3804                 *ppp = pp->p_vpnext;         /* go to next page */
3805 
3806         if (*ppp == pp)
3807                 *ppp = NULL;                    /* page list is gone */
3808         else {
3809                 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3810                 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3811         }
3812         pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3813 }
3814 
3815 /*
3816  * Lock a physical page into memory "long term".  Used to support "lock
3817  * in memory" functions.  Accepts the page to be locked, and a cow variable
3818  * to indicate whether the lock will travel to the new page during
3819  * a potential copy-on-write.
3820  */
3821 int
3822 page_pp_lock(
3823         page_t *pp,                     /* page to be locked */
3824         int cow,                        /* cow lock */
3825         int kernel)                     /* must succeed -- ignore checking */
3826 {
3827         int r = 0;                      /* result -- assume failure */
3828 
3829         ASSERT(PAGE_LOCKED(pp));
3830 
3831         page_struct_lock(pp);
3832         /*
3833          * Acquire the "freemem_lock" for availrmem.
3834          */
3835         if (cow) {
3836                 mutex_enter(&freemem_lock);
3837                 if ((availrmem > pages_pp_maximum) &&
3838                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3839                         availrmem--;
3840                         pages_locked++;
3841                         mutex_exit(&freemem_lock);
3842                         r = 1;
3843                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3844                                 cmn_err(CE_WARN,
3845                                     "COW lock limit reached on pfn 0x%lx",
3846                                     page_pptonum(pp));
3847                         }
3848                 } else
3849                         mutex_exit(&freemem_lock);
3850         } else {
3851                 if (pp->p_lckcnt) {
3852                         if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3853                                 r = 1;
3854                                 if (++pp->p_lckcnt ==
3855                                     (ushort_t)PAGE_LOCK_MAXIMUM) {
3856                                         cmn_err(CE_WARN, "Page lock limit "
3857                                             "reached on pfn 0x%lx",
3858                                             page_pptonum(pp));
3859                                 }
3860                         }
3861                 } else {
3862                         if (kernel) {
3863                                 /* availrmem accounting done by caller */
3864                                 ++pp->p_lckcnt;
3865                                 r = 1;
3866                         } else {
3867                                 mutex_enter(&freemem_lock);
3868                                 if (availrmem > pages_pp_maximum) {
3869                                         availrmem--;
3870                                         pages_locked++;
3871                                         ++pp->p_lckcnt;
3872                                         r = 1;
3873                                 }
3874                                 mutex_exit(&freemem_lock);
3875                         }
3876                 }
3877         }
3878         page_struct_unlock(pp);
3879         return (r);
3880 }
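
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * taking and later releasing a long-term, non-COW, non-kernel lock on a page
 * the caller already holds locked.  page_pp_lock() can fail when availrmem
 * is low, so the return value must be honored.  The function name is
 * hypothetical.
 */
static int
example_long_term_lock(page_t *pp)
{
        ASSERT(PAGE_LOCKED(pp));

        if (!page_pp_lock(pp, 0, 0))
                return (ENOMEM);

        /* ... the frame stays resident until the matching unlock ... */

        page_pp_unlock(pp, 0, 0);
        return (0);
}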
3881 
3882 /*
3883  * Decommit a lock on a physical page frame.  Account for cow locks if
3884  * appropriate.
3885  */
3886 void
3887 page_pp_unlock(
3888         page_t *pp,                     /* page to be unlocked */
3889         int cow,                        /* expect cow lock */
3890         int kernel)                     /* this was a kernel lock */
3891 {
3892         ASSERT(PAGE_LOCKED(pp));
3893 
3894         page_struct_lock(pp);
3895         /*
3896          * Acquire the "freemem_lock" for availrmem.
3897          * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3898          * could be called to unlock even if nothing is locked. This could
3899          * happen if locked file pages were truncated (removing the lock)
3900          * and the file was grown again and new pages faulted in; the new
3901          * pages are unlocked but the segment still thinks they're locked.
3902          */
3903         if (cow) {
3904                 if (pp->p_cowcnt) {
3905                         mutex_enter(&freemem_lock);
3906                         pp->p_cowcnt--;
3907                         availrmem++;
3908                         pages_locked--;
3909                         mutex_exit(&freemem_lock);
3910                 }
3911         } else {
3912                 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3913                         if (!kernel) {
3914                                 mutex_enter(&freemem_lock);
3915                                 availrmem++;
3916                                 pages_locked--;
3917                                 mutex_exit(&freemem_lock);
3918                         }
3919                 }
3920         }
3921         page_struct_unlock(pp);
3922 }
3923 
3924 /*
3925  * This routine reserves availrmem for npages;
3926  *      flags: KM_NOSLEEP or KM_SLEEP
3927  *      returns 1 on success or 0 on failure
3928  */
3929 int
3930 page_resv(pgcnt_t npages, uint_t flags)
3931 {
3932         mutex_enter(&freemem_lock);
3933         while (availrmem < tune.t_minarmem + npages) {
3934                 if (flags & KM_NOSLEEP) {
3935                         mutex_exit(&freemem_lock);
3936                         return (0);
3937                 }
3938                 mutex_exit(&freemem_lock);
3939                 page_needfree(npages);
3940                 kmem_reap();
3941                 delay(hz >> 2);
3942                 page_needfree(-(spgcnt_t)npages);
3943                 mutex_enter(&freemem_lock);
3944         }
3945         availrmem -= npages;
3946         mutex_exit(&freemem_lock);
3947         return (1);
3948 }
3949 
3950 /*
3951  * This routine unreserves availrmem for npages;
3952  */
3953 void
3954 page_unresv(pgcnt_t npages)
3955 {
3956         mutex_enter(&freemem_lock);
3957         availrmem += npages;
3958         mutex_exit(&freemem_lock);
3959 }
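
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * reserving availrmem for an allocation, preferring a non-blocking attempt,
 * and releasing the reservation when done.  The function name is
 * hypothetical.
 */
static void
example_reserve_pages(pgcnt_t npages)
{
        /* KM_NOSLEEP: fail immediately if availrmem is too low */
        if (!page_resv(npages, KM_NOSLEEP)) {
                /* KM_SLEEP: page_resv() reaps and retries until it succeeds */
                (void) page_resv(npages, KM_SLEEP);
        }

        /* ... use the reserved pages ... */

        page_unresv(npages);
}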
3960 
3961 /*
3962  * See Statement at the beginning of segvn_lockop() regarding
3963  * the way we handle cowcnts and lckcnts.
3964  *
3965  * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3966  * that breaks COW has PROT_WRITE.
3967  *
3968  * Note that we may also break COW in case we are softlocking
3969  * on read access during physio;
3970  * in this softlock case, the vpage may not have PROT_WRITE.
3971  * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3972  * if the vpage doesn't have PROT_WRITE.
3973  *
3974  * This routine is never called if we are stealing a page
3975  * in anon_private.
3976  *
3977  * The caller subtracted from availrmem for a read-only mapping.
3978  * If lckcnt is 1, increment availrmem.
3979  */
3980 void
3981 page_pp_useclaim(
3982         page_t *opp,            /* original page frame losing lock */
3983         page_t *npp,            /* new page frame gaining lock */
3984         uint_t write_perm)      /* set if vpage has PROT_WRITE */
3985 {
3986         int payback = 0;
3987         int nidx, oidx;
3988 
3989         ASSERT(PAGE_LOCKED(opp));
3990         ASSERT(PAGE_LOCKED(npp));
3991 
3992         /*
3993          * Since we have two pages we probably have two locks.  We need to take
3994          * them in a defined order to avoid deadlocks.  It's also possible they
3995          * both hash to the same lock in which case this is a non-issue.
3996          */
3997         nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3998         oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3999         if (nidx < oidx) {
4000                 page_struct_lock(npp);
4001                 page_struct_lock(opp);
4002         } else if (oidx < nidx) {
4003                 page_struct_lock(opp);
4004                 page_struct_lock(npp);
4005         } else {        /* The pages hash to the same lock */
4006                 page_struct_lock(npp);
4007         }
4008 
4009         ASSERT(npp->p_cowcnt == 0);
4010         ASSERT(npp->p_lckcnt == 0);
4011 
4012         /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4013         if ((write_perm && opp->p_cowcnt != 0) ||
4014             (!write_perm && opp->p_lckcnt != 0)) {
4015 
4016                 if (write_perm) {
4017                         npp->p_cowcnt++;
4018                         ASSERT(opp->p_cowcnt != 0);
4019                         opp->p_cowcnt--;
4020                 } else {
4021 
4022                         ASSERT(opp->p_lckcnt != 0);
4023 
4024                         /*
4025                          * We didn't need availrmem decremented if p_lckcnt on
4026                          * original page is 1.  Here, we are unlocking the
4027                          * read-only copy belonging to the original page
4028                          * and locking a copy belonging to the new page.
4029                          */
4030                         if (opp->p_lckcnt == 1)
4031                                 payback = 1;
4032 
4033                         npp->p_lckcnt++;
4034                         opp->p_lckcnt--;
4035                 }
4036         }
4037         if (payback) {
4038                 mutex_enter(&freemem_lock);
4039                 availrmem++;
4040                 pages_useclaim--;
4041                 mutex_exit(&freemem_lock);
4042         }
4043 
4044         if (nidx < oidx) {
4045                 page_struct_unlock(opp);
4046                 page_struct_unlock(npp);
4047         } else if (oidx < nidx) {
4048                 page_struct_unlock(npp);
4049                 page_struct_unlock(opp);
4050         } else {        /* The pages hash to the same lock */
4051                 page_struct_unlock(npp);
4052         }
4053 }
4054 
4055 /*
4056  * Simple claim adjust functions -- used to support changes in
4057  * claims due to changes in access permissions.  Used by segvn_setprot().
4058  */
4059 int
4060 page_addclaim(page_t *pp)
4061 {
4062         int r = 0;                      /* result */
4063 
4064         ASSERT(PAGE_LOCKED(pp));
4065 
4066         page_struct_lock(pp);
4067         ASSERT(pp->p_lckcnt != 0);
4068 
4069         if (pp->p_lckcnt == 1) {
4070                 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4071                         --pp->p_lckcnt;
4072                         r = 1;
4073                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4074                                 cmn_err(CE_WARN,
4075                                     "COW lock limit reached on pfn 0x%lx",
4076                                     page_pptonum(pp));
4077                         }
4078                 }
4079         } else {
4080                 mutex_enter(&freemem_lock);
4081                 if ((availrmem > pages_pp_maximum) &&
4082                     (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4083                         --availrmem;
4084                         ++pages_claimed;
4085                         mutex_exit(&freemem_lock);
4086                         --pp->p_lckcnt;
4087                         r = 1;
4088                         if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4089                                 cmn_err(CE_WARN,
4090                                     "COW lock limit reached on pfn 0x%lx",
4091                                     page_pptonum(pp));
4092                         }
4093                 } else
4094                         mutex_exit(&freemem_lock);
4095         }
4096         page_struct_unlock(pp);
4097         return (r);
4098 }
4099 
4100 int
4101 page_subclaim(page_t *pp)
4102 {
4103         int r = 0;
4104 
4105         ASSERT(PAGE_LOCKED(pp));
4106 
4107         page_struct_lock(pp);
4108         ASSERT(pp->p_cowcnt != 0);
4109 
4110         if (pp->p_lckcnt) {
4111                 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4112                         r = 1;
4113                         /*
4114                          * for availrmem
4115                          */
4116                         mutex_enter(&freemem_lock);
4117                         availrmem++;
4118                         pages_claimed--;
4119                         mutex_exit(&freemem_lock);
4120 
4121                         pp->p_cowcnt--;
4122 
4123                         if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4124                                 cmn_err(CE_WARN,
4125                                     "Page lock limit reached on pfn 0x%lx",
4126                                     page_pptonum(pp));
4127                         }
4128                 }
4129         } else {
4130                 r = 1;
4131                 pp->p_cowcnt--;
4132                 pp->p_lckcnt++;
4133         }
4134         page_struct_unlock(pp);
4135         return (r);
4136 }
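
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * converting an existing claim when a mapping's protections change, in the
 * spirit of segvn_setprot().  The page must already hold the claim being
 * converted (lckcnt for page_addclaim(), cowcnt for page_subclaim()); a zero
 * return means resources were not available.  The function name is
 * hypothetical.
 */
static int
example_convert_claim(page_t *pp, int now_writable)
{
        ASSERT(PAGE_LOCKED(pp));

        return (now_writable ? page_addclaim(pp) : page_subclaim(pp));
}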
4137 
4138 /*
4139  * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4140  * page.
4141  */
4142 int
4143 page_addclaim_pages(page_t  **ppa)
4144 {
4145         pgcnt_t lckpgs = 0, pg_idx;
4146 
4147         VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4148 
4149         /*
4150          * Only need to take the page struct lock on the large page root.
4151          */
4152         page_struct_lock(ppa[0]);
4153         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4154 
4155                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4156                 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4157                 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4158                         page_struct_unlock(ppa[0]);
4159                         return (0);
4160                 }
4161                 if (ppa[pg_idx]->p_lckcnt > 1)
4162                         lckpgs++;
4163         }
4164 
4165         if (lckpgs != 0) {
4166                 mutex_enter(&freemem_lock);
4167                 if (availrmem >= pages_pp_maximum + lckpgs) {
4168                         availrmem -= lckpgs;
4169                         pages_claimed += lckpgs;
4170                 } else {
4171                         mutex_exit(&freemem_lock);
4172                         page_struct_unlock(ppa[0]);
4173                         return (0);
4174                 }
4175                 mutex_exit(&freemem_lock);
4176         }
4177 
4178         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4179                 ppa[pg_idx]->p_lckcnt--;
4180                 ppa[pg_idx]->p_cowcnt++;
4181         }
4182         page_struct_unlock(ppa[0]);
4183         return (1);
4184 }
4185 
4186 /*
4187  * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4188  * page.
4189  */
4190 int
4191 page_subclaim_pages(page_t  **ppa)
4192 {
4193         pgcnt_t ulckpgs = 0, pg_idx;
4194 
4195         VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4196 
4197         /*
4198          * Only need to take the page struct lock on the large page root.
4199          */
4200         page_struct_lock(ppa[0]);
4201         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4202 
4203                 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4204                 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4205                 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4206                         page_struct_unlock(ppa[0]);
4207                         return (0);
4208                 }
4209                 if (ppa[pg_idx]->p_lckcnt != 0)
4210                         ulckpgs++;
4211         }
4212 
4213         if (ulckpgs != 0) {
4214                 mutex_enter(&freemem_lock);
4215                 availrmem += ulckpgs;
4216                 pages_claimed -= ulckpgs;
4217                 mutex_exit(&freemem_lock);
4218         }
4219 
4220         for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4221                 ppa[pg_idx]->p_cowcnt--;
4222                 ppa[pg_idx]->p_lckcnt++;
4223 
4224         }
4225         page_struct_unlock(ppa[0]);
4226         return (1);
4227 }
4228 
4229 page_t *
4230 page_numtopp(pfn_t pfnum, se_t se)
4231 {
4232         page_t *pp;
4233 
4234 retry:
4235         pp = page_numtopp_nolock(pfnum);
4236         if (pp == NULL) {
4237                 return ((page_t *)NULL);
4238         }
4239 
4240         /*
4241          * Acquire the appropriate lock on the page.
4242          */
4243         while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4244                 if (page_pptonum(pp) != pfnum)
4245                         goto retry;
4246                 continue;
4247         }
4248 
4249         if (page_pptonum(pp) != pfnum) {
4250                 page_unlock(pp);
4251                 goto retry;
4252         }
4253 
4254         return (pp);
4255 }
4256 
4257 page_t *
4258 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4259 {
4260         page_t *pp;
4261 
4262 retry:
4263         pp = page_numtopp_nolock(pfnum);
4264         if (pp == NULL) {
4265                 return ((page_t *)NULL);
4266         }
4267 
4268         /*
4269          * Acquire the appropriate lock on the page.
4270          */
4271         while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4272                 if (page_pptonum(pp) != pfnum)
4273                         goto retry;
4274                 continue;
4275         }
4276 
4277         if (page_pptonum(pp) != pfnum) {
4278                 page_unlock(pp);
4279                 goto retry;
4280         }
4281 
4282         return (pp);
4283 }
4284 
4285 /*
4286  * This routine is like page_numtopp, but will only return page structs
4287  * for pages which are ok for loading into hardware using the page struct.
4288  */
4289 page_t *
4290 page_numtopp_nowait(pfn_t pfnum, se_t se)
4291 {
4292         page_t *pp;
4293 
4294 retry:
4295         pp = page_numtopp_nolock(pfnum);
4296         if (pp == NULL) {
4297                 return ((page_t *)NULL);
4298         }
4299 
4300         /*
4301          * Try to acquire the appropriate lock on the page.
4302          */
4303         if (PP_ISFREE(pp))
4304                 pp = NULL;
4305         else {
4306                 if (!page_trylock(pp, se))
4307                         pp = NULL;
4308                 else {
4309                         if (page_pptonum(pp) != pfnum) {
4310                                 page_unlock(pp);
4311                                 goto retry;
4312                         }
4313                         if (PP_ISFREE(pp)) {
4314                                 page_unlock(pp);
4315                                 pp = NULL;
4316                         }
4317                 }
4318         }
4319         return (pp);
4320 }
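
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * looking up the page_t for a pfn and checking its modified bit under a
 * shared lock.  page_numtopp() may sleep for the page lock and returns NULL
 * if the pfn has no page structure.  The function name is hypothetical.
 */
static int
example_pfn_is_modified(pfn_t pfn)
{
        page_t *pp;
        int mod;

        if ((pp = page_numtopp(pfn, SE_SHARED)) == NULL)
                return (0);
        mod = hat_ismod(pp);
        page_unlock(pp);
        return (mod != 0);
}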
4321 
4322 /*
4323  * Returns a count of dirty pages that are in the process
4324  * of being written out.  If 'cleanit' is set, try to push the page.
4325  */
4326 pgcnt_t
4327 page_busy(int cleanit)
4328 {
4329         page_t *page0 = page_first();
4330         page_t *pp = page0;
4331         pgcnt_t nppbusy = 0;
4332         u_offset_t off;
4333 
4334         do {
4335                 vnode_t *vp = pp->p_vnode;
4336                 /*
4337                  * A page is a candidate for syncing if it is:
4338                  *
4339                  * (a)  On neither the freelist nor the cachelist
4340                  * (b)  Hashed onto a vnode
4341                  * (c)  Not a kernel page
4342                  * (d)  Dirty
4343                  * (e)  Not part of a swapfile
4344                  * (f)  Belongs to a real vnode, i.e. has a non-null
4345                  *      v_vfsp pointer
4346                  * (g)  Backed by a filesystem which doesn't have a
4347                  *      stubbed-out sync operation
4348                  */
4349                 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4350                     hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4351                     vfs_can_sync(vp->v_vfsp)) {
4352                         nppbusy++;
4353 
4354                         if (!cleanit)
4355                                 continue;
4356                         if (!page_trylock(pp, SE_EXCL))
4357                                 continue;
4358 
4359                         if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4360                             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4361                             !(hat_pagesync(pp,
4362                             HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4363                                 page_unlock(pp);
4364                                 continue;
4365                         }
4366                         off = pp->p_offset;
4367                         VN_HOLD(vp);
4368                         page_unlock(pp);
4369                         (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4370                             B_ASYNC | B_FREE, kcred, NULL);
4371                         VN_RELE(vp);
4372                 }
4373         } while ((pp = page_next(pp)) != page0);
4374 
4375         return (nppbusy);
4376 }
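
/*
 * For example, a suspend or dump path might keep pushing dirty pages until
 * none remain busy (a hypothetical sketch, not an actual caller):
 *
 *      while (page_busy(1) != 0)
 *              delay(hz);
 */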
4377 
4378 void page_invalidate_pages(void);
4379 
4380 /*
4381  * callback handler to vm sub-system
4382  *
4383  * Callers must ensure there are no recursive entries to this function.
4384  */
4385 /*ARGSUSED*/
4386 boolean_t
4387 callb_vm_cpr(void *arg, int code)
4388 {
4389         if (code == CB_CODE_CPR_CHKPT)
4390                 page_invalidate_pages();
4391         return (B_TRUE);
4392 }
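
/*
 * The callback above is hooked into the callb framework; its registration
 * (shown here only as an illustration, the actual call site lives elsewhere
 * in the kernel) looks like:
 *
 *      (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
 */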
4393 
4394 /*
4395  * Invalidate all pages of the system.
4396  * It shouldn't be called until all user page activity has stopped.
4397  */
4398 void
4399 page_invalidate_pages()
4400 {
4401         page_t *pp;
4402         page_t *page0;
4403         pgcnt_t nbusypages;
4404         int retry = 0;
4405         const int MAXRETRIES = 4;
4406 top:
4407         /*
4408          * Flush dirty pages and destroy the clean ones.
4409          */
4410         nbusypages = 0;
4411 
4412         pp = page0 = page_first();
4413         do {
4414                 struct vnode    *vp;
4415                 u_offset_t      offset;
4416                 int             mod;
4417 
4418                 /*
4419                  * Skip the page if it has no vnode, or if it is associated
4420                  * with the kernel vnode or prom-allocated kernel memory.
4421                  */
4422                 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4423                         continue;
4424 
4425                 /*
4426                  * Skip the page if it has already been freed and invalidated.
4427                  */
4428                 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4429                         continue;
4430 
4431                 /*
4432                  * Skip pages that are already locked, can't be "exclusively"
4433                  * locked, or are already free.  After we lock the page, check
4434                  * the free and age bits again to be sure it hasn't been
4435                  * destroyed yet.
4436                  * To achieve maximum parallelism, we use page_trylock instead
4437                  * of page_lock so that we don't block on individual pages
4438                  * while we have thousands of other pages to process.
4439                  */
4440                 if (!page_trylock(pp, SE_EXCL)) {
4441                         nbusypages++;
4442                         continue;
4443                 } else if (PP_ISFREE(pp)) {
4444                         if (!PP_ISAGED(pp)) {
4445                                 page_destroy_free(pp);
4446                         } else {
4447                                 page_unlock(pp);
4448                         }
4449                         continue;
4450                 }
4451                 /*
4452                  * Is this page involved in some I/O? shared?
4453                  *
4454                  * The page_struct_lock need not be acquired to
4455                  * examine these fields since the page has an
4456                  * "exclusive" lock.
4457                  */
4458                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4459                         page_unlock(pp);
4460                         continue;
4461                 }
4462 
4463                 if (vp->v_type == VCHR) {
4464                         panic("vp->v_type == VCHR");
4465                         /*NOTREACHED*/
4466                 }
4467 
4468                 if (!page_try_demote_pages(pp)) {
4469                         page_unlock(pp);
4470                         continue;
4471                 }
4472 
4473                 /*
4474                  * Check the modified bit. Leave the bits alone in hardware
4475                  * (they will be modified if we do the putpage).
4476                  */
4477                 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4478                     & P_MOD);
4479                 if (mod) {
4480                         offset = pp->p_offset;
4481                         /*
4482                          * Hold the vnode before releasing the page lock
4483                          * to prevent it from being freed and re-used by
4484                          * some other thread.
4485                          */
4486                         VN_HOLD(vp);
4487                         page_unlock(pp);
4488                         /*
4489                          * No error return is checked here. Callers such as
4490                          * cpr deal with the dirty pages at dump time
4491                          * if this putpage fails.
4492                          */
4493                         (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4494                             kcred, NULL);
4495                         VN_RELE(vp);
4496                 } else {
4497                         /*LINTED: constant in conditional context*/
4498                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
4499                 }
4500         } while ((pp = page_next(pp)) != page0);
4501         if (nbusypages && retry++ < MAXRETRIES) {
4502                 delay(1);
4503                 goto top;
4504         }
4505 }
4506 
4507 /*
4508  * Replace the page "old" with the page "new" on the page hash and vnode lists
4509  *
4510  * The replacement must be done in place, i.e. the equivalent sequence:
4511  *
4512  *      vp = old->p_vnode;
4513  *      off = old->p_offset;
4514  *      page_do_hashout(old)
4515  *      page_do_hashin(new, vp, off)
4516  *
4517  * doesn't work, since
4518  *  1) if old is the only page on the vnode, the v_pages list has a window
4519  *     where it looks empty. This will break file system assumptions.
4520  * and
4521  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4522  */
4523 static void
4524 page_do_relocate_hash(page_t *new, page_t *old)
4525 {
4526         page_t  **hash_list;
4527         vnode_t *vp = old->p_vnode;
4528         kmutex_t *sep;
4529 
4530         ASSERT(PAGE_EXCL(old));
4531         ASSERT(PAGE_EXCL(new));
4532         ASSERT(vp != NULL);
4533         ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4534         ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4535 
4536         /*
4537          * First find old page on the page hash list
4538          */
4539         hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4540 
4541         for (;;) {
4542                 if (*hash_list == old)
4543                         break;
4544                 if (*hash_list == NULL) {
4545                         panic("page_do_hashout");
4546                         /*NOTREACHED*/
4547                 }
4548                 hash_list = &(*hash_list)->p_hash;
4549         }
4550 
4551         /*
4552          * update new and replace old with new on the page hash list
4553          */
4554         new->p_vnode = old->p_vnode;
4555         new->p_offset = old->p_offset;
4556         new->p_hash = old->p_hash;
4557         *hash_list = new;
4558 
4559         if ((new->p_vnode->v_flag & VISSWAP) != 0)
4560                 PP_SETSWAP(new);
4561 
4562         /*
4563          * replace old with new on the vnode's page list
4564          */
4565         if (old->p_vpnext == old) {
4566                 new->p_vpnext = new;
4567                 new->p_vpprev = new;
4568         } else {
4569                 new->p_vpnext = old->p_vpnext;
4570                 new->p_vpprev = old->p_vpprev;
4571                 new->p_vpnext->p_vpprev = new;
4572                 new->p_vpprev->p_vpnext = new;
4573         }
4574         if (vp->v_pages == old)
4575                 vp->v_pages = new;
4576 
4577         /*
4578          * clear out the old page
4579          */
4580         old->p_hash = NULL;
4581         old->p_vpnext = NULL;
4582         old->p_vpprev = NULL;
4583         old->p_vnode = NULL;
4584         PP_CLRSWAP(old);
4585         old->p_offset = (u_offset_t)-1;
4586         page_clr_all_props(old);
4587 
4588         /*
4589          * Wake up processes waiting for this page.  The page's
4590          * identity has been changed, and is probably not the
4591          * desired page any longer.
4592          */
4593         sep = page_se_mutex(old);
4594         mutex_enter(sep);
4595         old->p_selock &= ~SE_EWANTED;
4596         if (CV_HAS_WAITERS(&old->p_cv))
4597                 cv_broadcast(&old->p_cv);
4598         mutex_exit(sep);
4599 }
4600 
4601 /*
4602  * This function moves the identity of page "pp_old" to page "pp_new".
4603  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4604  * and need not be hashed out from anywhere.
4605  */
4606 void
4607 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4608 {
4609         vnode_t *vp = pp_old->p_vnode;
4610         u_offset_t off = pp_old->p_offset;
4611         kmutex_t *phm, *vphm;
4612 
4613         /*
4614          * Rehash two pages
4615          */
4616         ASSERT(PAGE_EXCL(pp_old));
4617         ASSERT(PAGE_EXCL(pp_new));
4618         ASSERT(vp != NULL);
4619         ASSERT(pp_new->p_vnode == NULL);
4620 
4621         /*
4622          * hashout then hashin while holding the mutexes
4623          */
4624         phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4625         mutex_enter(phm);
4626         vphm = page_vnode_mutex(vp);
4627         mutex_enter(vphm);
4628 
4629         page_do_relocate_hash(pp_new, pp_old);
4630 
4631         /* The following comment preserved from page_flip(). */
4632         pp_new->p_fsdata = pp_old->p_fsdata;
4633         pp_old->p_fsdata = 0;
4634         mutex_exit(vphm);
4635         mutex_exit(phm);
4636 
4637         /*
4638          * The page_struct_lock need not be acquired for lckcnt and
4639          * cowcnt since the page has an "exclusive" lock.
4640          */
4641         ASSERT(pp_new->p_lckcnt == 0);
4642         ASSERT(pp_new->p_cowcnt == 0);
4643         pp_new->p_lckcnt = pp_old->p_lckcnt;
4644         pp_new->p_cowcnt = pp_old->p_cowcnt;
4645         pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4646 
4647 }
4648 
4649 /*
4650  * Helper routine used to lock all remaining members of a
4651  * large page. The caller is responsible for passing in a locked
4652  * pp. If pp is a large page, then it succeeds in locking all the
4653  * remaining constituent pages or it returns with only the
4654  * original page locked.
4655  *
4656  * Returns 1 on success, 0 on failure.
4657  *
4658  * If success is returned, this routine guarantees that p_szc can't change
4659  * for any constituent page of the large page pp belongs to. To achieve this
4660  * we recheck the szc of pp after locking all constituent pages and retry if
4661  * it changed (it can only decrease). Since hat_page_demote() needs an EXCL
4662  * lock on one of the constituent pages, it can't be running once they are
4663  * all locked.  A hat_page_demote() that holds a lock on a constituent page
4664  * outside this large page (i.e. pp belonged to an even larger page) has
4665  * already finished with all constituent pages of pp, since the root's p_szc
4666  * is changed last. Therefore there is no need to synchronize with a
4667  * hat_page_demote() that locked a constituent page outside pp's large page.
4668  */
4669 #ifdef DEBUG
4670 uint32_t gpg_trylock_mtbf = 0;
4671 #endif
4672 
4673 int
4674 group_page_trylock(page_t *pp, se_t se)
4675 {
4676         page_t  *tpp;
4677         pgcnt_t npgs, i, j;
4678         uint_t pszc = pp->p_szc;
4679 
4680 #ifdef DEBUG
4681         if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4682                 return (0);
4683         }
4684 #endif
4685 
4686         if (pp != PP_GROUPLEADER(pp, pszc)) {
4687                 return (0);
4688         }
4689 
4690 retry:
4691         ASSERT(PAGE_LOCKED_SE(pp, se));
4692         ASSERT(!PP_ISFREE(pp));
4693         if (pszc == 0) {
4694                 return (1);
4695         }
4696         npgs = page_get_pagecnt(pszc);
4697         tpp = pp + 1;
4698         for (i = 1; i < npgs; i++, tpp++) {
4699                 if (!page_trylock(tpp, se)) {
4700                         tpp = pp + 1;
4701                         for (j = 1; j < i; j++, tpp++) {
4702                                 page_unlock(tpp);
4703                         }
4704                         return (0);
4705                 }
4706         }
4707         if (pp->p_szc != pszc) {
4708                 ASSERT(pp->p_szc < pszc);
4709                 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4710                     !IS_SWAPFSVP(pp->p_vnode));
4711                 tpp = pp + 1;
4712                 for (i = 1; i < npgs; i++, tpp++) {
4713                         page_unlock(tpp);
4714                 }
4715                 pszc = pp->p_szc;
4716                 goto retry;
4717         }
4718         return (1);
4719 }
4720 
4721 void
4722 group_page_unlock(page_t *pp)
4723 {
4724         page_t *tpp;
4725         pgcnt_t npgs, i;
4726 
4727         ASSERT(PAGE_LOCKED(pp));
4728         ASSERT(!PP_ISFREE(pp));
4729         ASSERT(pp == PP_PAGEROOT(pp));
4730         npgs = page_get_pagecnt(pp->p_szc);
4731         for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4732                 page_unlock(tpp);
4733         }
4734 }
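
/*
 * A typical use of the pair above, assuming pp is a group leader that the
 * caller has already identified (hypothetical sketch):
 *
 *      if (page_trylock(pp, SE_EXCL)) {
 *              if (group_page_trylock(pp, SE_EXCL)) {
 *                      ... all constituent pages are now SE_EXCL locked ...
 *                      group_page_unlock(pp);  (drops the other constituents)
 *              }
 *              page_unlock(pp);                (drops the original page)
 *      }
 */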
4735 
4736 /*
4737  * Returns:
4738  * 0            : on success and *nrelocp is number of relocated PAGESIZE pages
4739  * ERANGE       : this is not a base page
4740  * EBUSY        : failure to get locks on the page/pages
4741  * ENOMEM       : failure to obtain replacement pages
4742  * EAGAIN       : OBP has not yet completed its boot-time handoff to the kernel
4743  * EIO          : An error occurred while trying to copy the page data
4744  *
4745  * Returns with all constituent members of target and replacement
4746  * SE_EXCL locked. It is the caller's responsibility to drop the
4747  * locks.
4748  */
4749 int
4750 do_page_relocate(
4751         page_t **target,
4752         page_t **replacement,
4753         int grouplock,
4754         spgcnt_t *nrelocp,
4755         lgrp_t *lgrp)
4756 {
4757         page_t *first_repl;
4758         page_t *repl;
4759         page_t *targ;
4760         page_t *pl = NULL;
4761         uint_t ppattr;
4762         pfn_t   pfn, repl_pfn;
4763         uint_t  szc;
4764         spgcnt_t npgs, i;
4765         int repl_contig = 0;
4766         uint_t flags = 0;
4767         spgcnt_t dofree = 0;
4768 
4769         *nrelocp = 0;
4770 
4771 #if defined(__sparc)
4772         /*
4773          * We need to wait till OBP has completed
4774          * its boot-time handoff of its resources to the kernel
4775          * before we allow page relocation
4776          */
4777         if (page_relocate_ready == 0) {
4778                 return (EAGAIN);
4779         }
4780 #endif
4781 
4782         /*
4783          * If this is not a base page,
4784          * just return with zero pages relocated.
4785          */
4786         targ = *target;
4787         ASSERT(PAGE_EXCL(targ));
4788         ASSERT(!PP_ISFREE(targ));
4789         szc = targ->p_szc;
4790         ASSERT(szc < mmu_page_sizes);
4791         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4792         pfn = targ->p_pagenum;
4793         if (pfn != PFN_BASE(pfn, szc)) {
4794                 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4795                 return (ERANGE);
4796         }
4797 
4798         if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4799                 repl_pfn = repl->p_pagenum;
4800                 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4801                         VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4802                         return (ERANGE);
4803                 }
4804                 repl_contig = 1;
4805         }
4806 
4807         /*
4808          * We must lock all members of this large page or we cannot
4809          * relocate any part of it.
4810          */
4811         if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4812                 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4813                 return (EBUSY);
4814         }
4815 
4816         /*
4817          * Reread szc; it could have been decreased before
4818          * group_page_trylock() was done.
4819          */
4820         szc = targ->p_szc;
4821         ASSERT(szc < mmu_page_sizes);
4822         VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4823         ASSERT(pfn == PFN_BASE(pfn, szc));
4824 
4825         npgs = page_get_pagecnt(targ->p_szc);
4826 
4827         if (repl == NULL) {
4828                 dofree = npgs;          /* Size of target page in MMU pages */
4829                 if (!page_create_wait(dofree, 0)) {
4830                         if (grouplock != 0) {
4831                                 group_page_unlock(targ);
4832                         }
4833                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4834                         return (ENOMEM);
4835                 }
4836 
4837                 /*
4838                  * seg kmem pages require that the target and replacement
4839                  * page be the same pagesize.
4840                  */
4841                 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4842                 repl = page_get_replacement_page(targ, lgrp, flags);
4843                 if (repl == NULL) {
4844                         if (grouplock != 0) {
4845                                 group_page_unlock(targ);
4846                         }
4847                         page_create_putback(dofree);
4848                         VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4849                         return (ENOMEM);
4850                 }
4851         }
4852 #ifdef DEBUG
4853         else {
4854                 ASSERT(PAGE_LOCKED(repl));
4855         }
4856 #endif /* DEBUG */
4857 
4858 #if defined(__sparc)
4859         /*
4860          * Let hat_page_relocate() complete the relocation if it's a kernel page
4861          */
4862         if (VN_ISKAS(targ->p_vnode)) {
4863                 *replacement = repl;
4864                 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4865                         if (grouplock != 0) {
4866                                 group_page_unlock(targ);
4867                         }
4868                         if (dofree) {
4869                                 *replacement = NULL;
4870                                 page_free_replacement_page(repl);
4871                                 page_create_putback(dofree);
4872                         }
4873                         VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4874                         return (EAGAIN);
4875                 }
4876                 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4877                 return (0);
4878         }
4879 #else
4880 #if defined(lint)
4881         dofree = dofree;
4882 #endif
4883 #endif
4884 
4885         first_repl = repl;
4886 
4887         for (i = 0; i < npgs; i++) {
4888                 ASSERT(PAGE_EXCL(targ));
4889                 ASSERT(targ->p_slckcnt == 0);
4890                 ASSERT(repl->p_slckcnt == 0);
4891 
4892                 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4893 
4894                 ASSERT(hat_page_getshare(targ) == 0);
4895                 ASSERT(!PP_ISFREE(targ));
4896                 ASSERT(targ->p_pagenum == (pfn + i));
4897                 ASSERT(repl_contig == 0 ||
4898                     repl->p_pagenum == (repl_pfn + i));
4899 
4900                 /*
4901                  * Copy the page contents and attributes then
4902                  * relocate the page in the page hash.
4903                  */
4904                 if (ppcopy(targ, repl) == 0) {
4905                         targ = *target;
4906                         repl = first_repl;
4907                         VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4908                         if (grouplock != 0) {
4909                                 group_page_unlock(targ);
4910                         }
4911                         if (dofree) {
4912                                 *replacement = NULL;
4913                                 page_free_replacement_page(repl);
4914                                 page_create_putback(dofree);
4915                         }
4916                         return (EIO);
4917                 }
4918 
4919                 targ++;
4920                 if (repl_contig != 0) {
4921                         repl++;
4922                 } else {
4923                         repl = repl->p_next;
4924                 }
4925         }
4926 
4927         repl = first_repl;
4928         targ = *target;
4929 
4930         for (i = 0; i < npgs; i++) {
4931                 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4932                 page_clr_all_props(repl);
4933                 page_set_props(repl, ppattr);
4934                 page_relocate_hash(repl, targ);
4935 
4936                 ASSERT(hat_page_getshare(targ) == 0);
4937                 ASSERT(hat_page_getshare(repl) == 0);
4938                 /*
4939                  * Now clear the props on targ; after the
4940                  * page_relocate_hash() they no longer
4941                  * have any meaning.
4942                  */
4943                 page_clr_all_props(targ);
4944                 ASSERT(targ->p_next == targ);
4945                 ASSERT(targ->p_prev == targ);
4946                 page_list_concat(&pl, &targ);
4947 
4948                 targ++;
4949                 if (repl_contig != 0) {
4950                         repl++;
4951                 } else {
4952                         repl = repl->p_next;
4953                 }
4954         }
4955         /* assert that we have come full circle with repl */
4956         ASSERT(repl_contig == 1 || first_repl == repl);
4957 
4958         *target = pl;
4959         if (*replacement == NULL) {
4960                 ASSERT(first_repl == repl);
4961                 *replacement = repl;
4962         }
4963         VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4964         *nrelocp = npgs;
4965         return (0);
4966 }
4967 /*
4968  * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4969  */
4970 int
4971 page_relocate(
4972         page_t **target,
4973         page_t **replacement,
4974         int grouplock,
4975         int freetarget,
4976         spgcnt_t *nrelocp,
4977         lgrp_t *lgrp)
4978 {
4979         spgcnt_t ret;
4980 
4981         /* do_page_relocate returns 0 on success or errno value */
4982         ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4983 
4984         if (ret != 0 || freetarget == 0) {
4985                 return (ret);
4986         }
4987         if (*nrelocp == 1) {
4988                 ASSERT(*target != NULL);
4989                 page_free(*target, 1);
4990         } else {
4991                 page_t *tpp = *target;
4992                 uint_t szc = tpp->p_szc;
4993                 pgcnt_t npgs = page_get_pagecnt(szc);
4994                 ASSERT(npgs > 1);
4995                 ASSERT(szc != 0);
4996                 do {
4997                         ASSERT(PAGE_EXCL(tpp));
4998                         ASSERT(!hat_page_is_mapped(tpp));
4999                         ASSERT(tpp->p_szc == szc);
5000                         PP_SETFREE(tpp);
5001                         PP_SETAGED(tpp);
5002                         npgs--;
5003                 } while ((tpp = tpp->p_next) != *target);
5004                 ASSERT(npgs == 0);
5005                 page_list_add_pages(*target, 0);
5006                 npgs = page_get_pagecnt(szc);
5007                 page_create_putback(npgs);
5008         }
5009         return (ret);
5010 }
5011 
5012 /*
5013  * It is up to the caller to deal with pcf accounting.
5014  */
5015 void
5016 page_free_replacement_page(page_t *pplist)
5017 {
5018         page_t *pp;
5019 
5020         while (pplist != NULL) {
5021                 /*
5022                  * pplist is a linked list of replacement pages.
5023                  */
5024                 pp = pplist;
5025                 if (pp->p_szc == 0) {
5026                         page_sub(&pplist, pp);
5027                         page_clr_all_props(pp);
5028                         PP_SETFREE(pp);
5029                         PP_SETAGED(pp);
5030                         page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5031                         page_unlock(pp);
5032                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5033                 } else {
5034                         spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5035                         page_t *tpp;
5036                         page_list_break(&pp, &pplist, curnpgs);
5037                         tpp = pp;
5038                         do {
5039                                 ASSERT(PAGE_EXCL(tpp));
5040                                 ASSERT(!hat_page_is_mapped(tpp));
5041                                 page_clr_all_props(tpp);
5042                                 PP_SETFREE(tpp);
5043                                 PP_SETAGED(tpp);
5044                         } while ((tpp = tpp->p_next) != pp);
5045                         page_list_add_pages(pp, 0);
5046                         VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5047                 }
5048         }
5049 }
5050 
5051 /*
5052  * Relocate target to non-relocatable replacement page.
5053  */
5054 int
5055 page_relocate_cage(page_t **target, page_t **replacement)
5056 {
5057         page_t *tpp, *rpp;
5058         spgcnt_t pgcnt, npgs;
5059         int result;
5060 
5061         tpp = *target;
5062 
5063         ASSERT(PAGE_EXCL(tpp));
5064         ASSERT(tpp->p_szc == 0);
5065 
5066         pgcnt = btop(page_get_pagesize(tpp->p_szc));
5067 
5068         do {
5069                 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5070                 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5071                 if (rpp == NULL) {
5072                         page_create_putback(pgcnt);
5073                         kcage_cageout_wakeup();
5074                 }
5075         } while (rpp == NULL);
5076 
5077         ASSERT(PP_ISNORELOC(rpp));
5078 
5079         result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5080 
5081         if (result == 0) {
5082                 *replacement = rpp;
5083                 if (pgcnt != npgs)
5084                         panic("page_relocate_cage: partial relocation");
5085         }
5086 
5087         return (result);
5088 }
5089 
5090 /*
5091  * Release the page lock on a page, place on cachelist
5092  * tail if no longer mapped. Caller can let us know if
5093  * the page is known to be clean.
5094  */
5095 int
5096 page_release(page_t *pp, int checkmod)
5097 {
5098         int status;
5099 
5100         ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5101             (pp->p_vnode != NULL));
5102 
5103         if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5104             ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5105             pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5106             !hat_page_is_mapped(pp)) {
5107 
5108                 /*
5109                  * If the page is modified, unlock it.
5110                  *
5111                  * The (p_nrm & P_MOD) bit is up to date because:
5112                  * (1) We found that this page doesn't have any mappings
5113                  *      _after_ holding SE_EXCL and
5114                  * (2) We didn't drop SE_EXCL lock after the check in (1)
5115                  */
5116                 if (checkmod && hat_ismod(pp)) {
5117                         page_unlock(pp);
5118                         status = PGREL_MOD;
5119                 } else {
5120                         /*LINTED: constant in conditional context*/
5121                         VN_DISPOSE(pp, B_FREE, 0, kcred);
5122                         status = PGREL_CLEAN;
5123                 }
5124         } else {
5125                 page_unlock(pp);
5126                 status = PGREL_NOTREL;
5127         }
5128         return (status);
5129 }
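
/*
 * A caller that is finished with a locked, hashed page might use
 * page_release() like this (hypothetical sketch):
 *
 *      switch (page_release(pp, 1)) {
 *      case PGREL_CLEAN:
 *              ... page was clean and was freed to the cachelist ...
 *              break;
 *      case PGREL_MOD:
 *              ... page was dirty; only the lock was dropped ...
 *              break;
 *      case PGREL_NOTREL:
 *              ... page could not be released; the lock was dropped ...
 *              break;
 *      }
 */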
5130 
5131 /*
5132  * Given a constituent page, try to demote the large page on the freelist.
5133  *
5134  * Returns nonzero if the page could be demoted successfully. Returns with
5135  * the constituent page still locked.
5136  */
5137 int
5138 page_try_demote_free_pages(page_t *pp)
5139 {
5140         page_t *rootpp = pp;
5141         pfn_t   pfn = page_pptonum(pp);
5142         spgcnt_t npgs;
5143         uint_t  szc = pp->p_szc;
5144 
5145         ASSERT(PP_ISFREE(pp));
5146         ASSERT(PAGE_EXCL(pp));
5147 
5148         /*
5149          * Adjust rootpp and lock it, if `pp' is not the base
5150          * constituent page.
5151          */
5152         npgs = page_get_pagecnt(pp->p_szc);
5153         if (npgs == 1) {
5154                 return (0);
5155         }
5156 
5157         if (!IS_P2ALIGNED(pfn, npgs)) {
5158                 pfn = P2ALIGN(pfn, npgs);
5159                 rootpp = page_numtopp_nolock(pfn);
5160         }
5161 
5162         if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5163                 return (0);
5164         }
5165 
5166         if (rootpp->p_szc != szc) {
5167                 if (pp != rootpp)
5168                         page_unlock(rootpp);
5169                 return (0);
5170         }
5171 
5172         page_demote_free_pages(rootpp);
5173 
5174         if (pp != rootpp)
5175                 page_unlock(rootpp);
5176 
5177         ASSERT(PP_ISFREE(pp));
5178         ASSERT(PAGE_EXCL(pp));
5179         return (1);
5180 }
5181 
5182 /*
5183  * Given a constituent page, try to demote the large page.
5184  *
5185  * Returns nonzero if the page could be demoted successfully. Returns with
5186  * the constituent page still locked.
5187  */
5188 int
5189 page_try_demote_pages(page_t *pp)
5190 {
5191         page_t *tpp, *rootpp = pp;
5192         pfn_t   pfn = page_pptonum(pp);
5193         spgcnt_t i, npgs;
5194         uint_t  szc = pp->p_szc;
5195         vnode_t *vp = pp->p_vnode;
5196 
5197         ASSERT(PAGE_EXCL(pp));
5198 
5199         VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5200 
5201         if (pp->p_szc == 0) {
5202                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5203                 return (1);
5204         }
5205 
5206         if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5207                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5208                 page_demote_vp_pages(pp);
5209                 ASSERT(pp->p_szc == 0);
5210                 return (1);
5211         }
5212 
5213         /*
5214          * Adjust rootpp if the page passed in is not the base
5215          * constituent page.
5216          */
5217         npgs = page_get_pagecnt(pp->p_szc);
5218         ASSERT(npgs > 1);
5219         if (!IS_P2ALIGNED(pfn, npgs)) {
5220                 pfn = P2ALIGN(pfn, npgs);
5221                 rootpp = page_numtopp_nolock(pfn);
5222                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5223                 ASSERT(rootpp->p_vnode != NULL);
5224                 ASSERT(rootpp->p_szc == szc);
5225         }
5226 
5227         /*
5228          * We can't demote kernel pages since we can't hat_unload()
5229          * the mappings.
5230          */
5231         if (VN_ISKAS(rootpp->p_vnode))
5232                 return (0);
5233 
5234         /*
5235          * Attempt to lock all constituent pages except the page passed
5236          * in since it's already locked.
5237          */
5238         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5239                 ASSERT(!PP_ISFREE(tpp));
5240                 ASSERT(tpp->p_vnode != NULL);
5241 
5242                 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5243                         break;
5244                 ASSERT(tpp->p_szc == rootpp->p_szc);
5245                 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5246         }
5247 
5248         /*
5249          * If we failed to lock them all then unlock what we have
5250          * locked so far and bail.
5251          */
5252         if (i < npgs) {
5253                 tpp = rootpp;
5254                 while (i-- > 0) {
5255                         if (tpp != pp)
5256                                 page_unlock(tpp);
5257                         tpp++;
5258                 }
5259                 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5260                 return (0);
5261         }
5262 
5263         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5264                 ASSERT(PAGE_EXCL(tpp));
5265                 ASSERT(tpp->p_slckcnt == 0);
5266                 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5267                 tpp->p_szc = 0;
5268         }
5269 
5270         /*
5271          * Unlock all pages except the page passed in.
5272          */
5273         for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5274                 ASSERT(!hat_page_is_mapped(tpp));
5275                 if (tpp != pp)
5276                         page_unlock(tpp);
5277         }
5278 
5279         VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5280         return (1);
5281 }
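
/*
 * A caller typically demotes before operating on a single PAGESIZE page
 * (hypothetical sketch; page_invalidate_pages() above follows the same
 * pattern):
 *
 *      ASSERT(PAGE_EXCL(pp));
 *      if (!page_try_demote_pages(pp)) {
 *              page_unlock(pp);
 *              return;                 (give up; the page stays large)
 *      }
 *      ... pp now has p_szc == 0 and is still SE_EXCL locked ...
 */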
5282 
5283 /*
5284  * Called by page_free() and page_destroy() to demote the page size code
5285  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5286  * p_szc on free list, neither can we just clear p_szc of a single page_t
5287  * within a large page since it will break other code that relies on p_szc
5288  * being the same for all page_t's of a large page). Anonymous pages should
5289  * never end up here because anon_map_getpages() cannot deal with p_szc
5290  * changes after a single constituent page is locked.  While anonymous or
5291  * kernel large pages are demoted or freed an entire large page at a time,
5292  * with all constituent pages locked EXCL, for file system pages we have to
5293  * be able to demote a large page (i.e. decrease all constituent pages'
5294  * p_szc) with just an EXCL lock on one of the constituent pages. The reason
5295  * we can easily deal with anonymous page demotion an entire large page at a
5296  * time is that those operations originate at the address space level and
5297  * concern the entire large page region, with actual demotion only done when
5298  * pages are not shared with any other processes (therefore we can always get
5299  * an EXCL lock on all anonymous constituent pages after clearing the segment
5300  * page cache). However, file system pages can be truncated or invalidated at a
5301  * PAGESIZE level from the file system side and end up in page_free() or
5302  * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5303  * and therefore pageout should be able to demote a large page by EXCL locking
5304  * any constituent page that is not under SOFTLOCK). In those cases we cannot
5305  * rely on being able to lock EXCL all constituent pages.
5306  *
5307  * To prevent szc changes on file system pages one has to lock all constituent
5308  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5309  * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5310  * prevent szc changes is the hat layer, which uses its own page level mlist
5311  * locks. The hat assumes that szc doesn't change after the mlist lock for a
5312  * page is taken. Therefore we need to change szc under hat level locks if we
5313  * only have an EXCL lock on a single constituent page and the hat still
5314  * references any of the constituent pages.  (Note we can't "ignore" the hat
5315  * layer by simply hat_pageunload()ing all constituent pages without holding
5316  * EXCL locks on all of them.) We use the hat_page_demote() call to safely
5317  * demote the szc of all constituent pages under hat locks when we only have
5318  * an EXCL lock on one of the constituent pages.
5319  *
5320  * This routine calls page_szc_lock() before calling hat_page_demote() to
5321  * allow segvn in one special case not to lock all constituent pages SHARED
5322  * before calling hat_memload_array() that relies on p_szc not changing even
5323  * before hat level mlist lock is taken.  In that case segvn uses
5324  * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5325  *
5326  * Anonymous or kernel page demotion still has to lock all pages exclusively
5327  * and do hat_pageunload() on all constituent pages before demoting the page;
5328  * therefore there's no need for anonymous or kernel page demotion to use the
5329  * hat_page_demote() mechanism.
5330  *
5331  * hat_page_demote() removes all large mappings that map pp and then decreases
5332  * p_szc starting from the last constituent page of the large page. Working
5333  * from the tail of a large page in decreasing pfn order allows one looking
5334  * at the root page to know that hat_page_demote() is done for root's szc area.
5335  * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5336  * pages within szc 1 area to prevent szc changes because hat_page_demote()
5337  * that started on this page when it had szc > 1 is done for this szc 1 area.
5338  *
5339  * We are guaranteed that all constituent pages of pp's large page belong to
5340  * the same vnode with the consecutive offsets increasing in the direction of
5341  * the pfn, i.e. the identity of constituent pages can't change until their
5342  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5343  * large mappings to pp even though we don't lock any constituent page except
5344  * pp (i.e. we won't unload e.g. kernel locked page).
5345  */
5346 static void
5347 page_demote_vp_pages(page_t *pp)
5348 {
5349         kmutex_t *mtx;
5350 
5351         ASSERT(PAGE_EXCL(pp));
5352         ASSERT(!PP_ISFREE(pp));
5353         ASSERT(pp->p_vnode != NULL);
5354         ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5355         ASSERT(!PP_ISKAS(pp));
5356 
5357         VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5358 
5359         mtx = page_szc_lock(pp);
5360         if (mtx != NULL) {
5361                 hat_page_demote(pp);
5362                 mutex_exit(mtx);
5363         }
5364         ASSERT(pp->p_szc == 0);
5365 }
5366 
5367 /*
5368  * Mark any existing pages for migration in the given range
5369  */
5370 void
5371 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5372     struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5373     u_offset_t vnoff, int rflag)
5374 {
5375         struct anon     *ap;
5376         vnode_t         *curvp;
5377         lgrp_t          *from;
5378         pgcnt_t         nlocked;
5379         u_offset_t      off;
5380         pfn_t           pfn;
5381         size_t          pgsz;
5382         size_t          segpgsz;
5383         pgcnt_t         pages;
5384         uint_t          pszc;
5385         page_t          *pp0, *pp;
5386         caddr_t         va;
5387         ulong_t         an_idx;
5388         anon_sync_obj_t cookie;
5389 
5390         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5391 
5392         /*
5393          * Don't do anything if we don't need to do lgroup optimizations
5394          * on this system
5395          */
5396         if (!lgrp_optimizations())
5397                 return;
5398 
5399         /*
5400          * Align address and length to (potentially large) page boundary
5401          */
5402         segpgsz = page_get_pagesize(seg->s_szc);
5403         addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5404         if (rflag)
5405                 len = P2ROUNDUP(len, segpgsz);
5406 
5407         /*
5408          * Do one (large) page at a time
5409          */
5410         va = addr;
5411         while (va < addr + len) {
5412                 /*
5413                  * Look up the (root) page for the vnode and offset
5414                  * corresponding to this virtual address.
5415                  * Try the anonmap first since there may be copy-on-write
5416                  * pages, but initialize vnode pointer and offset using
5417                  * vnode arguments just in case there isn't an amp.
5418                  */
5419                 curvp = vp;
5420                 off = vnoff + va - seg->s_base;
5421                 if (amp) {
5422                         ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
5423                         an_idx = anon_index + seg_page(seg, va);
5424                         anon_array_enter(amp, an_idx, &cookie);
5425                         ap = anon_get_ptr(amp->ahp, an_idx);
5426                         if (ap)
5427                                 swap_xlate(ap, &curvp, &off);
5428                         anon_array_exit(&cookie);
5429                         ANON_LOCK_EXIT(&->a_rwlock);
5430                 }
5431 
5432                 pp = NULL;
5433                 if (curvp)
5434                         pp = page_lookup(curvp, off, SE_SHARED);
5435 
5436                 /*
5437                  * If there isn't a page at this virtual address,
5438                  * skip to next page
5439                  */
5440                 if (pp == NULL) {
5441                         va += PAGESIZE;
5442                         continue;
5443                 }
5444 
5445                 /*
5446                  * Figure out which lgroup this page is in for kstats
5447                  */
5448                 pfn = page_pptonum(pp);
5449                 from = lgrp_pfn_to_lgrp(pfn);
5450 
5451                 /*
5452                  * Get page size, and round up and skip to next page boundary
5453                  * if unaligned address
5454                  */
5455                 pszc = pp->p_szc;
5456                 pgsz = page_get_pagesize(pszc);
5457                 pages = btop(pgsz);
5458                 if (!IS_P2ALIGNED(va, pgsz) ||
5459                     !IS_P2ALIGNED(pfn, pages) ||
5460                     pgsz > segpgsz) {
5461                         pgsz = MIN(pgsz, segpgsz);
5462                         page_unlock(pp);
5463                         pages = btop(P2END((uintptr_t)va, pgsz) -
5464                             (uintptr_t)va);
5465                         va = (caddr_t)P2END((uintptr_t)va, pgsz);
5466                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5467                         continue;
5468                 }
5469 
5470                 /*
5471                  * Upgrade to exclusive lock on page
5472                  */
5473                 if (!page_tryupgrade(pp)) {
5474                         page_unlock(pp);
5475                         va += pgsz;
5476                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5477                             btop(pgsz));
5478                         continue;
5479                 }
5480 
5481                 pp0 = pp++;
5482                 nlocked = 1;
5483 
5484                 /*
5485                  * Lock constituent pages if this is large page
5486                  */
5487                 if (pages > 1) {
5488                         /*
5489                          * Lock all constituents except root page, since it
5490                          * should be locked already.
5491                          */
5492                         for (; nlocked < pages; nlocked++) {
5493                                 if (!page_trylock(pp, SE_EXCL)) {
5494                                         break;
5495                                 }
5496                                 if (PP_ISFREE(pp) ||
5497                                     pp->p_szc != pszc) {
5498                                         /*
5499                                          * hat_page_demote() raced in with us.
5500                                          */
5501                                         ASSERT(!IS_SWAPFSVP(curvp));
5502                                         page_unlock(pp);
5503                                         break;
5504                                 }
5505                                 pp++;
5506                         }
5507                 }
5508 
5509                 /*
5510                  * If all constituent pages couldn't be locked,
5511                  * unlock pages locked so far and skip to next page.
5512                  */
5513                 if (nlocked < pages) {
5514                         while (pp0 < pp) {
5515                                 page_unlock(pp0++);
5516                         }
5517                         va += pgsz;
5518                         lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5519                             btop(pgsz));
5520                         continue;
5521                 }
5522 
5523                 /*
5524                  * hat_page_demote() can no longer happen
5525                  * since the last cons page had the right p_szc after
5526                  * all cons pages were locked. All cons pages
5527                  * should now have the same p_szc.
5528                  */
5529 
5530                 /*
5531                  * All constituent pages locked successfully, so mark
5532                  * large page for migration and unload the mappings of
5533                  * constituent pages, so a fault will occur on any part of the
5534                  * large page
5535                  */
5536                 PP_SETMIGRATE(pp0);
5537                 while (pp0 < pp) {
5538                         (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5539                         ASSERT(hat_page_getshare(pp0) == 0);
5540                         page_unlock(pp0++);
5541                 }
5542                 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5543 
5544                 va += pgsz;
5545         }
5546 }
5547 
5548 /*
5549  * Migrate any pages that have been marked for migration in the given range
5550  */
5551 void
5552 page_migrate(
5553         struct seg      *seg,
5554         caddr_t         addr,
5555         page_t          **ppa,
5556         pgcnt_t         npages)
5557 {
5558         lgrp_t          *from;
5559         lgrp_t          *to;
5560         page_t          *newpp;
5561         page_t          *pp;
5562         pfn_t           pfn;
5563         size_t          pgsz;
5564         spgcnt_t        page_cnt;
5565         spgcnt_t        i;
5566         uint_t          pszc;
5567 
5568         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5569 
5570         while (npages > 0) {
5571                 pp = *ppa;
5572                 pszc = pp->p_szc;
5573                 pgsz = page_get_pagesize(pszc);
5574                 page_cnt = btop(pgsz);
5575 
5576                 /*
5577                  * Check to see whether this page is marked for migration
5578                  *
5579                  * Assume that the root page of a large page is marked for
5580                  * migration and none of the other constituent pages
5581                  * are marked.  This really simplifies clearing the
5582                  * migrate bit by not having to clear it from each
5583                  * constituent page.
5584                  *
5585                  * Note that we don't want to relocate an entire large page if
5586                  * someone is only using one subpage.
5587                  */
5588                 if (npages < page_cnt)
5589                         break;
5590 
5591                 /*
5592                  * Is it marked for migration?
5593                  */
5594                 if (!PP_ISMIGRATE(pp))
5595                         goto next;
5596 
5597                 /*
5598                  * Determine lgroups that page is being migrated between
5599                  */
5600                 pfn = page_pptonum(pp);
5601                 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5602                         break;
5603                 }
5604                 from = lgrp_pfn_to_lgrp(pfn);
5605                 to = lgrp_mem_choose(seg, addr, pgsz);
5606 
5607                 /*
5608                  * Need to get exclusive locks to migrate
5609                  */
5610                 for (i = 0; i < page_cnt; i++) {
5611                         ASSERT(PAGE_LOCKED(ppa[i]));
5612                         if (page_pptonum(ppa[i]) != pfn + i ||
5613                             ppa[i]->p_szc != pszc) {
5614                                 break;
5615                         }
5616                         if (!page_tryupgrade(ppa[i])) {
5617                                 lgrp_stat_add(from->lgrp_id,
5618                                     LGRP_PM_FAIL_LOCK_PGS,
5619                                     page_cnt);
5620                                 break;
5621                         }
5622 
5623                         /*
5624                          * Check to see whether we are trying to migrate
5625                          * page to lgroup where it is allocated already.
5626                          * If so, clear the migrate bit and skip to next
5627                          * page.
5628                          */
5629                         if (i == 0 && to == from) {
5630                                 PP_CLRMIGRATE(ppa[0]);
5631                                 page_downgrade(ppa[0]);
5632                                 goto next;
5633                         }
5634                 }
5635 
5636                 /*
5637                  * If all constituent pages couldn't be locked,
5638                  * unlock pages locked so far and skip to next page.
5639                  */
5640                 if (i != page_cnt) {
5641                         while (--i != -1) {
5642                                 page_downgrade(ppa[i]);
5643                         }
5644                         goto next;
5645                 }
5646 
5647                 (void) page_create_wait(page_cnt, PG_WAIT);
5648                 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5649                 if (newpp == NULL) {
5650                         page_create_putback(page_cnt);
5651                         for (i = 0; i < page_cnt; i++) {
5652                                 page_downgrade(ppa[i]);
5653                         }
5654                         lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5655                             page_cnt);
5656                         goto next;
5657                 }
5658                 ASSERT(newpp->p_szc == pszc);
5659                 /*
5660                  * Clear migrate bit and relocate page
5661                  */
5662                 PP_CLRMIGRATE(pp);
5663                 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5664                         panic("page_migrate: page_relocate failed");
5665                 }
5666                 ASSERT(page_cnt * PAGESIZE == pgsz);
5667 
5668                 /*
5669                  * Keep stats for number of pages migrated from and to
5670                  * each lgroup
5671                  */
5672                 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5673                 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5674                 /*
5675                  * Update the page_t array we were passed in and
5676                  * unlink constituent pages of a large page.
5677                  */
5678                 for (i = 0; i < page_cnt; ++i, ++pp) {
5679                         ASSERT(PAGE_EXCL(newpp));
5680                         ASSERT(newpp->p_szc == pszc);
5681                         ppa[i] = newpp;
5682                         pp = newpp;
5683                         page_sub(&newpp, pp);
5684                         page_downgrade(pp);
5685                 }
5686                 ASSERT(newpp == NULL);
5687 next:
5688                 addr += pgsz;
5689                 ppa += page_cnt;
5690                 npages -= page_cnt;
5691         }
5692 }
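
/*
 * The two routines above work together: a segment driver first marks a
 * range, then later, at fault time with the pages held SHARED in ppa[],
 * migrates them (hypothetical sketch):
 *
 *      page_mark_migrate(seg, addr, len, amp, anon_index, vp, vnoff, 1);
 *      ...
 *      page_migrate(seg, addr, ppa, npages);
 *
 * The required locking (the address space lock and the pages in ppa[]) is
 * the caller's responsibility, as described above.
 */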
5693 
5694 uint_t page_reclaim_maxcnt = 60; /* max total iterations */
5695 uint_t page_reclaim_nofree_maxcnt = 3; /* max iterations without progress */
5696 /*
5697  * Reclaim/reserve availrmem for npages.
5698  * If there is not enough memory, start reaping the seg and kmem caches.
5699  * Start the pageout scanner (via page_needfree()).
5700  * Exit after roughly page_reclaim_maxcnt seconds regardless of how much
5701  * memory has been released.
5702  * Note: There is no guarantee that any availrmem will be freed, as this
5703  * memory typically is locked (kernel heap) or reserved for swap.  Also, due
5704  * to memory fragmentation, the kmem allocator may not be able to free any
5705  * memory (a single user-allocated buffer can prevent freeing a slab or page).
5706  */
5707 int
5708 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5709 {
5710         int     i = 0;
5711         int     i_nofree = 0;
5712         int     ret = 0;
5713         pgcnt_t deficit;
5714         pgcnt_t old_availrmem = 0;
5715 
5716         mutex_enter(&freemem_lock);
5717         while (availrmem < tune.t_minarmem + npages + epages &&
5718             i++ < page_reclaim_maxcnt) {
5719                 /* ensure we made some progress in the last few iterations */
5720                 if (old_availrmem < availrmem) {
5721                         old_availrmem = availrmem;
5722                         i_nofree = 0;
5723                 } else if (i_nofree++ >= page_reclaim_nofree_maxcnt) {
5724                         break;
5725                 }
5726 
5727                 deficit = tune.t_minarmem + npages + epages - availrmem;
5728                 mutex_exit(&freemem_lock);
5729                 page_needfree(deficit);
5730                 kmem_reap();
5731                 delay(hz);
5732                 page_needfree(-(spgcnt_t)deficit);
5733                 mutex_enter(&freemem_lock);
5734         }
5735 
5736         if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5737                 availrmem -= npages;
5738                 ret = 1;
5739         }
5740 
5741         mutex_exit(&freemem_lock);
5742 
5743         return (ret);
5744 }
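
/*
 * A caller that wants to lock down npages reserves availrmem up front and
 * returns the reservation when it is done (hypothetical sketch):
 *
 *      if (!page_reclaim_mem(npages, 0, 1))
 *              return (ENOMEM);        (could not reserve availrmem)
 *      ...
 *      mutex_enter(&freemem_lock);
 *      availrmem += npages;            (give the reservation back)
 *      mutex_exit(&freemem_lock);
 */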
5745 
5746 /*
5747  * Search the memory segments to locate the desired page.  Within a
5748  * segment, pages increase linearly with one page structure per
5749  * physical page frame (size PAGESIZE).  The search begins
5750  * with the segment that was accessed last, to take advantage of locality.
5751  * If the hint misses, we start from the beginning of the sorted memseg list.
5752  */
5753 
5754 
5755 /*
5756  * Some data structures for pfn to pp lookup.
5757  */
5758 ulong_t mhash_per_slot;
5759 struct memseg *memseg_hash[N_MEM_SLOTS];
5760 
5761 page_t *
5762 page_numtopp_nolock(pfn_t pfnum)
5763 {
5764         struct memseg *seg;
5765         page_t *pp;
5766         vm_cpu_data_t *vc;
5767 
5768         /*
5769          * We need to disable kernel preemption while referencing the
5770          * cpu_vm_data field in order to prevent us from being switched to
5771          * another cpu and trying to reference it after it has been freed.
5772          * This will keep us on cpu and prevent it from being removed while
5773          * we are still on it.
5774          *
5775          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5776          * which is being reused by DR, which will flush those references
5777          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5778          */
5779         kpreempt_disable();
5780         vc = CPU->cpu_vm_data;
5781         ASSERT(vc != NULL);
5782 
5783         MEMSEG_STAT_INCR(nsearch);
5784 
5785         /* Try last winner first */
5786         if (((seg = vc->vc_pnum_memseg) != NULL) &&
5787             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5788                 MEMSEG_STAT_INCR(nlastwon);
5789                 pp = seg->pages + (pfnum - seg->pages_base);
5790                 if (pp->p_pagenum == pfnum) {
5791                         kpreempt_enable();
5792                         return ((page_t *)pp);
5793                 }
5794         }
5795 
5796         /* Else Try hash */
5797         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5798             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5799                 MEMSEG_STAT_INCR(nhashwon);
5800                 vc->vc_pnum_memseg = seg;
5801                 pp = seg->pages + (pfnum - seg->pages_base);
5802                 if (pp->p_pagenum == pfnum) {
5803                         kpreempt_enable();
5804                         return ((page_t *)pp);
5805                 }
5806         }
5807 
5808         /* Else Brute force */
5809         for (seg = memsegs; seg != NULL; seg = seg->next) {
5810                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5811                         vc->vc_pnum_memseg = seg;
5812                         pp = seg->pages + (pfnum - seg->pages_base);
5813                         if (pp->p_pagenum == pfnum) {
5814                                 kpreempt_enable();
5815                                 return ((page_t *)pp);
5816                         }
5817                 }
5818         }
5819         vc->vc_pnum_memseg = NULL;
5820         kpreempt_enable();
5821         MEMSEG_STAT_INCR(nnotfound);
5822         return ((page_t *)NULL);
5823 
5824 }
5825 
5826 struct memseg *
5827 page_numtomemseg_nolock(pfn_t pfnum)
5828 {
5829         struct memseg *seg;
5830         page_t *pp;
5831 
5832         /*
5833          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5834          * which is being reused by DR, which will flush those references
5835          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5836          */
5837         kpreempt_disable();
5838         /* Try hash */
5839         if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5840             (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5841                 pp = seg->pages + (pfnum - seg->pages_base);
5842                 if (pp->p_pagenum == pfnum) {
5843                         kpreempt_enable();
5844                         return (seg);
5845                 }
5846         }
5847 
5848         /* Else Brute force */
5849         for (seg = memsegs; seg != NULL; seg = seg->next) {
5850                 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5851                         pp = seg->pages + (pfnum - seg->pages_base);
5852                         if (pp->p_pagenum == pfnum) {
5853                                 kpreempt_enable();
5854                                 return (seg);
5855                         }
5856                 }
5857         }
5858         kpreempt_enable();
5859         return ((struct memseg *)NULL);
5860 }
5861 
5862 /*
5863  * Given a page and a count, return the page struct that is
5864  * n structs away from the current one in the global page
5865  * list.
5866  *
5867  * This function wraps to the first page upon
5868  * reaching the end of the memseg list.
5869  */
5870 page_t *
5871 page_nextn(page_t *pp, ulong_t n)
5872 {
5873         struct memseg *seg;
5874         page_t *ppn;
5875         vm_cpu_data_t *vc;
5876 
5877         /*
5878          * We need to disable kernel preemption while referencing the
5879          * cpu_vm_data field in order to prevent us from being switched to
5880          * another cpu and trying to reference it after it has been freed.
5881          * This will keep us on cpu and prevent it from being removed while
5882          * we are still on it.
5883          *
5884          * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5885          * which is being reused by DR, which will flush those references
5886          * before modifying the reused memseg.  See memseg_cpu_vm_flush().
5887          */
5888         kpreempt_disable();
5889         vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5890 
5891         ASSERT(vc != NULL);
5892 
5893         if (((seg = vc->vc_pnext_memseg) == NULL) ||
5894             (seg->pages_base == seg->pages_end) ||
5895             !(pp >= seg->pages && pp < seg->epages)) {
5896 
5897                 for (seg = memsegs; seg; seg = seg->next) {
5898                         if (pp >= seg->pages && pp < seg->epages)
5899                                 break;
5900                 }
5901 
5902                 if (seg == NULL) {
5903                         /* Memory delete got in, return something valid. */
5904                         /* TODO: fix me. */
5905                         seg = memsegs;
5906                         pp = seg->pages;
5907                 }
5908         }
5909 
5910         /* check for wraparound - possible if n is large */
5911         while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5912                 n -= seg->epages - pp;
5913                 seg = seg->next;
5914                 if (seg == NULL)
5915                         seg = memsegs;
5916                 pp = seg->pages;
5917         }
5918         vc->vc_pnext_memseg = seg;
5919         kpreempt_enable();
5920         return (ppn);
5921 }
5922 
5923 /*
5924  * Initialize for a loop using page_next_scan_large().
5925  */
5926 page_t *
5927 page_next_scan_init(void **cookie)
5928 {
5929         ASSERT(cookie != NULL);
5930         *cookie = (void *)memsegs;
5931         return ((page_t *)memsegs->pages);
5932 }
5933 
5934 /*
5935  * Return the next page in a scan of page_t's, assuming we want
5936  * to skip over sub-pages within larger page sizes.
5937  *
5938  * The cookie is used to keep track of the current memseg.
5939  */
5940 page_t *
5941 page_next_scan_large(
5942         page_t          *pp,
5943         ulong_t         *n,
5944         void            **cookie)
5945 {
5946         struct memseg   *seg = (struct memseg *)*cookie;
5947         page_t          *new_pp;
5948         ulong_t         cnt;
5949         pfn_t           pfn;
5950 
5951 
5952         /*
5953          * get the count of page_t's to skip based on the page size
5954          */
5955         ASSERT(pp != NULL);
5956         if (pp->p_szc == 0) {
5957                 cnt = 1;
5958         } else {
5959                 pfn = page_pptonum(pp);
5960                 cnt = page_get_pagecnt(pp->p_szc);
5961                 cnt -= pfn & (cnt - 1);
5962         }
5963         *n += cnt;
5964         new_pp = pp + cnt;
5965 
5966         /*
5967          * Catch if we went past the end of the current memory segment. If so,
5968          * just move to the next segment with pages.
5969          */
5970         if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5971                 do {
5972                         seg = seg->next;
5973                         if (seg == NULL)
5974                                 seg = memsegs;
5975                 } while (seg->pages_base == seg->pages_end);
5976                 new_pp = seg->pages;
5977                 *cookie = (void *)seg;
5978         }
5979 
5980         return (new_pp);
5981 }
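
/*
 * Illustrative sketch (not part of the build): the intended calling
 * pattern for the two routines above, visiting each large page (or each
 * szc 0 page) exactly once.  The bound on n and the per-page work
 * (examine_page) are assumptions for illustration only.
 *
 *	page_t *pp;
 *	ulong_t n = 0;
 *	void *cookie;
 *
 *	pp = page_next_scan_init(&cookie);
 *	while (n < total_pages) {
 *		// pp is the root of a large page, or a single szc 0 page
 *		examine_page(pp);		// hypothetical per-page work
 *		pp = page_next_scan_large(pp, &n, &cookie);
 *	}
 */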
5982 
5983 
5984 /*
5985  * Returns next page in list. Note: this function wraps
5986  * to the first page in the list upon reaching the end
5987  * of the list. Callers should be aware of this fact.
5988  */
5989 
5990 /* We should change this to be a #define */
5991 
5992 page_t *
5993 page_next(page_t *pp)
5994 {
5995         return (page_nextn(pp, 1));
5996 }
5997 
5998 page_t *
5999 page_first()
6000 {
6001         return ((page_t *)memsegs->pages);
6002 }
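
/*
 * Illustrative sketch (not part of the build): walking every page_t in
 * the system with the two wrappers above.  Because page_next() wraps back
 * to the first page, the loop must bound itself; total_pages is used as
 * the bound here, and no locks are taken by page_next() itself.
 *
 *	page_t *pp = page_first();
 *	pgcnt_t i;
 *
 *	for (i = 0; i < total_pages; i++) {
 *		// ... inspect pp ...
 *		pp = page_next(pp);
 *	}
 */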
6003 
6004 
6005 /*
6006  * This routine is called at boot with the initial memory configuration
6007  * and when memory is added or removed.
6008  */
6009 void
6010 build_pfn_hash()
6011 {
6012         pfn_t cur;
6013         pgcnt_t index;
6014         struct memseg *pseg;
6015         int     i;
6016 
6017         /*
6018          * Clear memseg_hash array.
6019          * Since memory add/delete is designed to operate concurrently
6020          * with normal operation, the hash rebuild must be able to run
6021          * concurrently with page_numtopp_nolock(). To support this
6022          * functionality, assignments to memseg_hash array members must
6023          * be done atomically.
6024          *
6025          * NOTE: bzero() does not currently guarantee this for kernel
6026          * threads, and cannot be used here.
6027          */
6028         for (i = 0; i < N_MEM_SLOTS; i++)
6029                 memseg_hash[i] = NULL;
6030 
6031         hat_kpm_mseghash_clear(N_MEM_SLOTS);
6032 
6033         /*
6034          * Physmax is the last valid pfn.
6035          */
6036         mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6037         for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6038                 index = MEMSEG_PFN_HASH(pseg->pages_base);
6039                 cur = pseg->pages_base;
6040                 do {
6041                         if (index >= N_MEM_SLOTS)
6042                                 index = MEMSEG_PFN_HASH(cur);
6043 
6044                         if (memseg_hash[index] == NULL ||
6045                             memseg_hash[index]->pages_base > pseg->pages_base) {
6046                                 memseg_hash[index] = pseg;
6047                                 hat_kpm_mseghash_update(index, pseg);
6048                         }
6049                         cur += mhash_per_slot;
6050                         index++;
6051                 } while (cur < pseg->pages_end);
6052         }
6053 }
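
/*
 * Illustrative sketch (not part of the build): a worked example of the
 * slot layout built above.  Assume, purely for the example, that
 * mhash_per_slot works out to 0x400 pfns per slot and that
 * MEMSEG_PFN_HASH() maps a pfn to a slot by dividing by mhash_per_slot.
 * Then:
 *
 *	slot 0: pfns 0x0000 - 0x03ff
 *	slot 1: pfns 0x0400 - 0x07ff
 *	slot 2: pfns 0x0800 - 0x0bff
 *
 * A memseg with pages_base == 0x500 and pages_end == 0xa00 is recorded in
 * slots 1 and 2 (unless a memseg with a lower pages_base already occupies
 * a slot).  A hash hit in page_numtopp_nolock() therefore lands on a
 * memseg which may contain the pfn; if it does not, the linear search
 * over memsegs is used instead.
 */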
6054 
6055 /*
6056  * Return the pagenum for the pp
6057  */
6058 pfn_t
6059 page_pptonum(page_t *pp)
6060 {
6061         return (pp->p_pagenum);
6062 }
6063 
6064 /*
6065  * Interface to the referenced, modified, etc. bits
6066  * in the PSM part of the page struct
6067  * when no locking is desired.
6068  */
6069 void
6070 page_set_props(page_t *pp, uint_t flags)
6071 {
6072         ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6073         pp->p_nrm |= (uchar_t)flags;
6074 }
6075 
6076 void
6077 page_clr_all_props(page_t *pp)
6078 {
6079         pp->p_nrm = 0;
6080 }
6081 
6082 /*
6083  * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6084  */
6085 int
6086 page_clear_lck_cow(page_t *pp, int adjust)
6087 {
6088         int     f_amount;
6089 
6090         ASSERT(PAGE_EXCL(pp));
6091 
6092         /*
6093          * The page_struct_lock need not be acquired here since
6094          * we require the caller hold the page exclusively locked.
6095          */
6096         f_amount = 0;
6097         if (pp->p_lckcnt) {
6098                 f_amount = 1;
6099                 pp->p_lckcnt = 0;
6100         }
6101         if (pp->p_cowcnt) {
6102                 f_amount += pp->p_cowcnt;
6103                 pp->p_cowcnt = 0;
6104         }
6105 
6106         if (adjust && f_amount) {
6107                 mutex_enter(&freemem_lock);
6108                 availrmem += f_amount;
6109                 mutex_exit(&freemem_lock);
6110         }
6111 
6112         return (f_amount);
6113 }
6114 
6115 /*
6116  * The following functions are called from free_vp_pages()
6117  * for an inexact estimate of a newly free'd page...
6118  */
6119 ulong_t
6120 page_share_cnt(page_t *pp)
6121 {
6122         return (hat_page_getshare(pp));
6123 }
6124 
6125 int
6126 page_isshared(page_t *pp)
6127 {
6128         return (hat_page_checkshare(pp, 1));
6129 }
6130 
6131 int
6132 page_isfree(page_t *pp)
6133 {
6134         return (PP_ISFREE(pp));
6135 }
6136 
6137 int
6138 page_isref(page_t *pp)
6139 {
6140         return (hat_page_getattr(pp, P_REF));
6141 }
6142 
6143 int
6144 page_ismod(page_t *pp)
6145 {
6146         return (hat_page_getattr(pp, P_MOD));
6147 }
6148 
6149 /*
6150  * The following code all currently relates to the page capture logic:
6151  *
6152  * This logic is used for cases where there is a desire to claim a certain
6153  * physical page in the system for the caller.  As it may not be possible
6154  * to capture the page immediately, the p_toxic bits are used in the page
6155  * structure to indicate that someone wants to capture this page.  When the
6156  * page gets unlocked, the toxic flag will be noted and an attempt to capture
6157  * the page will be made.  If it is successful, the original caller's callback
6158  * will be called with the page to do with it what they please.
6159  *
6160  * There is also an async thread which occasionally wakes up to attempt to
6161  * capture pages which have the capture bit set.  All of the pages which
6162  * need to be captured asynchronously have been inserted into the
6163  * page_capture_hash and thus this thread walks that hash list.  Items in the
6164  * hash have an expiration time so this thread handles that as well by removing
6165  * the item from the hash if it has expired.
6166  *
6167  * Some important things to note are:
6168  * - if the PR_CAPTURE bit is set on a page, then the page is in the
6169  *   page_capture_hash.  The page_capture_hash_head.pchh_mutex is needed
6170  *   to set and clear this bit, and entries may only be added to or
6171  *   removed from the hash while that lock is held.
6172  * - the PR_CAPTURE bit can only be set and cleared while holding the
6173  *   page_capture_hash_head.pchh_mutex
6174  * - the t_flag field of the thread struct is used with the T_CAPTURING
6175  *   flag to prevent recursion while dealing with large pages.
6176  * - pages which need to be retired never expire on the page_capture_hash.
6177  */
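
/*
 * Illustrative sketch (not part of the build): the expected calling
 * pattern for a page capture consumer.  The callback index, flag,
 * duration and callback body below are hypothetical; real consumers
 * (page retire, physmem) define their own, with the flag corresponding
 * to (1 << index).  The callback is invoked with the page held SE_EXCL
 * and is responsible for the page from then on; the common failure path
 * is simply a page_free().
 *
 *	static int
 *	my_capture_cb(page_t *pp, void *datap, uint_t flags)
 *	{
 *		// claim pp for our own use, or release it and return non-zero
 *		return (0);
 *	}
 *
 *	page_capture_register_callback(MY_CB_INDEX, my_duration,
 *	    my_capture_cb);
 *	...
 *	if (page_trycapture(pp, 0, MY_CB_FLAG, my_datap) != 0) {
 *		// the request was queued on page_capture_hash (EAGAIN) or
 *		// rejected outright; the async thread may retry it later
 *	}
 */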
6178 
6179 static void page_capture_thread(void);
6180 static kthread_t *pc_thread_id;
6181 kcondvar_t pc_cv;
6182 static kmutex_t pc_thread_mutex;
6183 static clock_t pc_thread_shortwait;
6184 static clock_t pc_thread_longwait;
6185 static int pc_thread_retry;
6186 
6187 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6188 
6189 /* Note that this is a circular linked list */
6190 typedef struct page_capture_hash_bucket {
6191         page_t *pp;
6192         uchar_t szc;
6193         uchar_t pri;
6194         uint_t flags;
6195         clock_t expires;        /* lbolt at which this request expires. */
6196         void *datap;            /* Cached data passed in for callback */
6197         struct page_capture_hash_bucket *next;
6198         struct page_capture_hash_bucket *prev;
6199 } page_capture_hash_bucket_t;
6200 
6201 #define PC_PRI_HI       0       /* capture now */
6202 #define PC_PRI_LO       1       /* capture later */
6203 #define PC_NUM_PRI      2
6204 
6205 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6206 
6207 
6208 /*
6209  * Each hash bucket will have its own mutex and two lists which are:
6210  * active (0):  represents requests which have not been processed by
6211  *              the page_capture async thread yet.
6212  * walked (1):  represents requests which have been processed by the
6213  *              page_capture async thread within its given walk of this bucket.
6214  *
6215  * These are all needed so that we can synchronize all async page_capture
6216  * events.  When the async thread moves to a new bucket, it will append the
6217  * walked list to the active list and walk each item one at a time, moving it
6218  * from the active list to the walked list.  Thus if there is an async request
6219  * outstanding for a given page, it will always be in one of the two lists.
6220  * New requests will always be added to the active list.
6221  * If we were not able to capture a page before the request expired, we'd free
6222  * up the request structure which would indicate to page_capture that there is
6223  * no longer a need for the given page, and clear the PR_CAPTURE flag if
6224  * possible.
6225  */
6226 typedef struct page_capture_hash_head {
6227         kmutex_t pchh_mutex;
6228         uint_t num_pages[PC_NUM_PRI];
6229         page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6230 } page_capture_hash_head_t;
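
/*
 * Illustrative sketch (not part of the build): inserting a new request bp
 * at the head of the active list of bucket h, as page_capture_add_hash()
 * does below.  Both lists are circular with the embedded lists[] elements
 * acting as sentinels, so an empty list is one whose sentinel points at
 * itself.
 *
 *	mutex_enter(&h->pchh_mutex);
 *	bp->next = h->lists[0].next;
 *	bp->prev = &h->lists[0];
 *	bp->next->prev = bp;
 *	h->lists[0].next = bp;
 *	h->num_pages[bp->pri]++;
 *	mutex_exit(&h->pchh_mutex);
 */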
6231 
6232 #ifdef DEBUG
6233 #define NUM_PAGE_CAPTURE_BUCKETS 4
6234 #else
6235 #define NUM_PAGE_CAPTURE_BUCKETS 64
6236 #endif
6237 
6238 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6239 
6240 /* for now use a very simple hash based upon the size of a page struct */
6241 #define PAGE_CAPTURE_HASH(pp)   \
6242         ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
6243 
6244 extern pgcnt_t swapfs_minfree;
6245 
6246 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6247 
6248 /*
6249  * a callback function is required for page capture requests.
6250  */
6251 void
6252 page_capture_register_callback(uint_t index, clock_t duration,
6253     int (*cb_func)(page_t *, void *, uint_t))
6254 {
6255         ASSERT(pc_cb[index].cb_active == 0);
6256         ASSERT(cb_func != NULL);
6257         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6258         pc_cb[index].duration = duration;
6259         pc_cb[index].cb_func = cb_func;
6260         pc_cb[index].cb_active = 1;
6261         rw_exit(&pc_cb[index].cb_rwlock);
6262 }
6263 
6264 void
6265 page_capture_unregister_callback(uint_t index)
6266 {
6267         int i, j;
6268         struct page_capture_hash_bucket *bp1;
6269         struct page_capture_hash_bucket *bp2;
6270         struct page_capture_hash_bucket *head = NULL;
6271         uint_t flags = (1 << index);
6272 
6273         rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6274         ASSERT(pc_cb[index].cb_active == 1);
6275         pc_cb[index].duration = 0;      /* Paranoia */
6276         pc_cb[index].cb_func = NULL;    /* Paranoia */
6277         pc_cb[index].cb_active = 0;
6278         rw_exit(&pc_cb[index].cb_rwlock);
6279 
6280         /*
6281          * Just move all the entries to a private list which we can walk
6282          * through without the need to hold any locks.
6283          * No more requests can get added to the hash lists for this consumer
6284          * as the cb_active field for the callback has been cleared.
6285          */
6286         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6287                 mutex_enter(&page_capture_hash[i].pchh_mutex);
6288                 for (j = 0; j < 2; j++) {
6289                         bp1 = page_capture_hash[i].lists[j].next;
6290                         /* walk through all but first (sentinel) element */
6291                         while (bp1 != &page_capture_hash[i].lists[j]) {
6292                                 bp2 = bp1;
6293                                 if (bp2->flags & flags) {
6294                                         bp1 = bp2->next;
6295                                         bp1->prev = bp2->prev;
6296                                         bp2->prev->next = bp1;
6297                                         bp2->next = head;
6298                                         head = bp2;
6299                                         /*
6300                                          * Clear the PR_CAPTURE bit as we
6301                                          * hold appropriate locks here.
6302                                          */
6303                                         page_clrtoxic(head->pp, PR_CAPTURE);
6304                                         page_capture_hash[i].
6305                                             num_pages[bp2->pri]--;
6306                                         continue;
6307                                 }
6308                                 bp1 = bp1->next;
6309                         }
6310                 }
6311                 mutex_exit(&page_capture_hash[i].pchh_mutex);
6312         }
6313 
6314         while (head != NULL) {
6315                 bp1 = head;
6316                 head = head->next;
6317                 kmem_free(bp1, sizeof (*bp1));
6318         }
6319 }
6320 
6321 
6322 /*
6323  * Find pp in the active list and move it to the walked list if it
6324  * exists.
6325  * Note that most often pp should be at the front of the active list
6326  * as it is currently used and thus there is no other sort of optimization
6327  * being done here as this is a linked list data structure.
6328  * Returns 1 on successful move or 0 if page could not be found.
6329  */
6330 static int
6331 page_capture_move_to_walked(page_t *pp)
6332 {
6333         page_capture_hash_bucket_t *bp;
6334         int index;
6335 
6336         index = PAGE_CAPTURE_HASH(pp);
6337 
6338         mutex_enter(&page_capture_hash[index].pchh_mutex);
6339         bp = page_capture_hash[index].lists[0].next;
6340         while (bp != &page_capture_hash[index].lists[0]) {
6341                 if (bp->pp == pp) {
6342                         /* Remove from old list */
6343                         bp->next->prev = bp->prev;
6344                         bp->prev->next = bp->next;
6345 
6346                         /* Add to new list */
6347                         bp->next = page_capture_hash[index].lists[1].next;
6348                         bp->prev = &page_capture_hash[index].lists[1];
6349                         page_capture_hash[index].lists[1].next = bp;
6350                         bp->next->prev = bp;
6351 
6352                         /*
6353                          * There is a small probability that a page on a
6354                          * free list is retired while being allocated,
6355                          * before P_RAF is set on it. Such a page may end
6356                          * up marked as a high priority request instead of
6357                          * a low priority one. If a P_RAF page is not
6358                          * marked as a low priority request, change it to
6359                          * one here.
6360                          */
6361                         page_capture_hash[index].num_pages[bp->pri]--;
6362                         bp->pri = PAGE_CAPTURE_PRIO(pp);
6363                         page_capture_hash[index].num_pages[bp->pri]++;
6364                         mutex_exit(&page_capture_hash[index].pchh_mutex);
6365                         return (1);
6366                 }
6367                 bp = bp->next;
6368         }
6369         mutex_exit(&page_capture_hash[index].pchh_mutex);
6370         return (0);
6371 }
6372 
6373 /*
6374  * Add a new entry to the page capture hash.  The only case where a new
6375  * entry is not added is when the page capture consumer is no longer registered.
6376  * In this case, we'll silently not add the page to the hash.  We know that
6377  * page retire will always be registered for the case where we are currently
6378  * unretiring a page and thus there are no conflicts.
6379  */
6380 static void
6381 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6382 {
6383         page_capture_hash_bucket_t *bp1;
6384         page_capture_hash_bucket_t *bp2;
6385         int index;
6386         int cb_index;
6387         int i;
6388         uchar_t pri;
6389 #ifdef DEBUG
6390         page_capture_hash_bucket_t *tp1;
6391         int l;
6392 #endif
6393 
6394         ASSERT(!(flags & CAPTURE_ASYNC));
6395 
6396         bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6397 
6398         bp1->pp = pp;
6399         bp1->szc = szc;
6400         bp1->flags = flags;
6401         bp1->datap = datap;
6402 
6403         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6404                 if ((flags >> cb_index) & 1) {
6405                         break;
6406                 }
6407         }
6408 
6409         ASSERT(cb_index != PC_NUM_CALLBACKS);
6410 
6411         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6412         if (pc_cb[cb_index].cb_active) {
6413                 if (pc_cb[cb_index].duration == -1) {
6414                         bp1->expires = (clock_t)-1;
6415                 } else {
6416                         bp1->expires = ddi_get_lbolt() +
6417                             pc_cb[cb_index].duration;
6418                 }
6419         } else {
6420                 /* There's no callback registered so don't add to the hash */
6421                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6422                 kmem_free(bp1, sizeof (*bp1));
6423                 return;
6424         }
6425 
6426         index = PAGE_CAPTURE_HASH(pp);
6427 
6428         /*
6429          * Only allow capture flag to be modified under this mutex.
6430          * Prevents multiple entries for same page getting added.
6431          */
6432         mutex_enter(&page_capture_hash[index].pchh_mutex);
6433 
6434         /*
6435          * if not already on the hash, set capture bit and add to the hash
6436          */
6437         if (!(pp->p_toxic & PR_CAPTURE)) {
6438 #ifdef DEBUG
6439                 /* Check for duplicate entries */
6440                 for (l = 0; l < 2; l++) {
6441                         tp1 = page_capture_hash[index].lists[l].next;
6442                         while (tp1 != &page_capture_hash[index].lists[l]) {
6443                                 if (tp1->pp == pp) {
6444                                         panic("page pp 0x%p already on hash "
6445                                             "at 0x%p\n",
6446                                             (void *)pp, (void *)tp1);
6447                                 }
6448                                 tp1 = tp1->next;
6449                         }
6450                 }
6451 
6452 #endif
6453                 page_settoxic(pp, PR_CAPTURE);
6454                 pri = PAGE_CAPTURE_PRIO(pp);
6455                 bp1->pri = pri;
6456                 bp1->next = page_capture_hash[index].lists[0].next;
6457                 bp1->prev = &page_capture_hash[index].lists[0];
6458                 bp1->next->prev = bp1;
6459                 page_capture_hash[index].lists[0].next = bp1;
6460                 page_capture_hash[index].num_pages[pri]++;
6461                 if (flags & CAPTURE_RETIRE) {
6462                         page_retire_incr_pend_count(datap);
6463                 }
6464                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6465                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6466                 cv_signal(&pc_cv);
6467                 return;
6468         }
6469 
6470         /*
6471          * A page retire request will replace any other request.
6472          * A second physmem request which is for a different process than
6473          * the currently registered one will be dropped as there is
6474          * no way to hold the private data for both calls.
6475          * In the future, once there are more callers, this will have to
6476          * be worked out better as there needs to be private storage for
6477          * at least each type of caller (maybe have datap be an array of
6478          * *void's so that we can index based upon callers index).
6479          */
6480 
6481         /* walk hash list to update expire time */
6482         for (i = 0; i < 2; i++) {
6483                 bp2 = page_capture_hash[index].lists[i].next;
6484                 while (bp2 != &page_capture_hash[index].lists[i]) {
6485                         if (bp2->pp == pp) {
6486                                 if (flags & CAPTURE_RETIRE) {
6487                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6488                                                 page_retire_incr_pend_count(
6489                                                     datap);
6490                                                 bp2->flags = flags;
6491                                                 bp2->expires = bp1->expires;
6492                                                 bp2->datap = datap;
6493                                         }
6494                                 } else {
6495                                         ASSERT(flags & CAPTURE_PHYSMEM);
6496                                         if (!(bp2->flags & CAPTURE_RETIRE) &&
6497                                             (datap == bp2->datap)) {
6498                                                 bp2->expires = bp1->expires;
6499                                         }
6500                                 }
6501                                 mutex_exit(&page_capture_hash[index].
6502                                     pchh_mutex);
6503                                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6504                                 kmem_free(bp1, sizeof (*bp1));
6505                                 return;
6506                         }
6507                         bp2 = bp2->next;
6508                 }
6509         }
6510 
6511         /*
6512          * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6513          * and thus it either has to be set or not set and can't change
6514          * while holding the mutex above.
6515          */
6516         panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6517             (void *)pp);
6518 }
6519 
6520 /*
6521  * We have a page in our hands, let's try to make it ours by turning
6522  * it into a clean page like it had just come off the freelists.
6523  *
6524  * Returns 0 on success, with the page still EXCL locked.
6525  * On failure, the page will be unlocked, and returns EAGAIN
6526  */
6527 static int
6528 page_capture_clean_page(page_t *pp)
6529 {
6530         page_t *newpp;
6531         int skip_unlock = 0;
6532         spgcnt_t count;
6533         page_t *tpp;
6534         int ret = 0;
6535         int extra;
6536 
6537         ASSERT(PAGE_EXCL(pp));
6538         ASSERT(!PP_RETIRED(pp));
6539         ASSERT(curthread->t_flag & T_CAPTURING);
6540 
6541         if (PP_ISFREE(pp)) {
6542                 if (!page_reclaim(pp, NULL)) {
6543                         skip_unlock = 1;
6544                         ret = EAGAIN;
6545                         goto cleanup;
6546                 }
6547                 ASSERT(pp->p_szc == 0);
6548                 if (pp->p_vnode != NULL) {
6549                         /*
6550                          * Since this page came from the
6551                          * cachelist, we must destroy the
6552                          * old vnode association.
6553                          */
6554                         page_hashout(pp, NULL);
6555                 }
6556                 goto cleanup;
6557         }
6558 
6559         /*
6560          * If we know page_relocate will fail, skip it
6561          * It could still fail due to a UE on another page but we
6562          * can't do anything about that.
6563          */
6564         if (pp->p_toxic & PR_UE) {
6565                 goto skip_relocate;
6566         }
6567 
6568         /*
6569          * It's possible for a page to have no vnode, as fsflush comes
6570          * through and cleans up these pages.  It's ugly but that's how it is.
6571          */
6572         if (pp->p_vnode == NULL) {
6573                 goto skip_relocate;
6574         }
6575 
6576         /*
6577          * Page was not free, so let's try to relocate it.
6578          * page_relocate only works with root pages, so if this is not a root
6579          * page, we need to demote it to try and relocate it.
6580          * Unfortunately this is the best we can do right now.
6581          */
6582         newpp = NULL;
6583         if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6584                 if (page_try_demote_pages(pp) == 0) {
6585                         ret = EAGAIN;
6586                         goto cleanup;
6587                 }
6588         }
6589         ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6590         if (ret == 0) {
6591                 page_t *npp;
6592                 /* unlock the new page(s) */
6593                 while (count-- > 0) {
6594                         ASSERT(newpp != NULL);
6595                         npp = newpp;
6596                         page_sub(&newpp, npp);
6597                         page_unlock(npp);
6598                 }
6599                 ASSERT(newpp == NULL);
6600                 /*
6601                  * Check to see if the page we have is too large.
6602                  * If so, demote it freeing up the extra pages.
6603                  */
6604                 if (pp->p_szc > 0) {
6605                         /* For now demote extra pages to szc == 0 */
6606                         extra = page_get_pagecnt(pp->p_szc) - 1;
6607                         while (extra > 0) {
6608                                 tpp = pp->p_next;
6609                                 page_sub(&pp, tpp);
6610                                 tpp->p_szc = 0;
6611                                 page_free(tpp, 1);
6612                                 extra--;
6613                         }
6614                         /* Make sure to set our page to szc 0 as well */
6615                         ASSERT(pp->p_next == pp && pp->p_prev == pp);
6616                         pp->p_szc = 0;
6617                 }
6618                 goto cleanup;
6619         } else if (ret == EIO) {
6620                 ret = EAGAIN;
6621                 goto cleanup;
6622         } else {
6623                 /*
6624                  * Need to reset return type as we failed to relocate the page
6625                  * but that does not mean that some of the next steps will not
6626                  * work.
6627                  */
6628                 ret = 0;
6629         }
6630 
6631 skip_relocate:
6632 
6633         if (pp->p_szc > 0) {
6634                 if (page_try_demote_pages(pp) == 0) {
6635                         ret = EAGAIN;
6636                         goto cleanup;
6637                 }
6638         }
6639 
6640         ASSERT(pp->p_szc == 0);
6641 
6642         if (hat_ismod(pp)) {
6643                 ret = EAGAIN;
6644                 goto cleanup;
6645         }
6646         if (PP_ISKAS(pp)) {
6647                 ret = EAGAIN;
6648                 goto cleanup;
6649         }
6650         if (pp->p_lckcnt || pp->p_cowcnt) {
6651                 ret = EAGAIN;
6652                 goto cleanup;
6653         }
6654 
6655         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6656         ASSERT(!hat_page_is_mapped(pp));
6657 
6658         if (hat_ismod(pp)) {
6659                 /*
6660                  * This is a semi-odd case as the page is now modified but not
6661                  * mapped as we just unloaded the mappings above.
6662                  */
6663                 ret = EAGAIN;
6664                 goto cleanup;
6665         }
6666         if (pp->p_vnode != NULL) {
6667                 page_hashout(pp, NULL);
6668         }
6669 
6670         /*
6671          * At this point, the page should be in a clean state and
6672          * we can do whatever we want with it.
6673          */
6674 
6675 cleanup:
6676         if (ret != 0) {
6677                 if (!skip_unlock) {
6678                         page_unlock(pp);
6679                 }
6680         } else {
6681                 ASSERT(pp->p_szc == 0);
6682                 ASSERT(PAGE_EXCL(pp));
6683 
6684                 pp->p_next = pp;
6685                 pp->p_prev = pp;
6686         }
6687         return (ret);
6688 }
6689 
6690 /*
6691  * Various callers of page_trycapture() can have different restrictions upon
6692  * what memory they have access to.
6693  * Returns 0 on success, with the following error codes on failure:
6694  *      EPERM - The requested page is long term locked, and thus repeated
6695  *              requests to capture this page will likely fail.
6696  *      ENOMEM - There was not enough free memory in the system to safely
6697  *              map the requested page.
6698  *      ENOENT - The requested page was inside the kernel cage, and the
6699  *              PHYSMEM_CAGE flag was not set.
6700  */
6701 int
6702 page_capture_pre_checks(page_t *pp, uint_t flags)
6703 {
6704         ASSERT(pp != NULL);
6705 
6706 #if defined(__sparc)
6707         if (pp->p_vnode == &promvp) {
6708                 return (EPERM);
6709         }
6710 
6711         if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6712             (flags & CAPTURE_PHYSMEM)) {
6713                 return (ENOENT);
6714         }
6715 
6716         if (PP_ISNORELOCKERNEL(pp)) {
6717                 return (EPERM);
6718         }
6719 #else
6720         if (PP_ISKAS(pp)) {
6721                 return (EPERM);
6722         }
6723 #endif /* __sparc */
6724 
6725         /* only physmem currently has the restrictions checked below */
6726         if (!(flags & CAPTURE_PHYSMEM)) {
6727                 return (0);
6728         }
6729 
6730         if (availrmem < swapfs_minfree) {
6731                 /*
6732                  * We won't try to capture this page as we are
6733                  * running low on memory.
6734                  */
6735                 return (ENOMEM);
6736         }
6737         return (0);
6738 }
6739 
6740 /*
6741  * Once we have a page in our mitts, go ahead and complete the capture
6742  * operation.
6743  * Returns 1 on failure where page is no longer needed
6744  * Returns 0 on success
6745  * Returns -1 if there was a transient failure.
6746  * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6747  */
6748 int
6749 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6750 {
6751         int cb_index;
6752         int ret = 0;
6753         page_capture_hash_bucket_t *bp1;
6754         page_capture_hash_bucket_t *bp2;
6755         int index;
6756         int found = 0;
6757         int i;
6758 
6759         ASSERT(PAGE_EXCL(pp));
6760         ASSERT(curthread->t_flag & T_CAPTURING);
6761 
6762         for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6763                 if ((flags >> cb_index) & 1) {
6764                         break;
6765                 }
6766         }
6767         ASSERT(cb_index < PC_NUM_CALLBACKS);
6768 
6769         /*
6770          * Remove the entry from the page_capture hash, but don't free it yet
6771          * as we may need to put it back.
6772          * Since we own the page at this point in time, we should find it
6773          * in the hash if this is an ASYNC call.  If we don't it's likely
6774          * that the page_capture_async() thread decided that this request
6775          * had expired, in which case we just continue on.
6776          */
6777         if (flags & CAPTURE_ASYNC) {
6778 
6779                 index = PAGE_CAPTURE_HASH(pp);
6780 
6781                 mutex_enter(&page_capture_hash[index].pchh_mutex);
6782                 for (i = 0; i < 2 && !found; i++) {
6783                         bp1 = page_capture_hash[index].lists[i].next;
6784                         while (bp1 != &page_capture_hash[index].lists[i]) {
6785                                 if (bp1->pp == pp) {
6786                                         bp1->next->prev = bp1->prev;
6787                                         bp1->prev->next = bp1->next;
6788                                         page_capture_hash[index].
6789                                             num_pages[bp1->pri]--;
6790                                         page_clrtoxic(pp, PR_CAPTURE);
6791                                         found = 1;
6792                                         break;
6793                                 }
6794                                 bp1 = bp1->next;
6795                         }
6796                 }
6797                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6798         }
6799 
6800         /* Synchronize with the unregister func. */
6801         rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6802         if (!pc_cb[cb_index].cb_active) {
6803                 page_free(pp, 1);
6804                 rw_exit(&pc_cb[cb_index].cb_rwlock);
6805                 if (found) {
6806                         kmem_free(bp1, sizeof (*bp1));
6807                 }
6808                 return (1);
6809         }
6810 
6811         /*
6812          * We need to remove the entry from the page capture hash and turn off
6813          * the PR_CAPTURE bit before calling the callback.  We'll need to cache
6814          * the entry here, and then based upon the return value, cleanup
6815          * appropriately or re-add it to the hash, making sure that someone else
6816          * hasn't already done so.
6817          * It should be rare for the callback to fail and thus it's ok for
6818          * the failure path to be a bit complicated as the success path is
6819          * cleaner and the locking rules are easier to follow.
6820          */
6821 
6822         ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6823 
6824         rw_exit(&pc_cb[cb_index].cb_rwlock);
6825 
6826         /*
6827          * If this was an ASYNC request, we need to cleanup the hash if the
6828          * callback was successful or if the request was no longer valid.
6829          * For non-ASYNC requests, we return failure to map and the caller
6830          * will take care of adding the request to the hash.
6831          * Note also that the callback itself is responsible for the page
6832          * at this point in time in terms of locking ...  The most common
6833          * case for the failure path should just be a page_free.
6834          */
6835         if (ret >= 0) {
6836                 if (found) {
6837                         if (bp1->flags & CAPTURE_RETIRE) {
6838                                 page_retire_decr_pend_count(datap);
6839                         }
6840                         kmem_free(bp1, sizeof (*bp1));
6841                 }
6842                 return (ret);
6843         }
6844         if (!found) {
6845                 return (ret);
6846         }
6847 
6848         ASSERT(flags & CAPTURE_ASYNC);
6849 
6850         /*
6851          * Check for expiration time first as we can just free it up if it's
6852          * expired.
6853          */
6854         if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6855                 kmem_free(bp1, sizeof (*bp1));
6856                 return (ret);
6857         }
6858 
6859         /*
6860          * The callback failed and there used to be an entry in the hash for
6861          * this page, so we need to add it back to the hash.
6862          */
6863         mutex_enter(&page_capture_hash[index].pchh_mutex);
6864         if (!(pp->p_toxic & PR_CAPTURE)) {
6865                 /* just add bp1 back to head of walked list */
6866                 page_settoxic(pp, PR_CAPTURE);
6867                 bp1->next = page_capture_hash[index].lists[1].next;
6868                 bp1->prev = &page_capture_hash[index].lists[1];
6869                 bp1->next->prev = bp1;
6870                 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6871                 page_capture_hash[index].lists[1].next = bp1;
6872                 page_capture_hash[index].num_pages[bp1->pri]++;
6873                 mutex_exit(&page_capture_hash[index].pchh_mutex);
6874                 return (ret);
6875         }
6876 
6877         /*
6878          * Otherwise there was a new capture request added to list
6879          * Need to make sure that our original data is represented if
6880          * appropriate.
6881          */
6882         for (i = 0; i < 2; i++) {
6883                 bp2 = page_capture_hash[index].lists[i].next;
6884                 while (bp2 != &page_capture_hash[index].lists[i]) {
6885                         if (bp2->pp == pp) {
6886                                 if (bp1->flags & CAPTURE_RETIRE) {
6887                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6888                                                 bp2->szc = bp1->szc;
6889                                                 bp2->flags = bp1->flags;
6890                                                 bp2->expires = bp1->expires;
6891                                                 bp2->datap = bp1->datap;
6892                                         }
6893                                 } else {
6894                                         ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6895                                         if (!(bp2->flags & CAPTURE_RETIRE)) {
6896                                                 bp2->szc = bp1->szc;
6897                                                 bp2->flags = bp1->flags;
6898                                                 bp2->expires = bp1->expires;
6899                                                 bp2->datap = bp1->datap;
6900                                         }
6901                                 }
6902                                 page_capture_hash[index].num_pages[bp2->pri]--;
6903                                 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6904                                 page_capture_hash[index].num_pages[bp2->pri]++;
6905                                 mutex_exit(&page_capture_hash[index].
6906                                     pchh_mutex);
6907                                 kmem_free(bp1, sizeof (*bp1));
6908                                 return (ret);
6909                         }
6910                         bp2 = bp2->next;
6911                 }
6912         }
6913         panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6914         /*NOTREACHED*/
6915 }
6916 
6917 /*
6918  * Try to capture the given page for the caller specified in the flags
6919  * parameter.  The page will either be captured and handed over to the
6920  * appropriate callback, or will be queued up in the page capture hash
6921  * to be captured asynchronously.
6922  * If the current request is due to an async capture, the page must be
6923  * exclusively locked before calling this function.
6924  * Currently szc must be 0 but in the future this should be expandable to
6925  * other page sizes.
6926  * Returns 0 on success, with the following error codes on failure:
6927  *      EPERM - The requested page is long term locked, and thus repeated
6928  *              requests to capture this page will likely fail.
6929  *      ENOMEM - There was not enough free memory in the system to safely
6930  *              map the requested page.
6931  *      ENOENT - The requested page was inside the kernel cage, and the
6932  *              CAPTURE_GET_CAGE flag was not set.
6933  *      EAGAIN - The requested page could not be captured at this point in
6934  *              time but future requests will likely work.
6935  *      EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6936  *              was not set.
6937  */
6938 int
6939 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6940 {
6941         int ret;
6942         int cb_index;
6943 
6944         if (flags & CAPTURE_ASYNC) {
6945                 ASSERT(PAGE_EXCL(pp));
6946                 goto async;
6947         }
6948 
6949         /* Make sure there's enough availrmem ... */
6950         ret = page_capture_pre_checks(pp, flags);
6951         if (ret != 0) {
6952                 return (ret);
6953         }
6954 
6955         if (!page_trylock(pp, SE_EXCL)) {
6956                 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6957                         if ((flags >> cb_index) & 1) {
6958                                 break;
6959                         }
6960                 }
6961                 ASSERT(cb_index < PC_NUM_CALLBACKS);
6962                 ret = EAGAIN;
6963                 /* Special case for retired pages */
6964                 if (PP_RETIRED(pp)) {
6965                         if (flags & CAPTURE_GET_RETIRED) {
6966                                 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6967                                         /*
6968                                          * Need to set capture bit and add to
6969                                          * hash so that the page will be
6970                                          * retired when freed.
6971                                          */
6972                                         page_capture_add_hash(pp, szc,
6973                                             CAPTURE_RETIRE, NULL);
6974                                         ret = 0;
6975                                         goto own_page;
6976                                 }
6977                         } else {
6978                                 return (EBUSY);
6979                         }
6980                 }
6981                 page_capture_add_hash(pp, szc, flags, datap);
6982                 return (ret);
6983         }
6984 
6985 async:
6986         ASSERT(PAGE_EXCL(pp));
6987 
6988         /* For physmem async requests, check that availrmem is still sane */
6989         if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6990             (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6991             (availrmem < swapfs_minfree)) {
6992                 page_unlock(pp);
6993                 return (ENOMEM);
6994         }
6995 
6996         ret = page_capture_clean_page(pp);
6997 
6998         if (ret != 0) {
6999                 /* We failed to get the page, so let's add it to the hash */
7000                 if (!(flags & CAPTURE_ASYNC)) {
7001                         page_capture_add_hash(pp, szc, flags, datap);
7002                 }
7003                 return (ret);
7004         }
7005 
7006 own_page:
7007         ASSERT(PAGE_EXCL(pp));
7008         ASSERT(pp->p_szc == 0);
7009 
7010         /* Call the callback */
7011         ret = page_capture_take_action(pp, flags, datap);
7012 
7013         if (ret == 0) {
7014                 return (0);
7015         }
7016 
7017         /*
7018          * Note that in the failure cases from page_capture_take_action, the
7019          * EXCL lock will have already been dropped.
7020          */
7021         if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7022                 page_capture_add_hash(pp, szc, flags, datap);
7023         }
7024         return (EAGAIN);
7025 }
7026 
7027 int
7028 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7029 {
7030         int ret;
7031 
7032         curthread->t_flag |= T_CAPTURING;
7033         ret = page_itrycapture(pp, szc, flags, datap);
7034         curthread->t_flag &= ~T_CAPTURING; /* safe to clear as we know it's set */
7035         return (ret);
7036 }
7037 
7038 /*
7039  * When unlocking a page which has the PR_CAPTURE bit set, this routine
7040  * gets called to try and capture the page.
7041  */
7042 void
7043 page_unlock_capture(page_t *pp)
7044 {
7045         page_capture_hash_bucket_t *bp;
7046         int index;
7047         int i;
7048         uint_t szc;
7049         uint_t flags = 0;
7050         void *datap;
7051         kmutex_t *mp;
7052         extern vnode_t retired_pages;
7053 
7054         /*
7055          * We need to protect against a possible deadlock here where we own
7056          * the vnode page hash mutex and want to acquire it again as there
7057          * are locations in the code, where we unlock a page while holding
7058          * the mutex which can lead to the page being captured and eventually
7059          * end up here.  As we may be hashing out the old page and hashing into
7060          * the retire vnode, we need to make sure we don't own them.
7061          * Other callbacks that do hash operations also need to make sure that
7062          * before they hashin to a vnode that they do not currently own the
7063          * vphm mutex otherwise there will be a panic.
7064          */
7065         if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7066                 page_unlock_nocapture(pp);
7067                 return;
7068         }
7069         if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7070                 page_unlock_nocapture(pp);
7071                 return;
7072         }
7073 
7074         index = PAGE_CAPTURE_HASH(pp);
7075 
7076         mp = &page_capture_hash[index].pchh_mutex;
7077         mutex_enter(mp);
7078         for (i = 0; i < 2; i++) {
7079                 bp = page_capture_hash[index].lists[i].next;
7080                 while (bp != &page_capture_hash[index].lists[i]) {
7081                         if (bp->pp == pp) {
7082                                 szc = bp->szc;
7083                                 flags = bp->flags | CAPTURE_ASYNC;
7084                                 datap = bp->datap;
7085                                 mutex_exit(mp);
7086                                 (void) page_trycapture(pp, szc, flags, datap);
7087                                 return;
7088                         }
7089                         bp = bp->next;
7090                 }
7091         }
7092 
7093         /* Failed to find page in hash so clear flags and unlock it. */
7094         page_clrtoxic(pp, PR_CAPTURE);
7095         page_unlock(pp);
7096 
7097         mutex_exit(mp);
7098 }
7099 
7100 void
7101 page_capture_init()
7102 {
7103         int i;
7104         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7105                 page_capture_hash[i].lists[0].next =
7106                     &page_capture_hash[i].lists[0];
7107                 page_capture_hash[i].lists[0].prev =
7108                     &page_capture_hash[i].lists[0];
7109                 page_capture_hash[i].lists[1].next =
7110                     &page_capture_hash[i].lists[1];
7111                 page_capture_hash[i].lists[1].prev =
7112                     &page_capture_hash[i].lists[1];
7113         }
7114 
7115         pc_thread_shortwait = 23 * hz;
7116         pc_thread_longwait = 1201 * hz;
7117         pc_thread_retry = 3;
7118         mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7119         cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7120         pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7121             TS_RUN, minclsyspri);
7122 }
7123 
7124 /*
7125  * It is necessary to scrub any failing pages prior to reboot in order to
7126  * prevent a latent error trap from occurring on the next boot.
7127  */
7128 void
7129 page_retire_mdboot()
7130 {
7131         page_t *pp;
7132         int i, j;
7133         page_capture_hash_bucket_t *bp;
7134         uchar_t pri;
7135 
7136         /* walk lists looking for pages to scrub */
7137         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7138                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7139                         if (page_capture_hash[i].num_pages[pri] != 0) {
7140                                 break;
7141                         }
7142                 }
7143                 if (pri == PC_NUM_PRI)
7144                         continue;
7145 
7146                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7147 
7148                 for (j = 0; j < 2; j++) {
7149                         bp = page_capture_hash[i].lists[j].next;
7150                         while (bp != &page_capture_hash[i].lists[j]) {
7151                                 pp = bp->pp;
7152                                 if (PP_TOXIC(pp)) {
7153                                         if (page_trylock(pp, SE_EXCL)) {
7154                                                 PP_CLRFREE(pp);
7155                                                 pagescrub(pp, 0, PAGESIZE);
7156                                                 page_unlock(pp);
7157                                         }
7158                                 }
7159                                 bp = bp->next;
7160                         }
7161                 }
7162                 mutex_exit(&page_capture_hash[i].pchh_mutex);
7163         }
7164 }
7165 
7166 /*
7167  * Walk the page_capture_hash trying to capture pages and also cleanup old
7168  * entries which have expired.
7169  */
7170 void
7171 page_capture_async()
7172 {
7173         page_t *pp;
7174         int i;
7175         int ret;
7176         page_capture_hash_bucket_t *bp1, *bp2;
7177         uint_t szc;
7178         uint_t flags;
7179         void *datap;
7180         uchar_t pri;
7181 
7182         /* If there are outstanding pages to be captured, get to work */
7183         for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7184                 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7185                         if (page_capture_hash[i].num_pages[pri] != 0)
7186                                 break;
7187                 }
7188                 if (pri == PC_NUM_PRI)
7189                         continue;
7190 
7191                 /* Append list 1 to list 0 and then walk through list 0 */
7192                 mutex_enter(&page_capture_hash[i].pchh_mutex);
7193                 bp1 = &page_capture_hash[i].lists[1];
7194                 bp2 = bp1->next;
7195                 if (bp1 != bp2) {
			bp1->prev->next = page_capture_hash[i].lists[0].next;
			bp2->prev = &page_capture_hash[i].lists[0];
			page_capture_hash[i].lists[0].next->prev = bp1->prev;
			page_capture_hash[i].lists[0].next = bp2;
			bp1->next = bp1;
			bp1->prev = bp1;
		}

		/* list[1] will be empty now */

		bp1 = page_capture_hash[i].lists[0].next;
		while (bp1 != &page_capture_hash[i].lists[0]) {
			/* Check expiration time */
			if ((ddi_get_lbolt() > bp1->expires &&
			    bp1->expires != -1) ||
			    page_deleted(bp1->pp)) {
				page_capture_hash[i].lists[0].next = bp1->next;
				bp1->next->prev =
				    &page_capture_hash[i].lists[0];
				page_capture_hash[i].num_pages[bp1->pri]--;

				/*
				 * We can safely remove the PR_CAPTURE bit
				 * without holding the EXCL lock on the page
				 * as the PR_CAPTURE bit requires that the
				 * page_capture_hash[].pchh_mutex be held
				 * to modify it.
				 */
				page_clrtoxic(bp1->pp, PR_CAPTURE);
				mutex_exit(&page_capture_hash[i].pchh_mutex);
				kmem_free(bp1, sizeof (*bp1));
				mutex_enter(&page_capture_hash[i].pchh_mutex);
				bp1 = page_capture_hash[i].lists[0].next;
				continue;
			}
			pp = bp1->pp;
			szc = bp1->szc;
			flags = bp1->flags;
			datap = bp1->datap;
			mutex_exit(&page_capture_hash[i].pchh_mutex);
			if (page_trylock(pp, SE_EXCL)) {
				ret = page_trycapture(pp, szc,
				    flags | CAPTURE_ASYNC, datap);
			} else {
				ret = 1;	/* move to walked hash */
			}

			if (ret != 0) {
				/* Move to walked hash */
				(void) page_capture_move_to_walked(pp);
			}
			mutex_enter(&page_capture_hash[i].pchh_mutex);
			bp1 = page_capture_hash[i].lists[0].next;
		}

		mutex_exit(&page_capture_hash[i].pchh_mutex);
	}
}

/*
 * This function is called by the page_capture_thread, and is needed
 * in order to initiate aio cleanup, so that pages used in aio
 * will be unlocked and subsequently retired by page_capture_thread.
 */
static int
do_aio_cleanup(void)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned = 0;

	if (modload("sys", "kaio") == -1) {
		cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
		return (0);
	}
	/*
	 * We use the aio_cleanup_dr_delete_memory function to
	 * initiate the actual clean up; this function will wake
	 * up the per-process aio_cleanup_thread.
	 */
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		cmn_err(CE_WARN,
		    "aio_cleanup_dr_delete_memory not found in kaio");
		return (0);
	}
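	/*
	 * Walk every active process; pidlock keeps the practive list
	 * stable while we walk it, and each process's p_lock protects
	 * its p_aio pointer.
	 */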
	mutex_enter(&pidlock);
	for (procp = practive; (procp != NULL); procp = procp->p_next) {
		mutex_enter(&procp->p_lock);
		if (procp->p_aio != NULL) {
			/* cleanup proc's outstanding kaio */
			cleaned += (*aio_cleanup_dr_delete_memory)(procp);
		}
		mutex_exit(&procp->p_lock);
	}
	mutex_exit(&pidlock);
	return (cleaned);
}

/*
 * helper function for page_capture_thread
 */
static void
page_capture_handle_outstanding(void)
{
	int ntry;

	/* Reap pages before attempting to capture pages */
	kmem_reap();

	if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
	    hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
		/*
		 * Note: Purging is only done for platforms that support
		 * ISM hat_pageunload() - mainly SPARC. On x86/x64
		 * platforms ISM pages remain SE_SHARED locked until
		 * destroyed.
		 */

		/* disable and purge seg_pcache */
		(void) seg_p_disable();
		for (ntry = 0; ntry < pc_thread_retry; ntry++) {
			if (!page_retire_pend_count())
				break;
			if (do_aio_cleanup()) {
				/*
				 * Allow the apps' cleanup threads
				 * to run.
				 */
				delay(pc_thread_shortwait);
			}
			page_capture_async();
		}
		/* reenable seg_pcache */
		seg_p_enable();

		/* We have completed what can be done; break out. */
		return;
	}

	/*
	 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
	 * and then attempt to capture.
	 */
	seg_preap();
	page_capture_async();
}

/*
 * The page_capture_thread loops forever, looking to see if there are
 * pages still waiting to be captured.
 */
static void
page_capture_thread(void)
{
	callb_cpr_t c;
	int i;
	int high_pri_pages;
	int low_pri_pages;
	clock_t timeout;

	CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");

	mutex_enter(&pc_thread_mutex);
	for (;;) {
		high_pri_pages = 0;
		low_pri_pages = 0;
		for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
			high_pri_pages +=
			    page_capture_hash[i].num_pages[PC_PRI_HI];
			low_pri_pages +=
			    page_capture_hash[i].num_pages[PC_PRI_LO];
		}
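		/*
		 * High priority (retire) requests get the short poll
		 * interval and a full capture pass; low priority requests
		 * only get an async walk of the hash.
		 */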

		timeout = pc_thread_longwait;
		if (high_pri_pages != 0) {
			timeout = pc_thread_shortwait;
			page_capture_handle_outstanding();
		} else if (low_pri_pages != 0) {
			page_capture_async();
		}
		CALLB_CPR_SAFE_BEGIN(&c);
		(void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
		    timeout, TR_CLOCK_TICK);
		CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
	}
	/*NOTREACHED*/
}

/*
 * Attempt to locate a bucket that has enough pages to satisfy the request.
 * The initial check is done without the lock to avoid unneeded contention.
 * The function returns 1 if enough pages were found, else 0 if it could not
 * find enough pages in a bucket.
 */
static int
pcf_decrement_bucket(pgcnt_t npages)
{
	struct pcf	*p;
	struct pcf	*q;
	int i;

	p = &pcf[PCF_INDEX()];
	q = &pcf[pcf_fanout];
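	/*
	 * Start at the bucket chosen by PCF_INDEX() and walk the pcf
	 * array as a ring, wrapping back to pcf[0] once p reaches q
	 * (one past the last bucket), so every bucket gets a chance.
	 */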
	for (i = 0; i < pcf_fanout; i++) {
		if (p->pcf_count > npages) {
			/*
			 * a good one to try.
			 */
			mutex_enter(&p->pcf_lock);
			if (p->pcf_count > npages) {
				p->pcf_count -= (uint_t)npages;
				/*
				 * freemem is not protected by any lock.
				 * Thus, we cannot have any assertion
				 * containing freemem here.
				 */
				freemem -= npages;
				mutex_exit(&p->pcf_lock);
				return (1);
			}
			mutex_exit(&p->pcf_lock);
		}
		p++;
		if (p >= q) {
			p = pcf;
		}
	}
	return (0);
}

/*
 * Arguments:
 *	pcftotal_ret:	If the value is not NULL and we have walked all the
 *			buckets but did not find enough pages, then it will
 *			be set to the total number of pages in all the pcf
 *			buckets.
 *	npages:		The number of pages we have been requested to
 *			find.
 *	unlock:		If set to 0 we will leave the buckets locked if the
 *			requested number of pages are not found.
 *
 * Go and try to satisfy the page request from any number of buckets.
 * This can be a very expensive operation as we have to lock the buckets
 * we are checking (and keep them locked), starting at bucket 0.
 *
 * The function returns 1 if enough pages were found, else 0 if it could not
 * find enough pages in the buckets.
 */
static int
pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
{
	struct pcf	*p;
	pgcnt_t pcftotal;
	int i;

	p = pcf;
	/* try to collect pages from several pcf bins */
	for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
		mutex_enter(&p->pcf_lock);
		pcftotal += p->pcf_count;
		if (pcftotal >= npages) {
			/*
			 * Wow!  There are enough pages lying around
			 * to satisfy the request.  Do the accounting,
			 * drop the locks we acquired, and go back.
			 *
			 * freemem is not protected by any lock. So,
			 * we cannot have any assertion containing
			 * freemem.
			 */
			freemem -= npages;
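			/*
			 * Walk back over the buckets locked so far,
			 * draining each one until the request is satisfied
			 * and dropping its lock as we go.
			 */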
			while (p >= pcf) {
				if (p->pcf_count <= npages) {
					npages -= p->pcf_count;
					p->pcf_count = 0;
				} else {
					p->pcf_count -= (uint_t)npages;
					npages = 0;
				}
				mutex_exit(&p->pcf_lock);
				p--;
			}
			ASSERT(npages == 0);
			return (1);
		}
		p++;
	}
	if (unlock) {
		/* failed to collect pages - release the locks */
		while (--p >= pcf) {
			mutex_exit(&p->pcf_lock);
		}
	}
	if (pcftotal_ret != NULL)
		*pcftotal_ret = pcftotal;
	return (0);
}
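
/*
 * Rough usage sketch (simplified; not the exact caller code): the
 * page_create*() allocation paths in this file pair these two helpers,
 * trying the cheap single-bucket path first and falling back to the
 * expensive multi-bucket walk only when that fails:
 *
 *	pgcnt_t pcftotal;
 *
 *	if (!pcf_decrement_bucket(npages)) {
 *		if (!pcf_decrement_multiple(&pcftotal, npages, 1)) {
 *			(not enough pages; throttle, wait, or fail)
 *		}
 *	}
 */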