1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /*
25 * Copyright (c) 2010, Intel Corporation.
26 * All rights reserved.
27 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
28 */
29
30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
31 /* All Rights Reserved */
32
33 /*
34 * Portions of this source code were derived from Berkeley 4.3 BSD
35 * under license from the Regents of the University of California.
36 */
37
38 /*
39 * UNIX machine dependent virtual memory support.
40 */
41
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/user.h>
46 #include <sys/proc.h>
47 #include <sys/kmem.h>
48 #include <sys/vmem.h>
49 #include <sys/buf.h>
50 #include <sys/cpuvar.h>
51 #include <sys/lgrp.h>
52 #include <sys/disp.h>
53 #include <sys/vm.h>
54 #include <sys/mman.h>
55 #include <sys/vnode.h>
56 #include <sys/cred.h>
57 #include <sys/exec.h>
58 #include <sys/exechdr.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/swap.h>
62 #include <sys/dumphdr.h>
63
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_kp.h>
68 #include <vm/seg_vn.h>
69 #include <vm/page.h>
70 #include <vm/seg_kmem.h>
71 #include <vm/seg_kpm.h>
72 #include <vm/vm_dep.h>
73
74 #include <sys/cpu.h>
75 #include <sys/vm_machparam.h>
76 #include <sys/memlist.h>
77 #include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
78 #include <vm/hat_i86.h>
79 #include <sys/x86_archext.h>
80 #include <sys/elf_386.h>
81 #include <sys/cmn_err.h>
82 #include <sys/archsystm.h>
83 #include <sys/machsystm.h>
84
85 #include <sys/vtrace.h>
86 #include <sys/ddidmareq.h>
87 #include <sys/promif.h>
88 #include <sys/memnode.h>
89 #include <sys/stack.h>
90 #include <util/qsort.h>
91 #include <sys/taskq.h>
92
93 #ifdef __xpv
94
95 #include <sys/hypervisor.h>
96 #include <sys/xen_mmu.h>
97 #include <sys/balloon_impl.h>
98
99 /*
100 * Domain 0 pages usable for DMA are pre-allocated and kept in
101 * distinct lists, ordered by increasing mfn.
102 */
103 static kmutex_t io_pool_lock;
104 static kmutex_t contig_list_lock;
105 static page_t *io_pool_4g; /* pool for 32 bit dma limited devices */
106 static page_t *io_pool_16m; /* pool for 24 bit dma limited legacy devices */
107 static long io_pool_cnt;
108 static long io_pool_cnt_max = 0;
109 #define DEFAULT_IO_POOL_MIN 128
110 static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
111 static long io_pool_cnt_lowater = 0;
112 static long io_pool_shrink_attempts; /* how many times did we try to shrink */
113 static long io_pool_shrinks; /* how many times did we really shrink */
114 static long io_pool_grows; /* how many times did we grow */
115 static mfn_t start_mfn = 1;
116 static caddr_t io_pool_kva; /* used to alloc pages when needed */
117
118 static int create_contig_pfnlist(uint_t);
119
120 /*
121 * percentage of phys mem to hold in the i/o pool
122 */
123 #define DEFAULT_IO_POOL_PCT 2
124 static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
125 static void page_io_pool_sub(page_t **, page_t *, page_t *);
126 int ioalloc_dbg = 0;
127
128 #endif /* __xpv */
129
130 uint_t vac_colors = 1;
131
132 int largepagesupport = 0;
133 extern uint_t page_create_new;
134 extern uint_t page_create_exists;
135 extern uint_t page_create_putbacks;
136 /*
137 * Allow users to disable the kernel's use of SSE.
138 */
139 extern int use_sse_pagecopy, use_sse_pagezero;
140
141 /*
142 * Combined memory ranges from mnode and memranges[] to manage a single
143 * mnode/mtype dimension in the page lists.
144 */
145 typedef struct {
146 pfn_t mnr_pfnlo;
147 pfn_t mnr_pfnhi;
148 int mnr_mnode;
149 int mnr_memrange; /* index into memranges[] */
150 int mnr_next; /* next lower PA mnoderange */
151 int mnr_exists;
152 /* maintain page list stats */
153 pgcnt_t mnr_mt_clpgcnt; /* cache list cnt */
154 pgcnt_t mnr_mt_flpgcnt[MMU_PAGE_SIZES]; /* free list cnt per szc */
155 pgcnt_t mnr_mt_totcnt; /* sum of cache and free lists */
156 #ifdef DEBUG
157 struct mnr_mts { /* mnode/mtype szc stats */
158 pgcnt_t mnr_mts_pgcnt;
159 int mnr_mts_colors;
160 pgcnt_t *mnr_mtsc_pgcnt;
161 } *mnr_mts;
162 #endif
163 } mnoderange_t;
164
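/*
 * MEMRANGELO/MEMRANGEHI give the inclusive pfn bounds of a memrange index
 * (index 0 is the highest pfn range; see arch_memranges below), and
 * MTYPE_FREEMEM is the total free page count of an mnoderange.
 */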
165 #define MEMRANGEHI(mtype) \
166 ((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
167 #define MEMRANGELO(mtype) (memranges[mtype])
168
169 #define MTYPE_FREEMEM(mt) (mnoderanges[mt].mnr_mt_totcnt)
170
171 /*
172 * As the PC architecture evolved, memory was clumped into several
173 * ranges for various historical I/O devices to do DMA.
174 * < 16Meg - ISA bus
175 * < 2Gig - ???
176 * < 4Gig - PCI bus or drivers that don't understand PAE mode
177 *
178 * These are listed in reverse order, so that we can skip over unused
179 * ranges on machines with small memories.
180 *
181 * For now under the Hypervisor, we'll only ever have one memrange.
182 */
183 #define PFN_4GIG 0x100000
184 #define PFN_16MEG 0x1000
185 /* Indices into the memory range (arch_memranges) array. */
186 #define MRI_4G 0
187 #define MRI_2G 1
188 #define MRI_16M 2
189 #define MRI_0 3
190 static pfn_t arch_memranges[NUM_MEM_RANGES] = {
191 PFN_4GIG, /* pfn range for 4G and above */
192 0x80000, /* pfn range for 2G-4G */
193 PFN_16MEG, /* pfn range for 16M-2G */
194 0x00000, /* pfn range for 0-16M */
195 };
196 pfn_t *memranges = &arch_memranges[0];
197 int nranges = NUM_MEM_RANGES;
198
199 /*
200 * This combines mem_node_config and memranges into one data
201 * structure to be used for page list management.
202 */
203 mnoderange_t *mnoderanges;
204 int mnoderangecnt;
205 int mtype4g;
206 int mtype16m;
207 int mtypetop; /* index of highest pfn'ed mnoderange */
208
209 /*
210 * 4g memory management variables for systems with more than 4g of memory:
211 *
212 * physical memory below 4g is required for 32bit dma devices and, currently,
213 * for kmem memory. On systems with more than 4g of memory, the pool of memory
214 * below 4g can be depleted without any paging activity given that there is
215 * likely to be sufficient memory above 4g.
216 *
217 * physmax4g is set true if the largest pfn is over 4g. The rest of the
218 * 4g memory management code is enabled only when physmax4g is true.
219 *
220 * maxmem4g is the count of the maximum number of pages on the page lists
221 * with physical addresses below 4g. It can be a lot less than 4g given that
222 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
223 * agp aperture etc.
224 *
225 * freemem4g maintains the count of the number of available pages on the
226 * page lists with physical addresses below 4g.
227 *
228 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
229 * 6% (desfree4gshift = 4) of maxmem4g.
230 *
231 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
232 * and the amount of free memory above 4g is greater than freemem4g.
233 * In this case, page_get_* routines will restrict below 4g allocations
234 * for requests that don't specifically require it.
235 */
236
237 #define DESFREE4G (maxmem4g >> desfree4gshift)
238
239 #define RESTRICT4G_ALLOC \
240 (physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))
241
242 static pgcnt_t maxmem4g;
243 static pgcnt_t freemem4g;
244 static int physmax4g;
245 static int desfree4gshift = 4; /* maxmem4g shift to derive DESFREE4G */
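/*
 * For example, with the default desfree4gshift of 4, DESFREE4G is
 * maxmem4g / 16 (~6%): with 2g of usable memory below 4g, below-4g
 * allocations start being restricted once freemem4g drops under ~128m
 * worth of pages and at least half of all free memory lies above 4g.
 */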
246
247 /*
248 * 16m memory management:
249 *
250 * reserve some amount of physical memory below 16m for legacy devices.
251 *
252 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
253 * 16m or if the 16m pool drops below DESFREE16M.
254 *
255 * In this case, general page allocations via page_get_{free,cache}list
256 * routines will be restricted from allocating from the 16m pool. Allocations
257 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
258 * are not restricted.
259 */
260
261 #define FREEMEM16M MTYPE_FREEMEM(mtype16m)
262 #define DESFREE16M desfree16m
263 #define RESTRICT16M_ALLOC(freemem, pgcnt, flags) \
264 ((freemem != 0) && ((flags & PG_PANIC) == 0) && \
265 ((freemem >= (FREEMEM16M)) || \
266 (FREEMEM16M < (DESFREE16M + pgcnt))))
267
268 static pgcnt_t desfree16m = 0x380;
269
270 /*
271 * This can be patched via /etc/system to allow old non-PAE aware device
272 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
273 */
274 int restricted_kmemalloc = 0;
275
276 #ifdef VM_STATS
277 struct {
278 ulong_t pga_alloc;
279 ulong_t pga_notfullrange;
280 ulong_t pga_nulldmaattr;
281 ulong_t pga_allocok;
282 ulong_t pga_allocfailed;
283 ulong_t pgma_alloc;
284 ulong_t pgma_allocok;
285 ulong_t pgma_allocfailed;
286 ulong_t pgma_allocempty;
287 } pga_vmstats;
288 #endif
289
290 uint_t mmu_page_sizes;
291
292 /* How many page sizes the users can see */
293 uint_t mmu_exported_page_sizes;
294
295 /* page sizes that legacy applications can see */
296 uint_t mmu_legacy_page_sizes;
297
298 /*
299 * Number of pages in 1 GB. Don't enable automatic large pages if we have
300 * fewer than this many pages.
301 */
302 pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
303 pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
304
305 /*
306 * Maximum and default segment size tunables for user private
307 * and shared anon memory, and user text and initialized data.
308 * These can be patched via /etc/system to allow large pages
309 * to be used for mapping application private and shared anon memory.
310 */
311 size_t mcntl0_lpsize = MMU_PAGESIZE;
312 size_t max_uheap_lpsize = MMU_PAGESIZE;
313 size_t default_uheap_lpsize = MMU_PAGESIZE;
314 size_t max_ustack_lpsize = MMU_PAGESIZE;
315 size_t default_ustack_lpsize = MMU_PAGESIZE;
316 size_t max_privmap_lpsize = MMU_PAGESIZE;
317 size_t max_uidata_lpsize = MMU_PAGESIZE;
318 size_t max_utext_lpsize = MMU_PAGESIZE;
319 size_t max_shm_lpsize = MMU_PAGESIZE;
320
321
322 /*
323 * initialized by page_coloring_init().
324 */
325 uint_t page_colors;
326 uint_t page_colors_mask;
327 uint_t page_coloring_shift;
328 int cpu_page_colors;
329 static uint_t l2_colors;
330
331 /*
332 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
333 * and page_colors are calculated from the l2 cache n-way set size. Within a
334 * mnode range, the page freelist and cachelist are hashed into bins based on
335 * color. This makes it easier to search for a page within a specific memory
336 * range.
337 */
338 #define PAGE_COLORS_MIN 16
339
340 page_t ****page_freelists;
341 page_t ***page_cachelists;
342
343
344 /*
345 * Used by page layer to know about page sizes
346 */
347 hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];
348
349 kmutex_t *fpc_mutex[NPC_MUTEX];
350 kmutex_t *cpc_mutex[NPC_MUTEX];
351
352 /* Lock to protect mnoderanges array for memory DR operations. */
353 static kmutex_t mnoderange_lock;
354
355 /*
356 * Only let one thread at a time try to coalesce large pages, to
357 * prevent them from working against each other.
358 */
359 static kmutex_t contig_lock;
360 #define CONTIG_LOCK() mutex_enter(&contig_lock);
361 #define CONTIG_UNLOCK() mutex_exit(&contig_lock);
362
363 #define PFN_16M (mmu_btop((uint64_t)0x1000000))
364
365 /*
366 * Return the optimum page size for a given mapping
367 */
368 /*ARGSUSED*/
369 size_t
370 map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
371 {
372 level_t l = 0;
373 size_t pgsz = MMU_PAGESIZE;
374 size_t max_lpsize;
375 uint_t mszc;
376
377 ASSERT(maptype != MAPPGSZ_VA);
378
379 if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
380 return (MMU_PAGESIZE);
381 }
382
383 switch (maptype) {
384 case MAPPGSZ_HEAP:
385 case MAPPGSZ_STK:
386 max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
387 MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
388 if (max_lpsize == MMU_PAGESIZE) {
389 return (MMU_PAGESIZE);
390 }
391 if (len == 0) {
392 len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
393 p->p_brksize - p->p_bssbase : p->p_stksize;
394 }
395 len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
396 default_uheap_lpsize) : MAX(len, default_ustack_lpsize);
397
398 /*
399 * use the page size that best fits len
400 */
401 for (l = mmu.umax_page_level; l > 0; --l) {
402 if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
403 continue;
404 } else {
405 pgsz = LEVEL_SIZE(l);
406 }
407 break;
408 }
409
410 mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
411 p->p_stkpageszc);
412 if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
413 pgsz = hw_page_array[mszc].hp_size;
414 }
415 return (pgsz);
416
417 case MAPPGSZ_ISM:
418 for (l = mmu.umax_page_level; l > 0; --l) {
419 if (len >= LEVEL_SIZE(l))
420 return (LEVEL_SIZE(l));
421 }
422 return (LEVEL_SIZE(0));
423 }
424 return (pgsz);
425 }
426
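/*
 * map_szcvec() is the workhorse for map_pgszcvec(): find the largest page
 * size (up to max_lpsize) for which [addr, addr + size) contains a properly
 * aligned chunk whose phase matches off, and return the bit vector of all
 * large page size codes up to and including it (size code 0 is excluded).
 */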
427 static uint_t
428 map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
429 size_t min_physmem)
430 {
431 caddr_t eaddr = addr + size;
432 uint_t szcvec = 0;
433 caddr_t raddr;
434 caddr_t readdr;
435 size_t pgsz;
436 int i;
437
438 if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
439 return (0);
440 }
441
442 for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
443 pgsz = page_get_pagesize(i);
444 if (pgsz > max_lpsize) {
445 continue;
446 }
447 raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
448 readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
449 if (raddr < addr || raddr >= readdr) {
450 continue;
451 }
452 if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
453 continue;
454 }
455 /*
456 * Set szcvec to the remaining page sizes.
457 */
458 szcvec = ((1 << (i + 1)) - 1) & ~1;
459 break;
460 }
461 return (szcvec);
462 }
463
464 /*
465 * Return a bit vector of large page size codes that
466 * can be used to map [addr, addr + len) region.
467 */
468 /*ARGSUSED*/
469 uint_t
470 map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
471 int memcntl)
472 {
473 size_t max_lpsize = mcntl0_lpsize;
474
475 if (mmu.max_page_level == 0)
476 return (0);
477
478 if (flags & MAP_TEXT) {
479 if (!memcntl)
480 max_lpsize = max_utext_lpsize;
481 return (map_szcvec(addr, size, off, max_lpsize,
482 shm_lpg_min_physmem));
483
484 } else if (flags & MAP_INITDATA) {
485 if (!memcntl)
486 max_lpsize = max_uidata_lpsize;
487 return (map_szcvec(addr, size, off, max_lpsize,
488 privm_lpg_min_physmem));
489
490 } else if (type == MAPPGSZC_SHM) {
491 if (!memcntl)
492 max_lpsize = max_shm_lpsize;
493 return (map_szcvec(addr, size, off, max_lpsize,
494 shm_lpg_min_physmem));
495
496 } else if (type == MAPPGSZC_HEAP) {
497 if (!memcntl)
498 max_lpsize = max_uheap_lpsize;
499 return (map_szcvec(addr, size, off, max_lpsize,
500 privm_lpg_min_physmem));
501
502 } else if (type == MAPPGSZC_STACK) {
503 if (!memcntl)
504 max_lpsize = max_ustack_lpsize;
505 return (map_szcvec(addr, size, off, max_lpsize,
506 privm_lpg_min_physmem));
507
508 } else {
509 if (!memcntl)
510 max_lpsize = max_privmap_lpsize;
511 return (map_szcvec(addr, size, off, max_lpsize,
512 privm_lpg_min_physmem));
513 }
514 }
515
516 /*
517 * Handle a pagefault.
518 */
519 faultcode_t
520 pagefault(
521 caddr_t addr,
522 enum fault_type type,
523 enum seg_rw rw,
524 int iskernel)
525 {
526 struct as *as;
527 struct hat *hat;
528 struct proc *p;
529 kthread_t *t;
530 faultcode_t res;
531 caddr_t base;
532 size_t len;
533 int err;
534 int mapped_red;
535 uintptr_t ea;
536
537 ASSERT_STACK_ALIGNED();
538
539 if (INVALID_VADDR(addr))
540 return (FC_NOMAP);
541
542 mapped_red = segkp_map_red();
543
544 if (iskernel) {
545 as = &kas;
546 hat = as->a_hat;
547 } else {
548 t = curthread;
549 p = ttoproc(t);
550 as = p->p_as;
551 hat = as->a_hat;
552 }
553
554 /*
555 * Dispatch pagefault.
556 */
557 res = as_fault(hat, as, addr, 1, type, rw);
558
559 /*
560 * If this isn't a potential unmapped hole in the user's
561 * UNIX data or stack segments, just return status info.
562 */
563 if (res != FC_NOMAP || iskernel)
564 goto out;
565
566 /*
567 * Check to see if we happened to fault on a currently unmapped
568 * part of the UNIX data or stack segments. If so, create a zfod
569 * mapping there and then try calling the fault routine again.
570 */
571 base = p->p_brkbase;
572 len = p->p_brksize;
573
574 if (addr < base || addr >= base + len) { /* data seg? */
575 base = (caddr_t)p->p_usrstack - p->p_stksize;
576 len = p->p_stksize;
577 if (addr < base || addr >= p->p_usrstack) { /* stack seg? */
578 /* not in either UNIX data or stack segments */
579 res = FC_NOMAP;
580 goto out;
581 }
582 }
583
584 /*
585 * The rest of this function implements 3.X, 4.X, and 5.X compatibility.
586 * This code is probably not needed anymore.
587 */
588 if (p->p_model == DATAMODEL_ILP32) {
589
590 /* expand the gap to the page boundaries on each side */
591 ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
592 base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
593 len = ea - (uintptr_t)base;
594
595 as_rangelock(as);
596 if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
597 0) {
598 err = as_map(as, base, len, segvn_create, zfod_argsp);
599 as_rangeunlock(as);
600 if (err) {
601 res = FC_MAKE_ERR(err);
602 goto out;
603 }
604 } else {
605 /*
606 * This page was already mapped by another thread after
607 * we returned from as_fault() above. We just fall
608 * through to as_fault() below.
609 */
610 as_rangeunlock(as);
611 }
612
613 res = as_fault(hat, as, addr, 1, F_INVAL, rw);
614 }
615
616 out:
617 if (mapped_red)
618 segkp_unmap_red();
619
620 return (res);
621 }
622
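/*
 * Pick a user address for the current process; thin wrapper around
 * map_addr_proc() using the process's user limit.
 */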
623 void
624 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
625 {
626 struct proc *p = curproc;
627 map_addr_proc(addrp, len, off, vacalign,
628 map_userlimit(p, p->p_as, flags), curproc, flags);
629 }
630
631 /*ARGSUSED*/
632 int
633 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
634 {
635 return (0);
636 }
637
638 /*
639 * map_addr_proc() is the routine called when the system is to
640 * choose an address for the user. We will pick an address
641 * range which is the highest available below userlimit.
642 *
643 * Every mapping will have a redzone of a single page on either side of
644 * the request. This is done to leave one page unmapped between segments.
645 * This is not required, but it's useful for the user because if their
646 * program strays across a segment boundary, it will catch a fault
647 * immediately, making debugging a little easier. Currently the redzone
648 * is mandatory.
649 *
650 * addrp is a value/result parameter.
651 * On input it is a hint from the user to be used in a completely
652 * machine dependent fashion. We decide to completely ignore this hint.
653 * If MAP_ALIGN was specified, addrp contains the minimal alignment, which
654 * must be some "power of two" multiple of pagesize.
655 *
656 * On output it is NULL if no address can be found in the current
657 * process's address space, or else an address that is currently
658 * not mapped for len bytes with a page of red zone on either side.
659 *
660 * vacalign is not needed on x86 (it's for virtually addressed caches)
661 */
662 /*ARGSUSED*/
663 void
664 map_addr_proc(
665 caddr_t *addrp,
666 size_t len,
667 offset_t off,
668 int vacalign,
669 caddr_t userlimit,
670 struct proc *p,
671 uint_t flags)
672 {
673 struct as *as = p->p_as;
674 caddr_t addr;
675 caddr_t base;
676 size_t slen;
677 size_t align_amount;
678
679 ASSERT32(userlimit == as->a_userlimit);
680
681 base = p->p_brkbase;
682 #if defined(__amd64)
683 /*
684 * XX64 Yes, this needs more work.
685 */
686 if (p->p_model == DATAMODEL_NATIVE) {
687 if (userlimit < as->a_userlimit) {
688 /*
689 * This happens when a program wants to map
690 * something in a range that's accessible to a
691 * program in a smaller address space. For example,
692 * a 64-bit program calling mmap32(2) to guarantee
693 * that the returned address is below 4Gbytes.
694 */
695 ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));
696
697 if (userlimit > base)
698 slen = userlimit - base;
699 else {
700 *addrp = NULL;
701 return;
702 }
703 } else {
704 /*
705 * XX64 This layout is probably wrong .. but in
706 * the event we make the amd64 address space look
707 * like sparcv9 i.e. with the stack -above- the
708 * heap, this bit of code might even be correct.
709 */
710 slen = p->p_usrstack - base -
711 ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
712 }
713 } else
714 #endif
715 slen = userlimit - base;
716
717 /* Make len be a multiple of PAGESIZE */
718 len = (len + PAGEOFFSET) & PAGEMASK;
719
720 /*
721 * figure out what the alignment should be
722 *
723 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
724 */
725 if (len <= ELF_386_MAXPGSZ) {
726 /*
727 * Align virtual addresses to ensure that ELF shared libraries
728 * are mapped with the appropriate alignment constraints by
729 * the run-time linker.
730 */
731 align_amount = ELF_386_MAXPGSZ;
732 } else {
733 /*
734 * For 32-bit processes, only those which have specified
735 * MAP_ALIGN and an addr will be aligned on a larger page size.
736 * Not doing so can potentially waste up to 1G of process
737 * address space.
738 */
739 int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
740 mmu.umax_page_level;
741
742 while (lvl && len < LEVEL_SIZE(lvl))
743 --lvl;
744
745 align_amount = LEVEL_SIZE(lvl);
746 }
747 if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
748 align_amount = (uintptr_t)*addrp;
749
750 ASSERT(ISP2(align_amount));
751 ASSERT(align_amount == 0 || align_amount >= PAGESIZE);
752
753 off = off & (align_amount - 1);
754 /*
755 * Look for a large enough hole starting below userlimit.
756 * After finding it, use the upper part.
757 */
758 if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
759 PAGESIZE, off) == 0) {
760 caddr_t as_addr;
761
762 /*
763 * addr is the highest possible address to use since we have
764 * a PAGESIZE redzone at the beginning and end.
765 */
766 addr = base + slen - (PAGESIZE + len);
767 as_addr = addr;
768 /*
769 * Round address DOWN to the alignment amount and
770 * add the offset in.
771 * If addr is greater than as_addr, len would not be large
772 * enough to include the redzone, so we must adjust down
773 * by the alignment amount.
774 */
775 addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
776 addr += (uintptr_t)off;
777 if (addr > as_addr) {
778 addr -= align_amount;
779 }
780
781 ASSERT(addr > base);
782 ASSERT(addr + len < base + slen);
783 ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
784 ((uintptr_t)(off)));
785 *addrp = addr;
786 } else {
787 *addrp = NULL; /* no more virtual space */
788 }
789 }
790
791 int valid_va_range_aligned_wraparound;
792
793 /*
794 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
795 * addresses at least "minlen" long, where the base of the range is at "off"
796 * phase from an "align" boundary and there is space for a "redzone"-sized
797 * redzone on either side of the range. On success, 1 is returned and *basep
798 * and *lenp are adjusted to describe the acceptable range (including
799 * the redzone). On failure, 0 is returned.
800 */
801 /*ARGSUSED3*/
802 int
803 valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
804 size_t align, size_t redzone, size_t off)
805 {
806 uintptr_t hi, lo;
807 size_t tot_len;
808
809 ASSERT(align == 0 ? off == 0 : off < align);
810 ASSERT(ISP2(align));
811 ASSERT(align == 0 || align >= PAGESIZE);
812
813 lo = (uintptr_t)*basep;
814 hi = lo + *lenp;
815 tot_len = minlen + 2 * redzone; /* need at least this much space */
816
817 /*
818 * If hi rolled over the top, try cutting back.
819 */
820 if (hi < lo) {
821 *lenp = 0UL - lo - 1UL;
822 /* See if this really happens. If so, then we figure out why */
823 valid_va_range_aligned_wraparound++;
824 hi = lo + *lenp;
825 }
826 if (*lenp < tot_len) {
827 return (0);
828 }
829
830 #if defined(__amd64)
831 /*
832 * Deal with a possible hole in the address range between
833 * hole_start and hole_end that should never be mapped.
834 */
835 if (lo < hole_start) {
836 if (hi > hole_start) {
837 if (hi < hole_end) {
838 hi = hole_start;
839 } else {
840 /* lo < hole_start && hi >= hole_end */
841 if (dir == AH_LO) {
842 /*
843 * prefer lowest range
844 */
845 if (hole_start - lo >= tot_len)
846 hi = hole_start;
847 else if (hi - hole_end >= tot_len)
848 lo = hole_end;
849 else
850 return (0);
851 } else {
852 /*
853 * prefer highest range
854 */
855 if (hi - hole_end >= tot_len)
856 lo = hole_end;
857 else if (hole_start - lo >= tot_len)
858 hi = hole_start;
859 else
860 return (0);
861 }
862 }
863 }
864 } else {
865 /* lo >= hole_start */
866 if (hi < hole_end)
867 return (0);
868 if (lo < hole_end)
869 lo = hole_end;
870 }
871 #endif
872
873 if (hi - lo < tot_len)
874 return (0);
875
876 if (align > 1) {
877 uintptr_t tlo = lo + redzone;
878 uintptr_t thi = hi - redzone;
879 tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
880 if (tlo < lo + redzone) {
881 return (0);
882 }
883 if (thi < tlo || thi - tlo < minlen) {
884 return (0);
885 }
886 }
887
888 *basep = (caddr_t)lo;
889 *lenp = hi - lo;
890 return (1);
891 }
892
893 /*
894 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
895 * addresses at least "minlen" long. On success, 1 is returned and *basep
896 * and *lenp are adjusted to describe the acceptable range. On failure, 0
897 * is returned.
898 */
899 int
900 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
901 {
902 return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
903 }
904
905 /*
906 * Determine whether [addr, addr+len] are valid user addresses.
907 */
908 /*ARGSUSED*/
909 int
910 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
911 caddr_t userlimit)
912 {
913 caddr_t eaddr = addr + len;
914
915 if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
916 return (RANGE_BADADDR);
917
918 #if defined(__amd64)
919 /*
920 * Check for the VA hole
921 */
922 if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
923 return (RANGE_BADADDR);
924 #endif
925
926 return (RANGE_OKAY);
927 }
928
929 /*
930 * Return 1 if the page frame is onboard memory, else 0.
931 */
932 int
933 pf_is_memory(pfn_t pf)
934 {
935 if (pfn_is_foreign(pf))
936 return (0);
937 return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
938 }
939
940 /*
941 * return the memrange containing pfn
942 */
943 int
944 memrange_num(pfn_t pfn)
945 {
946 int n;
947
948 for (n = 0; n < nranges - 1; ++n) {
949 if (pfn >= memranges[n])
950 break;
951 }
952 return (n);
953 }
954
955 /*
956 * return the mnoderange containing pfn
957 */
958 /*ARGSUSED*/
959 int
960 pfn_2_mtype(pfn_t pfn)
961 {
962 #if defined(__xpv)
963 return (0);
964 #else
965 int n;
966
967 /* Always start from highest pfn and work our way down */
968 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
969 if (pfn >= mnoderanges[n].mnr_pfnlo) {
970 break;
971 }
972 }
973 return (n);
974 #endif
975 }
976
977 #if !defined(__xpv)
978 /*
979 * is_contigpage_free:
980 * returns a page list of contiguous pages. It minimally has to return
981 * minctg pages. Caller determines minctg based on the scatter-gather
982 * list length.
983 *
984 * pfnp is set to the next page frame to search on return.
985 */
986 static page_t *
987 is_contigpage_free(
988 pfn_t *pfnp,
989 pgcnt_t *pgcnt,
990 pgcnt_t minctg,
991 uint64_t pfnseg,
992 int iolock)
993 {
994 int i = 0;
995 pfn_t pfn = *pfnp;
996 page_t *pp;
997 page_t *plist = NULL;
998
999 /*
1000 * fail if pfn + minctg crosses a segment boundary.
1001 * Adjust the next starting pfn to begin at the segment boundary.
1002 */
1003
1004 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
1005 *pfnp = roundup(*pfnp, pfnseg + 1);
1006 return (NULL);
1007 }
1008
1009 do {
1010 retry:
1011 pp = page_numtopp_nolock(pfn + i);
1012 if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
1013 (page_trylock(pp, SE_EXCL) == 0)) {
1014 (*pfnp)++;
1015 break;
1016 }
1017 if (page_pptonum(pp) != pfn + i) {
1018 page_unlock(pp);
1019 goto retry;
1020 }
1021
1022 if (!(PP_ISFREE(pp))) {
1023 page_unlock(pp);
1024 (*pfnp)++;
1025 break;
1026 }
1027
1028 if (!PP_ISAGED(pp)) {
1029 page_list_sub(pp, PG_CACHE_LIST);
1030 page_hashout(pp, (kmutex_t *)NULL);
1031 } else {
1032 page_list_sub(pp, PG_FREE_LIST);
1033 }
1034
1035 if (iolock)
1036 page_io_lock(pp);
1037 page_list_concat(&plist, &pp);
1038
1039 /*
1040 * exit loop when pgcnt satisfied or segment boundary reached.
1041 */
1042
1043 } while ((++i < *pgcnt) && ((pfn + i) & pfnseg));
1044
1045 *pfnp += i; /* set to next pfn to search */
1046
1047 if (i >= minctg) {
1048 *pgcnt -= i;
1049 return (plist);
1050 }
1051
1052 /*
1053 * failure: minctg not satisfied.
1054 *
1055 * if next request crosses segment boundary, set next pfn
1056 * to search from the segment boundary.
1057 */
1058 if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
1059 *pfnp = roundup(*pfnp, pfnseg + 1);
1060
1061 /* clean up any pages already allocated */
1062
1063 while (plist) {
1064 pp = plist;
1065 page_sub(&plist, pp);
1066 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
1067 if (iolock)
1068 page_io_unlock(pp);
1069 page_unlock(pp);
1070 }
1071
1072 return (NULL);
1073 }
1074 #endif /* !__xpv */
1075
1076 /*
1077 * verify that pages being returned from allocator have correct DMA attribute
1078 */
1079 #ifndef DEBUG
1080 #define check_dma(a, b, c) (void)(0)
1081 #else
1082 static void
1083 check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
1084 {
1085 if (dma_attr == NULL)
1086 return;
1087
1088 while (cnt-- > 0) {
1089 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
1090 dma_attr->dma_attr_addr_lo)
1091 panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
1092 if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
1093 dma_attr->dma_attr_addr_hi)
1094 panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
1095 pp = pp->p_next;
1096 }
1097 }
1098 #endif
1099
1100 #if !defined(__xpv)
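/*
 * Scan physical memory for runs of free contiguous pages that satisfy the
 * DMA attributes in mattr (address range, segment, alignment, scatter-gather
 * length), exclusively lock them and pull them off the free lists. Returns
 * the assembled page list, or NULL if the request cannot be satisfied.
 */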
1101 static page_t *
1102 page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
1103 {
1104 pfn_t pfn;
1105 int sgllen;
1106 uint64_t pfnseg;
1107 pgcnt_t minctg;
1108 page_t *pplist = NULL, *plist;
1109 uint64_t lo, hi;
1110 pgcnt_t pfnalign = 0;
1111 static pfn_t startpfn;
1112 static pgcnt_t lastctgcnt;
1113 uintptr_t align;
1114
1115 CONTIG_LOCK();
1116
1117 if (mattr) {
1118 lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
1119 hi = mmu_btop(mattr->dma_attr_addr_hi);
1120 if (hi >= physmax)
1121 hi = physmax - 1;
1122 sgllen = mattr->dma_attr_sgllen;
1123 pfnseg = mmu_btop(mattr->dma_attr_seg);
1124
1125 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
1126 if (align > MMU_PAGESIZE)
1127 pfnalign = mmu_btop(align);
1128
1129 /*
1130 * in order to satisfy the request, we must minimally
1131 * acquire minctg contiguous pages
1132 */
1133 minctg = howmany(*pgcnt, sgllen);
1134
1135 ASSERT(hi >= lo);
1136
1137 /*
1138 * start from where we last searched if minctg >= lastctgcnt
1139 */
1140 if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
1141 startpfn = lo;
1142 } else {
1143 hi = physmax - 1;
1144 lo = 0;
1145 sgllen = 1;
1146 pfnseg = mmu.highest_pfn;
1147 minctg = *pgcnt;
1148
1149 if (minctg < lastctgcnt)
1150 startpfn = lo;
1151 }
1152 lastctgcnt = minctg;
1153
1154 ASSERT(pfnseg + 1 >= (uint64_t)minctg);
1155
1156 /* conserve 16m memory - start search above 16m when possible */
1157 if (hi > PFN_16M && startpfn < PFN_16M)
1158 startpfn = PFN_16M;
1159
1160 pfn = startpfn;
1161 if (pfnalign)
1162 pfn = P2ROUNDUP(pfn, pfnalign);
1163
1164 while (pfn + minctg - 1 <= hi) {
1165
1166 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1167 if (plist) {
1168 page_list_concat(&pplist, &plist);
1169 sgllen--;
1170 /*
1171 * return when contig pages no longer needed
1172 */
1173 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1174 startpfn = pfn;
1175 CONTIG_UNLOCK();
1176 check_dma(mattr, pplist, *pgcnt);
1177 return (pplist);
1178 }
1179 minctg = howmany(*pgcnt, sgllen);
1180 }
1181 if (pfnalign)
1182 pfn = P2ROUNDUP(pfn, pfnalign);
1183 }
1184
1185 /* cannot find contig pages in specified range */
1186 if (startpfn == lo) {
1187 CONTIG_UNLOCK();
1188 return (NULL);
1189 }
1190
1191 /* did not start with lo previously */
1192 pfn = lo;
1193 if (pfnalign)
1194 pfn = P2ROUNDUP(pfn, pfnalign);
1195
1196 /* allow search to go above startpfn */
1197 while (pfn < startpfn) {
1198
1199 plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
1200 if (plist != NULL) {
1201
1202 page_list_concat(&pplist, &plist);
1203 sgllen--;
1204
1205 /*
1206 * return when contig pages no longer needed
1207 */
1208 if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
1209 startpfn = pfn;
1210 CONTIG_UNLOCK();
1211 check_dma(mattr, pplist, *pgcnt);
1212 return (pplist);
1213 }
1214 minctg = howmany(*pgcnt, sgllen);
1215 }
1216 if (pfnalign)
1217 pfn = P2ROUNDUP(pfn, pfnalign);
1218 }
1219 CONTIG_UNLOCK();
1220 return (NULL);
1221 }
1222 #endif /* !__xpv */
1223
1224 /*
1225 * mnode_range_cnt() calculates the number of memory ranges for mnode and
1226 * memranges[]. Used to determine the size of page lists and mnoderanges.
1227 */
1228 int
1229 mnode_range_cnt(int mnode)
1230 {
1231 #if defined(__xpv)
1232 ASSERT(mnode == 0);
1233 return (1);
1234 #else /* __xpv */
1235 int mri;
1236 int mnrcnt = 0;
1237
1238 if (mem_node_config[mnode].exists != 0) {
1239 mri = nranges - 1;
1240
1241 /* find the memranges index whose range contains the mnode's physbase */
1242
1243 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1244 mri--;
1245
1246 /*
1247 * increment mnode range counter when memranges or mnode
1248 * boundary is reached.
1249 */
1250 while (mri >= 0 &&
1251 mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1252 mnrcnt++;
1253 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1254 mri--;
1255 else
1256 break;
1257 }
1258 }
1259 ASSERT(mnrcnt <= MAX_MNODE_MRANGES);
1260 return (mnrcnt);
1261 #endif /* __xpv */
1262 }
1263
1264 /*
1265 * mnode_range_setup() initializes mnoderanges.
1266 */
1267 void
1268 mnode_range_setup(mnoderange_t *mnoderanges)
1269 {
1270 mnoderange_t *mp = mnoderanges;
1271 int mnode, mri;
1272 int mindex = 0; /* current index into mnoderanges array */
1273 int i, j;
1274 pfn_t hipfn;
1275 int last, hi;
1276
1277 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
1278 if (mem_node_config[mnode].exists == 0)
1279 continue;
1280
1281 mri = nranges - 1;
1282
1283 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1284 mri--;
1285
1286 while (mri >= 0 && mem_node_config[mnode].physmax >=
1287 MEMRANGELO(mri)) {
1288 mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri),
1289 mem_node_config[mnode].physbase);
1290 mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri),
1291 mem_node_config[mnode].physmax);
1292 mnoderanges->mnr_mnode = mnode;
1293 mnoderanges->mnr_memrange = mri;
1294 mnoderanges->mnr_exists = 1;
1295 mnoderanges++;
1296 mindex++;
1297 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1298 mri--;
1299 else
1300 break;
1301 }
1302 }
1303
1304 /*
1305 * For now do a simple sort of the mnoderanges array to fill in
1306 * the mnr_next fields. Since mindex is expected to be relatively
1307 * small, a simple O(N^2) algorithm suffices.
1308 */
1309 for (i = 0; i < mindex; i++) {
1310 if (mp[i].mnr_pfnlo == 0) /* find lowest */
1311 break;
1312 }
1313 ASSERT(i < mindex);
1314 last = i;
1315 mtype16m = last;
1316 mp[last].mnr_next = -1;
1317 for (i = 0; i < mindex - 1; i++) {
1318 hipfn = (pfn_t)(-1);
1319 hi = -1;
1320 /* find next highest mnode range */
1321 for (j = 0; j < mindex; j++) {
1322 if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo &&
1323 mp[j].mnr_pfnlo < hipfn) {
1324 hipfn = mp[j].mnr_pfnlo;
1325 hi = j;
1326 }
1327 }
1328 mp[hi].mnr_next = last;
1329 last = hi;
1330 }
1331 mtypetop = last;
1332 }
1333
1334 #ifndef __xpv
1335 /*
1336 * Update mnoderanges for memory hot-add DR operations.
1337 */
1338 static void
1339 mnode_range_add(int mnode)
1340 {
1341 int *prev;
1342 int n, mri;
1343 pfn_t start, end;
1344 extern void membar_sync(void);
1345
1346 ASSERT(0 <= mnode && mnode < max_mem_nodes);
1347 ASSERT(mem_node_config[mnode].exists);
1348 start = mem_node_config[mnode].physbase;
1349 end = mem_node_config[mnode].physmax;
1350 ASSERT(start <= end);
1351 mutex_enter(&mnoderange_lock);
1352
1353 #ifdef DEBUG
1354 /* Check whether it interleaves with other memory nodes. */
1355 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1356 ASSERT(mnoderanges[n].mnr_exists);
1357 if (mnoderanges[n].mnr_mnode == mnode)
1358 continue;
1359 ASSERT(start > mnoderanges[n].mnr_pfnhi ||
1360 end < mnoderanges[n].mnr_pfnlo);
1361 }
1362 #endif /* DEBUG */
1363
1364 mri = nranges - 1;
1365 while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase)
1366 mri--;
1367 while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) {
1368 /* Check whether mtype already exists. */
1369 for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
1370 if (mnoderanges[n].mnr_mnode == mnode &&
1371 mnoderanges[n].mnr_memrange == mri) {
1372 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri),
1373 start);
1374 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri),
1375 end);
1376 break;
1377 }
1378 }
1379
1380 /* Add a new entry if it doesn't exist yet. */
1381 if (n == -1) {
1382 /* Try to find an unused entry in mnoderanges array. */
1383 for (n = 0; n < mnoderangecnt; n++) {
1384 if (mnoderanges[n].mnr_exists == 0)
1385 break;
1386 }
1387 ASSERT(n < mnoderangecnt);
1388 mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start);
1389 mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end);
1390 mnoderanges[n].mnr_mnode = mnode;
1391 mnoderanges[n].mnr_memrange = mri;
1392 mnoderanges[n].mnr_exists = 1;
1393 /* Page 0 should always be present. */
1394 for (prev = &mtypetop;
1395 mnoderanges[*prev].mnr_pfnlo > start;
1396 prev = &mnoderanges[*prev].mnr_next) {
1397 ASSERT(mnoderanges[*prev].mnr_next >= 0);
1398 ASSERT(mnoderanges[*prev].mnr_pfnlo > end);
1399 }
1400 mnoderanges[n].mnr_next = *prev;
1401 membar_sync();
1402 *prev = n;
1403 }
1404
1405 if (mem_node_config[mnode].physmax > MEMRANGEHI(mri))
1406 mri--;
1407 else
1408 break;
1409 }
1410
1411 mutex_exit(&mnoderange_lock);
1412 }
1413
1414 /*
1415 * Update mnoderanges for memory hot-removal DR operations.
1416 */
1417 static void
1418 mnode_range_del(int mnode)
1419 {
1420 _NOTE(ARGUNUSED(mnode));
1421 ASSERT(0 <= mnode && mnode < max_mem_nodes);
1422 /* TODO: support deletion operation. */
1423 ASSERT(0);
1424 }
1425
1426 void
1427 plat_slice_add(pfn_t start, pfn_t end)
1428 {
1429 mem_node_add_slice(start, end);
1430 if (plat_dr_enabled()) {
1431 mnode_range_add(PFN_2_MEM_NODE(start));
1432 }
1433 }
1434
1435 void
1436 plat_slice_del(pfn_t start, pfn_t end)
1437 {
1438 ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end));
1439 ASSERT(plat_dr_enabled());
1440 mnode_range_del(PFN_2_MEM_NODE(start));
1441 mem_node_del_slice(start, end);
1442 }
1443 #endif /* __xpv */
1444
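/*
 * Choose the starting mnoderange (mtype) for a page allocation and set
 * PGI_MT_RANGE* flags to tell the page_get_*list() routines which pfn
 * ranges they may walk.
 */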
1445 /*ARGSUSED*/
1446 int
1447 mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz)
1448 {
1449 int mtype = mtypetop;
1450
1451 #if !defined(__xpv)
1452 #if defined(__i386)
1453 /*
1454 * set the mtype range
1455 * - kmem requests need to be below 4g if restricted_kmemalloc is set.
1456 * - for non kmem requests, set range to above 4g if memory below 4g
1457 * runs low.
1458 */
1459 if (restricted_kmemalloc && VN_ISKAS(vp) &&
1460 (caddr_t)(vaddr) >= kernelheap &&
1461 (caddr_t)(vaddr) < ekernelheap) {
1462 ASSERT(physmax4g);
1463 mtype = mtype4g;
1464 if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz),
1465 btop(pgsz), *flags)) {
1466 *flags |= PGI_MT_RANGE16M;
1467 } else {
1468 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1469 VM_STAT_COND_ADD((*flags & PG_PANIC),
1470 vmm_vmstats.pgpanicalloc);
1471 *flags |= PGI_MT_RANGE0;
1472 }
1473 return (mtype);
1474 }
1475 #endif /* __i386 */
1476
1477 if (RESTRICT4G_ALLOC) {
1478 VM_STAT_ADD(vmm_vmstats.restrict4gcnt);
1479 /* here only for > 4g systems */
1480 *flags |= PGI_MT_RANGE4G;
1481 } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) {
1482 *flags |= PGI_MT_RANGE16M;
1483 } else {
1484 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1485 VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc);
1486 *flags |= PGI_MT_RANGE0;
1487 }
1488 #endif /* !__xpv */
1489 return (mtype);
1490 }
1491
1492
1493 /* mtype init for page_get_replacement_page */
1494 /*ARGSUSED*/
1495 int
1496 mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt)
1497 {
1498 int mtype = mtypetop;
1499 #if !defined(__xpv)
1500 if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) {
1501 *flags |= PGI_MT_RANGE16M;
1502 } else {
1503 VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt);
1504 *flags |= PGI_MT_RANGE0;
1505 }
1506 #endif
1507 return (mtype);
1508 }
1509
1510 /*
1511 * Determine if the mnode range specified in mtype contains memory belonging
1512 * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains
1513 * the range from high pfn to 0, 16m or 4g.
1514 *
1515 * Return the first mnode range type index found; otherwise return -1.
1516 */
1517 int
1518 mtype_func(int mnode, int mtype, uint_t flags)
1519 {
1520 if (flags & PGI_MT_RANGE) {
1521 int mnr_lim = MRI_0;
1522
1523 if (flags & PGI_MT_NEXT) {
1524 mtype = mnoderanges[mtype].mnr_next;
1525 }
1526 if (flags & PGI_MT_RANGE4G)
1527 mnr_lim = MRI_4G; /* exclude 0-4g range */
1528 else if (flags & PGI_MT_RANGE16M)
1529 mnr_lim = MRI_16M; /* exclude 0-16m range */
1530 while (mtype != -1 &&
1531 mnoderanges[mtype].mnr_memrange <= mnr_lim) {
1532 if (mnoderanges[mtype].mnr_mnode == mnode)
1533 return (mtype);
1534 mtype = mnoderanges[mtype].mnr_next;
1535 }
1536 } else if (mnoderanges[mtype].mnr_mnode == mnode) {
1537 return (mtype);
1538 }
1539 return (-1);
1540 }
1541
1542 /*
1543 * Update the page list max counts with the pfn range specified by the
1544 * input parameters.
1545 */
1546 void
1547 mtype_modify_max(pfn_t startpfn, long cnt)
1548 {
1549 int mtype;
1550 pgcnt_t inc;
1551 spgcnt_t scnt = (spgcnt_t)(cnt);
1552 pgcnt_t acnt = ABS(scnt);
1553 pfn_t endpfn = startpfn + acnt;
1554 pfn_t pfn, lo;
1555
1556 if (!physmax4g)
1557 return;
1558
1559 mtype = mtypetop;
1560 for (pfn = endpfn; pfn > startpfn; ) {
1561 ASSERT(mtype != -1);
1562 lo = mnoderanges[mtype].mnr_pfnlo;
1563 if (pfn > lo) {
1564 if (startpfn >= lo) {
1565 inc = pfn - startpfn;
1566 } else {
1567 inc = pfn - lo;
1568 }
1569 if (mnoderanges[mtype].mnr_memrange != MRI_4G) {
1570 if (scnt > 0)
1571 maxmem4g += inc;
1572 else
1573 maxmem4g -= inc;
1574 }
1575 pfn -= inc;
1576 }
1577 mtype = mnoderanges[mtype].mnr_next;
1578 }
1579 }
1580
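/* Return the memrange index (MRI_*) backing mnoderange mtype. */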
1581 int
1582 mtype_2_mrange(int mtype)
1583 {
1584 return (mnoderanges[mtype].mnr_memrange);
1585 }
1586
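/* Return the inclusive pfn range [*pfnlo, *pfnhi] of mnoderange mtype. */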
1587 void
1588 mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi)
1589 {
1590 _NOTE(ARGUNUSED(mnode));
1591 ASSERT(mnoderanges[mtype].mnr_mnode == mnode);
1592 *pfnlo = mnoderanges[mtype].mnr_pfnlo;
1593 *pfnhi = mnoderanges[mtype].mnr_pfnhi;
1594 }
1595
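/*
 * plcnt_sz()/plcnt_init() size and carve out the DEBUG-only per-mnoderange,
 * per-szc, per-color page list statistics; plcnt_inc_dec() updates those
 * stats along with freemem4g and the per-mtype free page counts.
 */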
1596 size_t
1597 plcnt_sz(size_t ctrs_sz)
1598 {
1599 #ifdef DEBUG
1600 int szc, colors;
1601
1602 ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes;
1603 for (szc = 0; szc < mmu_page_sizes; szc++) {
1604 colors = page_get_pagecolors(szc);
1605 ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors;
1606 }
1607 #endif
1608 return (ctrs_sz);
1609 }
1610
1611 caddr_t
1612 plcnt_init(caddr_t addr)
1613 {
1614 #ifdef DEBUG
1615 int mt, szc, colors;
1616
1617 for (mt = 0; mt < mnoderangecnt; mt++) {
1618 mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr;
1619 addr += (sizeof (struct mnr_mts) * mmu_page_sizes);
1620 for (szc = 0; szc < mmu_page_sizes; szc++) {
1621 colors = page_get_pagecolors(szc);
1622 mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors;
1623 mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt =
1624 (pgcnt_t *)addr;
1625 addr += (sizeof (pgcnt_t) * colors);
1626 }
1627 }
1628 #endif
1629 return (addr);
1630 }
1631
1632 void
1633 plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags)
1634 {
1635 _NOTE(ARGUNUSED(pp));
1636 #ifdef DEBUG
1637 int bin = PP_2_BIN(pp);
1638
1639 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt);
1640 atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin],
1641 cnt);
1642 #endif
1643 ASSERT(mtype == PP_2_MTYPE(pp));
1644 if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G)
1645 atomic_add_long(&freemem4g, cnt);
1646 if (flags & PG_CACHE_LIST)
1647 atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt);
1648 else
1649 atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt);
1650 atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt);
1651 }
1652
1653 /*
1654 * Returns the free page count for mnode
1655 */
1656 int
1657 mnode_pgcnt(int mnode)
1658 {
1659 int mtype = mtypetop;
1660 int flags = PGI_MT_RANGE0;
1661 pgcnt_t pgcnt = 0;
1662
1663 mtype = mtype_func(mnode, mtype, flags);
1664
1665 while (mtype != -1) {
1666 pgcnt += MTYPE_FREEMEM(mtype);
1667 mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT);
1668 }
1669 return (pgcnt);
1670 }
1671
1672 /*
1673 * Initialize page coloring variables based on the l2 cache parameters.
1674 * Calculate and return memory needed for page coloring data structures.
1675 */
1676 size_t
1677 page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc)
1678 {
1679 _NOTE(ARGUNUSED(l2_linesz));
1680 size_t colorsz = 0;
1681 int i;
1682 int colors;
1683
1684 #if defined(__xpv)
1685 /*
1686 * Hypervisor domains currently don't have any concept of NUMA.
1687 * Hence we'll act like there is only 1 memrange.
1688 */
1689 i = memrange_num(1);
1690 #else /* !__xpv */
1691 /*
1692 * Reduce the memory ranges lists if we don't have large amounts
1693 * of memory. This avoids searching known empty free lists.
1694 * To support memory DR operations, we need to keep memory ranges
1695 * for possible memory hot-add operations.
1696 */
1697 if (plat_dr_physmax > physmax)
1698 i = memrange_num(plat_dr_physmax);
1699 else
1700 i = memrange_num(physmax);
1701 #if defined(__i386)
1702 if (i > MRI_4G)
1703 restricted_kmemalloc = 0;
1704 #endif
1705 /* physmax greater than 4g */
1706 if (i == MRI_4G)
1707 physmax4g = 1;
1708 #endif /* !__xpv */
1709 memranges += i;
1710 nranges -= i;
1711
1712 ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES);
1713
1714 ASSERT(ISP2(l2_linesz));
1715 ASSERT(l2_sz > MMU_PAGESIZE);
1716
1717 /* l2_assoc is 0 for fully associative l2 cache */
1718 if (l2_assoc)
1719 l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE));
1720 else
1721 l2_colors = 1;
1722
1723 ASSERT(ISP2(l2_colors));
1724
1725 /* for scalability, configure at least PAGE_COLORS_MIN color bins */
1726 page_colors = MAX(l2_colors, PAGE_COLORS_MIN);
1727
1728 /*
1729 * cpu_page_colors is non-zero when a page color may be spread across
1730 * multiple bins.
1731 */
1732 if (l2_colors < page_colors)
1733 cpu_page_colors = l2_colors;
1734
1735 ASSERT(ISP2(page_colors));
1736
1737 page_colors_mask = page_colors - 1;
1738
1739 ASSERT(ISP2(CPUSETSIZE()));
1740 page_coloring_shift = lowbit(CPUSETSIZE());
1741
1742 /* initialize number of colors per page size */
1743 for (i = 0; i <= mmu.max_page_level; i++) {
1744 hw_page_array[i].hp_size = LEVEL_SIZE(i);
1745 hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
1746 hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
1747 hw_page_array[i].hp_colors = (page_colors_mask >>
1748 (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
1749 + 1;
1750 colorequivszc[i] = 0;
1751 }
1752
1753 /*
1754 * The value of cpu_page_colors determines if additional color bins
1755 * need to be checked for a particular color in the page_get routines.
1756 */
1757 if (cpu_page_colors != 0) {
1758
1759 int a = lowbit(page_colors) - lowbit(cpu_page_colors);
1760 ASSERT(a > 0);
1761 ASSERT(a < 16);
1762
1763 for (i = 0; i <= mmu.max_page_level; i++) {
1764 if ((colors = hw_page_array[i].hp_colors) <= 1) {
1765 colorequivszc[i] = 0;
1766 continue;
1767 }
1768 while ((colors >> a) == 0)
1769 a--;
1770 ASSERT(a >= 0);
1771
1772 /* the higher 4 bits encode the color equiv mask */
1773 colorequivszc[i] = (a << 4);
1774 }
1775 }
1776
1777 /* factor in colorequiv to check additional 'equivalent' bins. */
1778 if (colorequiv > 1) {
1779
1780 int a = lowbit(colorequiv) - 1;
1781 if (a > 15)
1782 a = 15;
1783
1784 for (i = 0; i <= mmu.max_page_level; i++) {
1785 if ((colors = hw_page_array[i].hp_colors) <= 1) {
1786 continue;
1787 }
1788 while ((colors >> a) == 0)
1789 a--;
1790 if ((a << 4) > colorequivszc[i]) {
1791 colorequivszc[i] = (a << 4);
1792 }
1793 }
1794 }
1795
1796 /* size for mnoderanges */
1797 for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
1798 mnoderangecnt += mnode_range_cnt(i);
1799 if (plat_dr_support_memory()) {
1800 /*
1801 * Reserve enough space for memory DR operations.
1802 * Two extra mnoderanges for possible fragmentation,
1803 * one for the 2G boundary and the other for the 4G boundary.
1804 * We don't expect a memory board crossing the 16M boundary
1805 * for memory hot-add operations on x86 platforms.
1806 */
1807 mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt;
1808 }
1809 colorsz = mnoderangecnt * sizeof (mnoderange_t);
1810
1811 /* size for fpc_mutex and cpc_mutex */
1812 colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX);
1813
1814 /* size of page_freelists */
1815 colorsz += mnoderangecnt * sizeof (page_t ***);
1816 colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **);
1817
1818 for (i = 0; i < mmu_page_sizes; i++) {
1819 colors = page_get_pagecolors(i);
1820 colorsz += mnoderangecnt * colors * sizeof (page_t *);
1821 }
1822
1823 /* size of page_cachelists */
1824 colorsz += mnoderangecnt * sizeof (page_t **);
1825 colorsz += mnoderangecnt * page_colors * sizeof (page_t *);
1826
1827 return (colorsz);
1828 }
1829
1830 /*
1831 * Called once at startup to configure page_coloring data structures and
1832 * do the 1st page_free()/page_freelist_add().
1833 */
1834 void
1835 page_coloring_setup(caddr_t pcmemaddr)
1836 {
1837 int i;
1838 int j;
1839 int k;
1840 caddr_t addr;
1841 int colors;
1842
1843 /*
1844 * do page coloring setup
1845 */
1846 addr = pcmemaddr;
1847
1848 mnoderanges = (mnoderange_t *)addr;
1849 addr += (mnoderangecnt * sizeof (mnoderange_t));
1850
1851 mnode_range_setup(mnoderanges);
1852
1853 if (physmax4g)
1854 mtype4g = pfn_2_mtype(0xfffff);
1855
1856 for (k = 0; k < NPC_MUTEX; k++) {
1857 fpc_mutex[k] = (kmutex_t *)addr;
1858 addr += (max_mem_nodes * sizeof (kmutex_t));
1859 }
1860 for (k = 0; k < NPC_MUTEX; k++) {
1861 cpc_mutex[k] = (kmutex_t *)addr;
1862 addr += (max_mem_nodes * sizeof (kmutex_t));
1863 }
1864 page_freelists = (page_t ****)addr;
1865 addr += (mnoderangecnt * sizeof (page_t ***));
1866
1867 page_cachelists = (page_t ***)addr;
1868 addr += (mnoderangecnt * sizeof (page_t **));
1869
1870 for (i = 0; i < mnoderangecnt; i++) {
1871 page_freelists[i] = (page_t ***)addr;
1872 addr += (mmu_page_sizes * sizeof (page_t **));
1873
1874 for (j = 0; j < mmu_page_sizes; j++) {
1875 colors = page_get_pagecolors(j);
1876 page_freelists[i][j] = (page_t **)addr;
1877 addr += (colors * sizeof (page_t *));
1878 }
1879 page_cachelists[i] = (page_t **)addr;
1880 addr += (page_colors * sizeof (page_t *));
1881 }
1882 }
1883
1884 #if defined(__xpv)
1885 /*
1886 * Give back 10% of the io_pool pages to the free list.
1887 * Don't shrink the pool below some absolute minimum.
1888 */
1889 static void
1890 page_io_pool_shrink()
1891 {
1892 int retcnt;
1893 page_t *pp, *pp_first, *pp_last, **curpool;
1894 mfn_t mfn;
1895 int bothpools = 0;
1896
1897 mutex_enter(&io_pool_lock);
1898 io_pool_shrink_attempts++; /* should be a kstat? */
1899 retcnt = io_pool_cnt / 10;
1900 if (io_pool_cnt - retcnt < io_pool_cnt_min)
1901 retcnt = io_pool_cnt - io_pool_cnt_min;
1902 if (retcnt <= 0)
1903 goto done;
1904 io_pool_shrinks++; /* should be a kstat? */
1905 curpool = &io_pool_4g;
1906 domore:
1907 /*
1908 * Loop through taking pages from the end of the list
1909 * (highest mfns) until the amount to return is reached.
1910 */
1911 for (pp = *curpool; pp && retcnt > 0; ) {
1912 pp_first = pp_last = pp->p_prev;
1913 if (pp_first == *curpool)
1914 break;
1915 retcnt--;
1916 io_pool_cnt--;
1917 page_io_pool_sub(curpool, pp_first, pp_last);
1918 if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn)
1919 start_mfn = mfn;
1920 page_free(pp_first, 1);
1921 pp = *curpool;
1922 }
1923 if (retcnt != 0 && !bothpools) {
1924 /*
1925 * If not enough were found in the less constrained pool, try the
1926 * more constrained one.
1927 */
1928 curpool = &io_pool_16m;
1929 bothpools = 1;
1930 goto domore;
1931 }
1932 done:
1933 mutex_exit(&io_pool_lock);
1934 }
1935
1936 #endif /* __xpv */
1937
1938 uint_t
1939 page_create_update_flags_x86(uint_t flags)
1940 {
1941 #if defined(__xpv)
1942 /*
1943 * Check whether this is an urgent allocation and free pages are depleted.
1944 */
1945 if (!(flags & PG_WAIT) && freemem < desfree)
1946 page_io_pool_shrink();
1947 #else /* !__xpv */
1948 /*
1949 * page_create_get_something may call this because 4g memory may be
1950 * depleted. Set flags to allow for relocation of base page below
1951 * 4g if necessary.
1952 */
1953 if (physmax4g)
1954 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
1955 #endif /* __xpv */
1956 return (flags);
1957 }
1958
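/* x86 has no virtually addressed caches, so all buffers get color 0. */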
1959 /*ARGSUSED*/
1960 int
1961 bp_color(struct buf *bp)
1962 {
1963 return (0);
1964 }
1965
1966 #if defined(__xpv)
1967
1968 /*
1969 * Take pages out of an io_pool
1970 */
1971 static void
1972 page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last)
1973 {
1974 if (*poolp == pp_first) {
1975 *poolp = pp_last->p_next;
1976 if (*poolp == pp_first)
1977 *poolp = NULL;
1978 }
1979 pp_first->p_prev->p_next = pp_last->p_next;
1980 pp_last->p_next->p_prev = pp_first->p_prev;
1981 pp_first->p_prev = pp_last;
1982 pp_last->p_next = pp_first;
1983 }
1984
1985 /*
1986 * Put a page on the io_pool list. The list is ordered by increasing MFN.
1987 */
1988 static void
1989 page_io_pool_add(page_t **poolp, page_t *pp)
1990 {
1991 page_t *look;
1992 mfn_t mfn = mfn_list[pp->p_pagenum];
1993
1994 if (*poolp == NULL) {
1995 *poolp = pp;
1996 pp->p_next = pp;
1997 pp->p_prev = pp;
1998 return;
1999 }
2000
2001 /*
2002 * Since we try to take pages from the high end of the pool,
2003 * chances are good that the pages to be put on the list will
2004 * go at or near the end of the list, so start at the end and
2005 * work backwards.
2006 */
2007 look = (*poolp)->p_prev;
2008 while (mfn < mfn_list[look->p_pagenum]) {
2009 look = look->p_prev;
2010 if (look == (*poolp)->p_prev)
2011 break; /* backed all the way to front of list */
2012 }
2013
2014 /* insert after look */
2015 pp->p_prev = look;
2016 pp->p_next = look->p_next;
2017 pp->p_next->p_prev = pp;
2018 look->p_next = pp;
2019 if (mfn < mfn_list[(*poolp)->p_pagenum]) {
2020 /*
2021 * We inserted a new first list element; adjust the pool
2022 * pointer to the newly inserted element.
2023 */
2024 *poolp = pp;
2025 }
2026 }
2027
2028 /*
2029 * Add a page to the io_pool. Setting the force flag will force the page
2030 * into the io_pool no matter what.
2031 */
2032 static void
2033 add_page_to_pool(page_t *pp, int force)
2034 {
2035 page_t *highest;
2036 page_t *freep = NULL;
2037
2038 mutex_enter(&io_pool_lock);
2039 /*
2040 * Always keep the scarce low memory pages
2041 */
2042 if (mfn_list[pp->p_pagenum] < PFN_16MEG) {
2043 ++io_pool_cnt;
2044 page_io_pool_add(&io_pool_16m, pp);
2045 goto done;
2046 }
2047 if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) {
2048 ++io_pool_cnt;
2049 page_io_pool_add(&io_pool_4g, pp);
2050 } else {
2051 highest = io_pool_4g->p_prev;
2052 if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) {
2053 page_io_pool_sub(&io_pool_4g, highest, highest);
2054 page_io_pool_add(&io_pool_4g, pp);
2055 freep = highest;
2056 } else {
2057 freep = pp;
2058 }
2059 }
2060 done:
2061 mutex_exit(&io_pool_lock);
2062 if (freep)
2063 page_free(freep, 1);
2064 }
2065
2066
2067 int contig_pfn_cnt; /* no of pfns in the contig pfn list */
2068 int contig_pfn_max; /* capacity of the contig pfn list */
2069 int next_alloc_pfn; /* next position in list to start a contig search */
2070 int contig_pfnlist_updates; /* pfn list update count */
2071 int contig_pfnlist_builds; /* how many times have we (re)built list */
2072 int contig_pfnlist_buildfailed; /* how many times has list build failed */
2073 int create_contig_pending; /* nonzero means taskq creating contig list */
2074 pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */
2075
2076 /*
2077 * Function to use in sorting a list of pfns by their underlying mfns.
2078 */
2079 static int
2080 mfn_compare(const void *pfnp1, const void *pfnp2)
2081 {
2082 mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2083 mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2084
2085 if (mfn1 > mfn2)
2086 return (1);
2087 if (mfn1 < mfn2)
2088 return (-1);
2089 return (0);
2090 }
2091
2092 /*
2093 * Compact the contig_pfn_list by tossing all the non-contiguous
2094 * elements from the list.
2095 */
2096 static void
2097 compact_contig_pfn_list(void)
2098 {
2099 pfn_t pfn, lapfn, prev_lapfn;
2100 mfn_t mfn;
2101 int i, newcnt = 0;
2102
2103 prev_lapfn = 0;
2104 for (i = 0; i < contig_pfn_cnt - 1; i++) {
2105 pfn = contig_pfn_list[i];
2106 lapfn = contig_pfn_list[i + 1];
2107 mfn = mfn_list[pfn];
2108 /*
2109 * See if next pfn is for a contig mfn
2110 */
2111 if (mfn_list[lapfn] != mfn + 1)
2112 continue;
2113 /*
2114		 * Both pfn and the lookahead pfn are kept in the list,
2115		 * unless pfn was the previous lookahead (already added).
2116 */
2117 if (pfn != prev_lapfn)
2118 contig_pfn_list[newcnt++] = pfn;
2119 contig_pfn_list[newcnt++] = lapfn;
2120 prev_lapfn = lapfn;
2121 }
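	/* Zero out the now unused tail of the list. */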
2122 for (i = newcnt; i < contig_pfn_cnt; i++)
2123 contig_pfn_list[i] = 0;
2124 contig_pfn_cnt = newcnt;
2125 }
2126
2127 /*ARGSUSED*/
2128 static void
2129 call_create_contiglist(void *arg)
2130 {
2131 (void) create_contig_pfnlist(PG_WAIT);
2132 }
2133
2134 /*
2135 * Create list of freelist pfns that have underlying
2136 * contiguous mfns. The list is kept in ascending mfn order.
2137 * Returns 1 if the list was created, 0 otherwise.
2138 */
2139 static int
2140 create_contig_pfnlist(uint_t flags)
2141 {
2142 pfn_t pfn;
2143 page_t *pp;
2144 int ret = 1;
2145
2146 mutex_enter(&contig_list_lock);
2147 if (contig_pfn_list != NULL)
2148 goto out;
2149 contig_pfn_max = freemem + (freemem / 10);
2150 contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2151 (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2152 if (contig_pfn_list == NULL) {
2153		 * If we could not create the contig list (because we could
2154		 * not sleep for memory), dispatch a taskq job that can sleep
2155		 * to get the memory.
2156 * sleep to get the memory.
2157 */
2158 if (!create_contig_pending) {
2159 if (taskq_dispatch(system_taskq, call_create_contiglist,
2160 NULL, TQ_NOSLEEP) != NULL)
2161 create_contig_pending = 1;
2162 }
2163 contig_pfnlist_buildfailed++; /* count list build failures */
2164 ret = 0;
2165 goto out;
2166 }
2167 create_contig_pending = 0;
2168 ASSERT(contig_pfn_cnt == 0);
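	/*
	 * Gather every pfn whose page is currently free, up to the
	 * capacity of the list.
	 */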
2169 for (pfn = 0; pfn < mfn_count; pfn++) {
2170 pp = page_numtopp_nolock(pfn);
2171 if (pp == NULL || !PP_ISFREE(pp))
2172 continue;
2173 contig_pfn_list[contig_pfn_cnt] = pfn;
2174 if (++contig_pfn_cnt == contig_pfn_max)
2175 break;
2176 }
2177 /*
2178 * Sanity check the new list.
2179 */
2180 if (contig_pfn_cnt < 2) { /* no contig pfns */
2181 contig_pfn_cnt = 0;
2182 contig_pfnlist_buildfailed++;
2183 kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t));
2184 contig_pfn_list = NULL;
2185 contig_pfn_max = 0;
2186 ret = 0;
2187 goto out;
2188 }
2189 qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare);
2190 compact_contig_pfn_list();
2191 /*
2192 * Make sure next search of the newly created contiguous pfn
2193 * list starts at the beginning of the list.
2194 */
2195 next_alloc_pfn = 0;
2196 contig_pfnlist_builds++; /* count list builds */
2197 out:
2198 mutex_exit(&contig_list_lock);
2199 return (ret);
2200 }
2201
2202
2203 /*
2204 * Toss the current contig pfnlist. Someone is about to do a massive
2205 * update to pfn<->mfn mappings, so we destroy the list and hold the
2206 * lock until they are done with their update.
2207 */
2208 void
2209 clear_and_lock_contig_pfnlist()
2210 {
2211 pfn_t *listp = NULL;
2212 size_t listsize;
2213
2214 mutex_enter(&contig_list_lock);
2215 if (contig_pfn_list != NULL) {
2216 listp = contig_pfn_list;
2217 listsize = contig_pfn_max * sizeof (pfn_t);
2218 contig_pfn_list = NULL;
2219 contig_pfn_max = contig_pfn_cnt = 0;
2220 }
2221 if (listp != NULL)
2222 kmem_free(listp, listsize);
2223 }
2224
2225 /*
2226 * Unlock the contig_pfn_list. The next attempted use of it will cause
2227 * it to be re-created.
2228 */
2229 void
2230 unlock_contig_pfnlist()
2231 {
2232 mutex_exit(&contig_list_lock);
2233 }
2234
2235 /*
2236 * Update the contiguous pfn list in response to a pfn <-> mfn reassignment
2237 */
2238 void
2239 update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn)
2240 {
2241 int probe_hi, probe_lo, probe_pos, insert_after, insert_point;
2242 pfn_t probe_pfn;
2243 mfn_t probe_mfn;
2244 int drop_lock = 0;
2245
2246 if (mutex_owner(&contig_list_lock) != curthread) {
2247 drop_lock = 1;
2248 mutex_enter(&contig_list_lock);
2249 }
2250 if (contig_pfn_list == NULL)
2251 goto done;
2252 contig_pfnlist_updates++;
2253 /*
2254 * Find the pfn in the current list. Use a binary chop to locate it.
2255 */
2256 probe_hi = contig_pfn_cnt - 1;
2257 probe_lo = 0;
2258 probe_pos = (probe_hi + probe_lo) / 2;
2259 while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) {
2260 if (probe_pos == probe_lo) { /* pfn not in list */
2261 probe_pos = -1;
2262 break;
2263 }
2264 if (pfn_to_mfn(probe_pfn) <= oldmfn)
2265 probe_lo = probe_pos;
2266 else
2267 probe_hi = probe_pos;
2268 probe_pos = (probe_hi + probe_lo) / 2;
2269 }
2270 if (probe_pos >= 0) {
2271 /*
2272 * Remove pfn from list and ensure next alloc
2273 * position stays in bounds.
2274 */
2275 if (--contig_pfn_cnt <= next_alloc_pfn)
2276 next_alloc_pfn = 0;
2277 if (contig_pfn_cnt < 2) { /* no contig pfns */
2278 contig_pfn_cnt = 0;
2279 kmem_free(contig_pfn_list,
2280 contig_pfn_max * sizeof (pfn_t));
2281 contig_pfn_list = NULL;
2282 contig_pfn_max = 0;
2283 goto done;
2284 }
2285 ovbcopy(&contig_pfn_list[probe_pos + 1],
2286 &contig_pfn_list[probe_pos],
2287 (contig_pfn_cnt - probe_pos) * sizeof (pfn_t));
2288 }
2289 if (newmfn == MFN_INVALID)
2290 goto done;
2291 /*
2292 * Check if new mfn has adjacent mfns in the list
2293 */
2294 probe_hi = contig_pfn_cnt - 1;
2295 probe_lo = 0;
2296 insert_after = -2;
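	/*
	 * Binary search for an mfn adjacent to newmfn; insert_after is
	 * left at -2 if no adjacent mfn is found, otherwise it is the
	 * index after which the new pfn should be inserted.
	 */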
2297 do {
2298 probe_pos = (probe_hi + probe_lo) / 2;
2299 probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]);
2300 if (newmfn == probe_mfn + 1)
2301 insert_after = probe_pos;
2302 else if (newmfn == probe_mfn - 1)
2303 insert_after = probe_pos - 1;
2304 if (probe_pos == probe_lo)
2305 break;
2306 if (probe_mfn <= newmfn)
2307 probe_lo = probe_pos;
2308 else
2309 probe_hi = probe_pos;
2310 } while (insert_after == -2);
2311 /*
2312	 * If there is space in the list and there are adjacent mfns,
2313	 * insert the pfn into its proper place in the list.
2314 */
2315 if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) {
2316 insert_point = insert_after + 1;
2317 ovbcopy(&contig_pfn_list[insert_point],
2318 &contig_pfn_list[insert_point + 1],
2319 (contig_pfn_cnt - insert_point) * sizeof (pfn_t));
2320 contig_pfn_list[insert_point] = pfn;
2321 contig_pfn_cnt++;
2322 }
2323 done:
2324 if (drop_lock)
2325 mutex_exit(&contig_list_lock);
2326 }
2327
2328 /*
2329 * Called to (re-)populate the io_pool from the free page lists.
2330 */
2331 long
2332 populate_io_pool(void)
2333 {
2334 pfn_t pfn;
2335 mfn_t mfn, max_mfn;
2336 page_t *pp;
2337
2338 /*
2339 * Figure out the bounds of the pool on first invocation.
2340 * We use a percentage of memory for the io pool size.
2341	 * We allow that to shrink, but not to less than a fixed minimum.
2342 */
2343 if (io_pool_cnt_max == 0) {
2344 io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct);
2345 io_pool_cnt_lowater = io_pool_cnt_max;
2346 /*
2347		 * This is the first time in populate_io_pool; grab a va to use
2348 * when we need to allocate pages.
2349 */
2350 io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
2351 }
2352 /*
2353 * If we are out of pages in the pool, then grow the size of the pool
2354 */
2355 if (io_pool_cnt == 0) {
2356 /*
2357 * Grow the max size of the io pool by 5%, but never more than
2358 * 25% of physical memory.
2359 */
2360 if (io_pool_cnt_max < physmem / 4)
2361 io_pool_cnt_max += io_pool_cnt_max / 20;
2362 }
2363 io_pool_grows++; /* should be a kstat? */
2364
2365 /*
2366 * Get highest mfn on this platform, but limit to the 32 bit DMA max.
2367 */
2368 (void) mfn_to_pfn(start_mfn);
2369 max_mfn = MIN(cached_max_mfn, PFN_4GIG);
2370 for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) {
2371 pfn = mfn_to_pfn(mfn);
2372 if (pfn & PFN_IS_FOREIGN_MFN)
2373 continue;
2374 /*
2375 * try to allocate it from free pages
2376 */
2377 pp = page_numtopp_alloc(pfn);
2378 if (pp == NULL)
2379 continue;
2380 PP_CLRFREE(pp);
2381 add_page_to_pool(pp, 1);
2382 if (io_pool_cnt >= io_pool_cnt_max)
2383 break;
2384 }
2385
2386 return (io_pool_cnt);
2387 }
2388
2389 /*
2390 * Destroy a page that was being used for DMA I/O. It may or
2391 * may not actually go back to the io_pool.
2392 */
2393 void
2394 page_destroy_io(page_t *pp)
2395 {
2396 mfn_t mfn = mfn_list[pp->p_pagenum];
2397
2398 /*
2399	 * A reservation was made when the page was alloc'd; release it now.
2400 */
2401 page_unresv(1);
2402 /*
2403 * Unload translations, if any, then hash out the
2404 * page to erase its identity.
2405 */
2406 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
2407 page_hashout(pp, NULL);
2408
2409 /*
2410	 * DomU pages and pages at or above 4 Gig always go back to the
2411	 * free lists; Dom0 pages below 4 Gig are offered to the io_pool.
2412 */
2413 if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) {
2414 page_free(pp, 1);
2415 return;
2416 }
2417
2418 add_page_to_pool(pp, 0);
2419 }
2420
2421
2422 long contig_searches; /* count of times contig pages requested */
2423 long contig_search_restarts; /* count of contig ranges tried */
2424 long contig_search_failed; /* count of contig alloc failures */
2425
2426 /*
2427 * Free partial page list
2428 */
2429 static void
2430 free_partial_list(page_t **pplist)
2431 {
2432 page_t *pp;
2433
2434 while (*pplist != NULL) {
2435 pp = *pplist;
2436 page_io_pool_sub(pplist, pp, pp);
2437 page_free(pp, 1);
2438 }
2439 }
2440
2441 /*
2442 * Look thru the contiguous pfns that are not part of the io_pool for
2443 * contiguous free pages. Return a list of the found pages or NULL.
2444 */
2445 page_t *
2446 find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg,
2447 pgcnt_t pfnalign)
2448 {
2449 page_t *pp, *plist = NULL;
2450 mfn_t mfn, prev_mfn, start_mfn;
2451 pfn_t pfn;
2452 int pages_needed, pages_requested;
2453 int search_start;
2454
2455 /*
2456 * create the contig pfn list if not already done
2457 */
2458 retry:
2459 mutex_enter(&contig_list_lock);
2460 if (contig_pfn_list == NULL) {
2461 mutex_exit(&contig_list_lock);
2462 if (!create_contig_pfnlist(flags)) {
2463 return (NULL);
2464 }
2465 goto retry;
2466 }
2467 contig_searches++;
2468 /*
2469 * Search contiguous pfn list for physically contiguous pages not in
2470 * the io_pool. Start the search where the last search left off.
2471 */
2472 pages_requested = pages_needed = npages;
2473 search_start = next_alloc_pfn;
2474 start_mfn = prev_mfn = 0;
2475 while (pages_needed) {
2476 pfn = contig_pfn_list[next_alloc_pfn];
2477 mfn = pfn_to_mfn(pfn);
2478 /*
2479		 * Check that this mfn is either the first one or contiguous
2480		 * with the previous one, that the corresponding page is free,
2481		 * and that the mfn range does not cross a segment boundary.
2482 */
2483 if ((prev_mfn == 0 || mfn == prev_mfn + 1) &&
2484 (pp = page_numtopp_alloc(pfn)) != NULL &&
2485 !((mfn & pfnseg) < (start_mfn & pfnseg))) {
2486 PP_CLRFREE(pp);
2487 page_io_pool_add(&plist, pp);
2488 pages_needed--;
2489 if (prev_mfn == 0) {
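				/*
				 * This is the first page of a new run;
				 * enforce any alignment requirement on
				 * the starting mfn.
				 */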
2490 if (pfnalign &&
2491 mfn != P2ROUNDUP(mfn, pfnalign)) {
2492 /*
2493 * not properly aligned
2494 */
2495 contig_search_restarts++;
2496 free_partial_list(&plist);
2497 pages_needed = pages_requested;
2498 start_mfn = prev_mfn = 0;
2499 goto skip;
2500 }
2501 start_mfn = mfn;
2502 }
2503 prev_mfn = mfn;
2504 } else {
2505 contig_search_restarts++;
2506 free_partial_list(&plist);
2507 pages_needed = pages_requested;
2508 start_mfn = prev_mfn = 0;
2509 }
2510 skip:
2511 if (++next_alloc_pfn == contig_pfn_cnt)
2512 next_alloc_pfn = 0;
2513 if (next_alloc_pfn == search_start)
2514 break; /* all pfns searched */
2515 }
2516 mutex_exit(&contig_list_lock);
2517 if (pages_needed) {
2518 contig_search_failed++;
2519 /*
2520		 * Failed to find enough contiguous pages;
2521		 * free the partial page list.
2522 */
2523 free_partial_list(&plist);
2524 }
2525 return (plist);
2526 }
2527
2528 /*
2529 * Search the reserved io pool pages for a page range with the
2530 * desired characteristics.
2531 */
2532 page_t *
2533 page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg)
2534 {
2535 page_t *pp_first, *pp_last;
2536 page_t *pp, **poolp;
2537 pgcnt_t nwanted, pfnalign;
2538 uint64_t pfnseg;
2539 mfn_t mfn, tmfn, hi_mfn, lo_mfn;
2540 int align, attempt = 0;
2541
2542 if (minctg == 1)
2543 contig = 0;
2544 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2545 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2546 pfnseg = mmu_btop(mattr->dma_attr_seg);
2547 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2548 if (align > MMU_PAGESIZE)
2549 pfnalign = mmu_btop(align);
2550 else
2551 pfnalign = 0;
2552
2553 try_again:
2554 /*
2555 * See if we want pages for a legacy device
2556 */
2557 if (hi_mfn < PFN_16MEG)
2558 poolp = &io_pool_16m;
2559 else
2560 poolp = &io_pool_4g;
2561 try_smaller:
2562 /*
2563 * Take pages from I/O pool. We'll use pages from the highest
2564 * MFN range possible.
2565 */
2566 pp_first = pp_last = NULL;
2567 mutex_enter(&io_pool_lock);
2568 nwanted = minctg;
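	/*
	 * Walk the pool backwards from its tail (the highest mfn) toward
	 * lower mfns, collecting a run of minctg suitable pages.
	 */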
2569 for (pp = *poolp; pp && nwanted > 0; ) {
2570 pp = pp->p_prev;
2571
2572 /*
2573 * skip pages above allowable range
2574 */
2575 mfn = mfn_list[pp->p_pagenum];
2576 if (hi_mfn < mfn)
2577 goto skip;
2578
2579 /*
2580 * stop at pages below allowable range
2581 */
2582 if (lo_mfn > mfn)
2583 break;
2584 restart:
2585 if (pp_last == NULL) {
2586 /*
2587 * Check alignment
2588 */
2589 tmfn = mfn - (minctg - 1);
2590 if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2591 goto skip; /* not properly aligned */
2592 /*
2593 * Check segment
2594 */
2595 if ((mfn & pfnseg) < (tmfn & pfnseg))
2596 goto skip; /* crosses seg boundary */
2597 /*
2598 * Start building page list
2599 */
2600 pp_first = pp_last = pp;
2601 nwanted--;
2602 } else {
2603 /*
2604 * check physical contiguity if required
2605 */
2606 if (contig &&
2607 mfn_list[pp_first->p_pagenum] != mfn + 1) {
2608 /*
2609 * not a contiguous page, restart list.
2610 */
2611 pp_last = NULL;
2612 nwanted = minctg;
2613 goto restart;
2614 } else { /* add page to list */
2615 pp_first = pp;
2616 nwanted--;
2617 }
2618 }
2619 skip:
2620 if (pp == *poolp)
2621 break;
2622 }
2623
2624 /*
2625	 * If we didn't find enough memory, try the more constrained pool,
2626	 * then sweep free pages into the DMA pool and try again.
2627 */
2628 if (nwanted != 0) {
2629 mutex_exit(&io_pool_lock);
2630 /*
2631 * If we were looking in the less constrained pool and
2632 * didn't find pages, try the more constrained pool.
2633 */
2634 if (poolp == &io_pool_4g) {
2635 poolp = &io_pool_16m;
2636 goto try_smaller;
2637 }
2638 kmem_reap();
2639 if (++attempt < 4) {
2640 /*
2641 * Grab some more io_pool pages
2642 */
2643 (void) populate_io_pool();
2644 goto try_again; /* go around and retry */
2645 }
2646 return (NULL);
2647 }
2648 /*
2649 * Found the pages, now snip them from the list
2650 */
2651 page_io_pool_sub(poolp, pp_first, pp_last);
2652 io_pool_cnt -= minctg;
2653 /*
2654 * reset low water mark
2655 */
2656 if (io_pool_cnt < io_pool_cnt_lowater)
2657 io_pool_cnt_lowater = io_pool_cnt;
2658 mutex_exit(&io_pool_lock);
2659 return (pp_first);
2660 }
2661
2662 page_t *
2663 page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr,
2664 ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg)
2665 {
2666 uint_t kflags;
2667 int order, extra, extpages, i, contig, nbits, extents;
2668 page_t *pp, *expp, *pp_first, **pplist = NULL;
2669 mfn_t *mfnlist = NULL;
2670
2671 contig = flags & PG_PHYSCONTIG;
2672 if (minctg == 1)
2673 contig = 0;
2674 flags &= ~PG_PHYSCONTIG;
2675 kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP;
2676 /*
2677	 * The hypervisor will allocate extents; if we want contig
2678	 * pages the extent must be >= minctg.
2679 */
2680 if (contig) {
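		/* Round minctg up to a power-of-two extent size. */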
2681 order = highbit(minctg) - 1;
2682 if (minctg & ((1 << order) - 1))
2683 order++;
2684 extpages = 1 << order;
2685 } else {
2686 order = 0;
2687 extpages = minctg;
2688 }
2689 if (extpages > minctg) {
2690 extra = extpages - minctg;
2691 if (!page_resv(extra, kflags))
2692 return (NULL);
2693 }
2694 pp_first = NULL;
2695 pplist = kmem_alloc(extpages * sizeof (page_t *), kflags);
2696 if (pplist == NULL)
2697 goto balloon_fail;
2698 mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags);
2699 if (mfnlist == NULL)
2700 goto balloon_fail;
2701 pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr);
2702 if (pp == NULL)
2703 goto balloon_fail;
2704 pp_first = pp;
2705 if (extpages > minctg) {
2706 /*
2707		 * Fill out the rest of the extent pages to swap
2708		 * with the hypervisor.
2709 */
2710 for (i = 0; i < extra; i++) {
2711 expp = page_create_va(vp,
2712 (u_offset_t)(uintptr_t)io_pool_kva,
2713 PAGESIZE, flags, &kvseg, io_pool_kva);
2714 if (expp == NULL)
2715 goto balloon_fail;
2716 (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD);
2717 page_io_unlock(expp);
2718 page_hashout(expp, NULL);
2719 page_io_lock(expp);
2720 /*
2721 * add page to end of list
2722 */
2723 expp->p_prev = pp_first->p_prev;
2724 expp->p_next = pp_first;
2725 expp->p_prev->p_next = expp;
2726 pp_first->p_prev = expp;
2727 }
2728
2729 }
2730 for (i = 0; i < extpages; i++) {
2731 pplist[i] = pp;
2732 pp = pp->p_next;
2733 }
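	/*
	 * Ask the balloon driver to exchange these pages for machine pages
	 * that satisfy the address-width (nbits) constraint.
	 */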
2734 nbits = highbit(mattr->dma_attr_addr_hi);
2735 extents = contig ? 1 : minctg;
2736 if (balloon_replace_pages(extents, pplist, nbits, order,
2737 mfnlist) != extents) {
2738 if (ioalloc_dbg)
2739 cmn_err(CE_NOTE, "request to hypervisor"
2740 " for %d pages, maxaddr %" PRIx64 " failed",
2741 extpages, mattr->dma_attr_addr_hi);
2742 goto balloon_fail;
2743 }
2744
2745 kmem_free(pplist, extpages * sizeof (page_t *));
2746 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2747 /*
2748 * Return any excess pages to free list
2749 */
2750 if (extpages > minctg) {
2751 for (i = 0; i < extra; i++) {
2752 pp = pp_first->p_prev;
2753 page_sub(&pp_first, pp);
2754 page_io_unlock(pp);
2755 page_unresv(1);
2756 page_free(pp, 1);
2757 }
2758 }
2759 return (pp_first);
2760 balloon_fail:
2761 /*
2762 * Return pages to free list and return failure
2763 */
2764 while (pp_first != NULL) {
2765 pp = pp_first;
2766 page_sub(&pp_first, pp);
2767 page_io_unlock(pp);
2768 if (pp->p_vnode != NULL)
2769 page_hashout(pp, NULL);
2770 page_free(pp, 1);
2771 }
2772 if (pplist)
2773 kmem_free(pplist, extpages * sizeof (page_t *));
2774 if (mfnlist)
2775 kmem_free(mfnlist, extpages * sizeof (mfn_t));
2776 page_unresv(extpages - minctg);
2777 return (NULL);
2778 }
2779
2780 static void
2781 return_partial_alloc(page_t *plist)
2782 {
2783 page_t *pp;
2784
2785 while (plist != NULL) {
2786 pp = plist;
2787 page_sub(&plist, pp);
2788 page_io_unlock(pp);
2789 page_destroy_io(pp);
2790 }
2791 }
2792
2793 static page_t *
2794 page_get_contigpages(
2795 struct vnode *vp,
2796 u_offset_t off,
2797 int *npagesp,
2798 uint_t flags,
2799 caddr_t vaddr,
2800 ddi_dma_attr_t *mattr)
2801 {
2802 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2803 page_t *plist; /* list to return */
2804 page_t *pp, *mcpl;
2805 int contig, anyaddr, npages, getone = 0;
2806 mfn_t lo_mfn;
2807 mfn_t hi_mfn;
2808 pgcnt_t pfnalign = 0;
2809 int align, sgllen;
2810 uint64_t pfnseg;
2811 pgcnt_t minctg;
2812
2813 npages = *npagesp;
2814 ASSERT(mattr != NULL);
2815 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2816 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2817 sgllen = mattr->dma_attr_sgllen;
2818 pfnseg = mmu_btop(mattr->dma_attr_seg);
2819 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2820 if (align > MMU_PAGESIZE)
2821 pfnalign = mmu_btop(align);
2822
2823 contig = flags & PG_PHYSCONTIG;
2824 if (npages == -1) {
2825 npages = 1;
2826 pfnalign = 0;
2827 }
2828 /*
2829 * Clear the contig flag if only one page is needed.
2830 */
2831 if (npages == 1) {
2832 getone = 1;
2833 contig = 0;
2834 }
2835
2836 /*
2837 * Check if any page in the system is fine.
2838 */
2839 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn;
2840 if (!contig && anyaddr && !pfnalign) {
2841 flags &= ~PG_PHYSCONTIG;
2842 plist = page_create_va(vp, off, npages * MMU_PAGESIZE,
2843 flags, &kvseg, vaddr);
2844 if (plist != NULL) {
2845 *npagesp = 0;
2846 return (plist);
2847 }
2848 }
2849 plist = NULL;
2850 minctg = howmany(npages, sgllen);
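	/*
	 * minctg is the number of pages each scatter/gather element
	 * must cover to satisfy the request within sgllen elements.
	 */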
2851 while (npages > sgllen || getone) {
2852 if (minctg > npages)
2853 minctg = npages;
2854 mcpl = NULL;
2855 /*
2856 * We could want contig pages with no address range limits.
2857 */
2858 if (anyaddr && contig) {
2859 /*
2860 * Look for free contig pages to satisfy the request.
2861 */
2862 mcpl = find_contig_free(minctg, flags, pfnseg,
2863 pfnalign);
2864 }
2865 /*
2866 * Try the reserved io pools next
2867 */
2868 if (mcpl == NULL)
2869 mcpl = page_io_pool_alloc(mattr, contig, minctg);
2870 if (mcpl != NULL) {
2871 pp = mcpl;
2872 do {
2873 if (!page_hashin(pp, vp, off, NULL)) {
2874 panic("page_get_contigpages:"
2875 " hashin failed"
2876 " pp %p, vp %p, off %llx",
2877 (void *)pp, (void *)vp, off);
2878 }
2879 off += MMU_PAGESIZE;
2880 PP_CLRFREE(pp);
2881 PP_CLRAGED(pp);
2882 page_set_props(pp, P_REF);
2883 page_io_lock(pp);
2884 pp = pp->p_next;
2885 } while (pp != mcpl);
2886 } else {
2887 /*
2888 * Hypervisor exchange doesn't handle segment or
2889 * alignment constraints
2890 */
2891 if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi ||
2892 pfnalign)
2893 goto fail;
2894 /*
2895 * Try exchanging pages with the hypervisor
2896 */
2897 mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr,
2898 flags, minctg);
2899 if (mcpl == NULL)
2900 goto fail;
2901 off += minctg * MMU_PAGESIZE;
2902 }
2903 check_dma(mattr, mcpl, minctg);
2904 /*
2905 * Here with a minctg run of contiguous pages, add them to the
2906 * list we will return for this request.
2907 */
2908 page_list_concat(&plist, &mcpl);
2909 npages -= minctg;
2910 *npagesp = npages;
2911 sgllen--;
2912 if (getone)
2913 break;
2914 }
2915 return (plist);
2916 fail:
2917 return_partial_alloc(plist);
2918 return (NULL);
2919 }
2920
2921 /*
2922 * Allocator for domain 0 I/O pages. We match the required
2923 * DMA attributes and contiguity constraints.
2924 */
2925 /*ARGSUSED*/
2926 page_t *
2927 page_create_io(
2928 struct vnode *vp,
2929 u_offset_t off,
2930 uint_t bytes,
2931 uint_t flags,
2932 struct as *as,
2933 caddr_t vaddr,
2934 ddi_dma_attr_t *mattr)
2935 {
2936 page_t *plist = NULL, *pp;
2937 int npages = 0, contig, anyaddr, pages_req;
2938 mfn_t lo_mfn;
2939 mfn_t hi_mfn;
2940 pgcnt_t pfnalign = 0;
2941 int align;
2942 int is_domu = 0;
2943 int dummy, bytes_got;
2944 mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
2945
2946 ASSERT(mattr != NULL);
2947 lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
2948 hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
2949 align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
2950 if (align > MMU_PAGESIZE)
2951 pfnalign = mmu_btop(align);
2952
2953 /*
2954 * Clear the contig flag if only one page is needed or the scatter
2955 * gather list length is >= npages.
2956 */
2957 pages_req = npages = mmu_btopr(bytes);
2958 contig = (flags & PG_PHYSCONTIG);
2959 bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
2960 if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
2961 contig = 0;
2962
2963 /*
2964 * Check if any old page in the system is fine.
2965 * DomU should always go down this path.
2966 */
2967 is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
2968 anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
2969 if ((!contig && anyaddr) || is_domu) {
2970 flags &= ~PG_PHYSCONTIG;
2971 plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
2972 if (plist != NULL)
2973 return (plist);
2974 else if (is_domu)
2975 return (NULL); /* no memory available */
2976 }
2977 /*
2978 * DomU should never reach here
2979 */
2980 if (contig) {
2981 plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
2982 mattr);
2983 if (plist == NULL)
2984 goto fail;
2985 bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
2986 vaddr += bytes_got;
2987 off += bytes_got;
2988 /*
2989 * We now have all the contiguous pages we need, but
2990 * we may still need additional non-contiguous pages.
2991 */
2992 }
2993 /*
2994	 * Now loop collecting the requested number of pages. These do
2995	 * not have to be contiguous pages, but we use the contig
2996	 * page alloc code to get them since it will honor any
2997	 * other constraints the pages may have.
2998 */
2999 while (npages--) {
3000 dummy = -1;
3001 pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr);
3002 if (pp == NULL)
3003 goto fail;
3004 page_add(&plist, pp);
3005 vaddr += MMU_PAGESIZE;
3006 off += MMU_PAGESIZE;
3007 }
3008 return (plist);
3009 fail:
3010 /*
3011	 * Failed to get enough pages; return the ones we did get.
3012 */
3013 return_partial_alloc(plist);
3014 return (NULL);
3015 }
3016
3017 /*
3018 * Lock and return the page with the highest mfn that we can find. last_mfn
3019 * holds the last one found, so the next search can start from there. We
3020 * also keep a counter so that we don't loop forever if the machine has no
3021 * free pages.
3022 *
3023 * This is called from the balloon thread to find pages to give away. new_high
3024 * is used when new mfns have been added to the system - we will reset our
3025 * search if the new mfns are higher than our current search position.
3026 */
3027 page_t *
3028 page_get_high_mfn(mfn_t new_high)
3029 {
3030 static mfn_t last_mfn = 0;
3031 pfn_t pfn;
3032 page_t *pp;
3033 ulong_t loop_count = 0;
3034
3035 if (new_high > last_mfn)
3036 last_mfn = new_high;
3037
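	/*
	 * Scan downward from last_mfn, wrapping from 0 back up to
	 * cached_max_mfn, for at most mfn_count iterations.
	 */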
3038 for (; loop_count < mfn_count; loop_count++, last_mfn--) {
3039 if (last_mfn == 0) {
3040 last_mfn = cached_max_mfn;
3041 }
3042
3043 pfn = mfn_to_pfn(last_mfn);
3044 if (pfn & PFN_IS_FOREIGN_MFN)
3045 continue;
3046
3047 /* See if the page is free. If so, lock it. */
3048 pp = page_numtopp_alloc(pfn);
3049 if (pp == NULL)
3050 continue;
3051 PP_CLRFREE(pp);
3052
3053 ASSERT(PAGE_EXCL(pp));
3054 ASSERT(pp->p_vnode == NULL);
3055 ASSERT(!hat_page_is_mapped(pp));
3056 last_mfn--;
3057 return (pp);
3058 }
3059 return (NULL);
3060 }
3061
3062 #else /* !__xpv */
3063
3064 /*
3065 * get a page from any list with the given mnode
3066 */
3067 static page_t *
3068 page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags,
3069 int mnode, int mtype, ddi_dma_attr_t *dma_attr)
3070 {
3071 kmutex_t *pcm;
3072 int i;
3073 page_t *pp;
3074 page_t *first_pp;
3075 uint64_t pgaddr;
3076 ulong_t bin;
3077 int mtypestart;
3078 int plw_initialized;
3079 page_list_walker_t plw;
3080
3081 VM_STAT_ADD(pga_vmstats.pgma_alloc);
3082
3083 ASSERT((flags & PG_MATCH_COLOR) == 0);
3084 ASSERT(szc == 0);
3085 ASSERT(dma_attr != NULL);
3086
3087 MTYPE_START(mnode, mtype, flags);
3088 if (mtype < 0) {
3089 VM_STAT_ADD(pga_vmstats.pgma_allocempty);
3090 return (NULL);
3091 }
3092
3093 mtypestart = mtype;
3094
3095 bin = origbin;
3096
3097 /*
3098 * check up to page_colors + 1 bins - origbin may be checked twice
3099 * because of BIN_STEP skip
3100 */
3101 do {
3102 plw_initialized = 0;
3103
3104 for (plw.plw_count = 0;
3105 plw.plw_count < page_colors; plw.plw_count++) {
3106
3107 if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL)
3108 goto nextfreebin;
3109
3110 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
3111 mutex_enter(pcm);
3112 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
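			/*
			 * Remember the starting page so we can tell when
			 * we have wrapped all the way around the circular
			 * freelist.
			 */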
3113 first_pp = pp;
3114 while (pp != NULL) {
3115 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3116 SE_EXCL) == 0) {
3117 pp = pp->p_next;
3118 if (pp == first_pp) {
3119 pp = NULL;
3120 }
3121 continue;
3122 }
3123
3124 ASSERT(PP_ISFREE(pp));
3125 ASSERT(PP_ISAGED(pp));
3126 ASSERT(pp->p_vnode == NULL);
3127 ASSERT(pp->p_hash == NULL);
3128 ASSERT(pp->p_offset == (u_offset_t)-1);
3129 ASSERT(pp->p_szc == szc);
3130 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3131 /* check if page within DMA attributes */
3132 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3133 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3134 (pgaddr + MMU_PAGESIZE - 1 <=
3135 dma_attr->dma_attr_addr_hi)) {
3136 break;
3137 }
3138
3139 /* continue looking */
3140 page_unlock(pp);
3141 pp = pp->p_next;
3142 if (pp == first_pp)
3143 pp = NULL;
3144
3145 }
3146 if (pp != NULL) {
3147 ASSERT(mtype == PP_2_MTYPE(pp));
3148 ASSERT(pp->p_szc == 0);
3149
3150 /* found a page with specified DMA attributes */
3151 page_sub(&PAGE_FREELISTS(mnode, szc, bin,
3152 mtype), pp);
3153 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
3154
3155 if ((PP_ISFREE(pp) == 0) ||
3156 (PP_ISAGED(pp) == 0)) {
3157 cmn_err(CE_PANIC, "page %p is not free",
3158 (void *)pp);
3159 }
3160
3161 mutex_exit(pcm);
3162 check_dma(dma_attr, pp, 1);
3163 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3164 return (pp);
3165 }
3166 mutex_exit(pcm);
3167 nextfreebin:
3168 if (plw_initialized == 0) {
3169 page_list_walk_init(szc, 0, bin, 1, 0, &plw);
3170 ASSERT(plw.plw_ceq_dif == page_colors);
3171 plw_initialized = 1;
3172 }
3173
3174 if (plw.plw_do_split) {
3175 pp = page_freelist_split(szc, bin, mnode,
3176 mtype,
3177 mmu_btop(dma_attr->dma_attr_addr_lo),
3178 mmu_btop(dma_attr->dma_attr_addr_hi + 1),
3179 &plw);
3180 if (pp != NULL) {
3181 check_dma(dma_attr, pp, 1);
3182 return (pp);
3183 }
3184 }
3185
3186 bin = page_list_walk_next_bin(szc, bin, &plw);
3187 }
3188
3189 MTYPE_NEXT(mnode, mtype, flags);
3190 } while (mtype >= 0);
3191
3192 /* failed to find a page in the freelist; try it in the cachelist */
3193
3194 /* reset mtype start for cachelist search */
3195 mtype = mtypestart;
3196 ASSERT(mtype >= 0);
3197
3198 /* start with the bin of matching color */
3199 bin = origbin;
3200
3201 do {
3202 for (i = 0; i <= page_colors; i++) {
3203 if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL)
3204 goto nextcachebin;
3205 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3206 mutex_enter(pcm);
3207 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3208 first_pp = pp;
3209 while (pp != NULL) {
3210 if (IS_DUMP_PAGE(pp) || page_trylock(pp,
3211 SE_EXCL) == 0) {
3212 pp = pp->p_next;
3213 if (pp == first_pp)
3214 pp = NULL;
3215 continue;
3216 }
3217 ASSERT(pp->p_vnode);
3218 ASSERT(PP_ISAGED(pp) == 0);
3219 ASSERT(pp->p_szc == 0);
3220 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3221
3222 /* check if page within DMA attributes */
3223
3224 pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum));
3225 if ((pgaddr >= dma_attr->dma_attr_addr_lo) &&
3226 (pgaddr + MMU_PAGESIZE - 1 <=
3227 dma_attr->dma_attr_addr_hi)) {
3228 break;
3229 }
3230
3231 /* continue looking */
3232 page_unlock(pp);
3233 pp = pp->p_next;
3234 if (pp == first_pp)
3235 pp = NULL;
3236 }
3237
3238 if (pp != NULL) {
3239 ASSERT(mtype == PP_2_MTYPE(pp));
3240 ASSERT(pp->p_szc == 0);
3241
3242 /* found a page with specified DMA attributes */
3243 page_sub(&PAGE_CACHELISTS(mnode, bin,
3244 mtype), pp);
3245 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3246
3247 mutex_exit(pcm);
3248 ASSERT(pp->p_vnode);
3249 ASSERT(PP_ISAGED(pp) == 0);
3250 check_dma(dma_attr, pp, 1);
3251 VM_STAT_ADD(pga_vmstats.pgma_allocok);
3252 return (pp);
3253 }
3254 mutex_exit(pcm);
3255 nextcachebin:
3256 bin += (i == 0) ? BIN_STEP : 1;
3257 bin &= page_colors_mask;
3258 }
3259 MTYPE_NEXT(mnode, mtype, flags);
3260 } while (mtype >= 0);
3261
3262 VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
3263 return (NULL);
3264 }
3265
3266 /*
3267 * This function is similar to page_get_freelist()/page_get_cachelist()
3268 * but it searches both the lists to find a page with the specified
3269 * color (or no color) and DMA attributes. The search is done in the
3270 * freelist first and then in the cache list within the highest memory
3271 * range (based on DMA attributes) before searching in the lower
3272 * memory ranges.
3273 *
3274 * Note: This function is called only by page_create_io().
3275 */
3276 /*ARGSUSED*/
3277 static page_t *
3278 page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
3279 size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
3280 {
3281 uint_t bin;
3282 int mtype;
3283 page_t *pp;
3284 int n;
3285 int m;
3286 int szc;
3287 int fullrange;
3288 int mnode;
3289 int local_failed_stat = 0;
3290 lgrp_mnode_cookie_t lgrp_cookie;
3291
3292 VM_STAT_ADD(pga_vmstats.pga_alloc);
3293
3294 /* only base pagesize currently supported */
3295 if (size != MMU_PAGESIZE)
3296 return (NULL);
3297
3298 /*
3299 * If we're passed a specific lgroup, we use it. Otherwise,
3300 * assume first-touch placement is desired.
3301 */
3302 if (!LGRP_EXISTS(lgrp))
3303 lgrp = lgrp_home_lgrp();
3304
3305 /* LINTED */
3306 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3307
3308 /*
3309	 * Only hold one freelist or cachelist lock at a time; that way we
3310 * can start anywhere and not have to worry about lock
3311 * ordering.
3312 */
3313 if (dma_attr == NULL) {
3314 n = mtype16m;
3315 m = mtypetop;
3316 fullrange = 1;
3317 VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
3318 } else {
3319 pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
3320 pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
3321
3322		 * We can only guarantee alignment to a page boundary.
3323 * We can guarantee alignment only for page boundary.
3324 */
3325 if (dma_attr->dma_attr_align > MMU_PAGESIZE)
3326 return (NULL);
3327
3328 /* Sanity check the dma_attr */
3329 if (pfnlo > pfnhi)
3330 return (NULL);
3331
3332 n = pfn_2_mtype(pfnlo);
3333 m = pfn_2_mtype(pfnhi);
3334
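		/*
		 * fullrange is set when the DMA limits completely cover the
		 * lowest and highest memory type ranges involved, in which
		 * case the normal freelist/cachelist interfaces can be used.
		 */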
3335 fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
3336 (pfnhi >= mnoderanges[m].mnr_pfnhi));
3337 }
3338 VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
3339
3340 szc = 0;
3341
3342	/* cycling thru mtypes is handled by RANGE0 if n == mtype16m */
3343 if (n == mtype16m) {
3344 flags |= PGI_MT_RANGE0;
3345 n = m;
3346 }
3347
3348 /*
3349 * Try local memory node first, but try remote if we can't
3350 * get a page of the right color.
3351 */
3352 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
3353 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3354 /*
3355 * allocate pages from high pfn to low.
3356 */
3357 mtype = m;
3358 do {
3359 if (fullrange != 0) {
3360 pp = page_get_mnode_freelist(mnode,
3361 bin, mtype, szc, flags);
3362 if (pp == NULL) {
3363 pp = page_get_mnode_cachelist(
3364 bin, flags, mnode, mtype);
3365 }
3366 } else {
3367 pp = page_get_mnode_anylist(bin, szc,
3368 flags, mnode, mtype, dma_attr);
3369 }
3370 if (pp != NULL) {
3371 VM_STAT_ADD(pga_vmstats.pga_allocok);
3372 check_dma(dma_attr, pp, 1);
3373 return (pp);
3374 }
3375 } while (mtype != n &&
3376 (mtype = mnoderanges[mtype].mnr_next) != -1);
3377 if (!local_failed_stat) {
3378 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3379 local_failed_stat = 1;
3380 }
3381 }
3382 VM_STAT_ADD(pga_vmstats.pga_allocfailed);
3383
3384 return (NULL);
3385 }
3386
3387 /*
3388 * page_create_io()
3389 *
3390 * This function is a copy of page_create_va() with an additional
3391 * argument 'mattr' that specifies DMA memory requirements to
3392 * the page list functions. This function is used by the segkmem
3393 * allocator, so it is only used to create new pages (i.e., PG_EXCL is
3394 * set).
3395 *
3396 * Note: This interface is currently used by x86 PSM only and is
3397 * not fully specified so the commitment level is only for
3398 * private interface specific to x86. This interface uses PSM
3399 * specific page_get_anylist() interface.
3400 */
3401
3402 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
3403 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
3404 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
3405 break; \
3406 } \
3407 }
3408
3409
3410 page_t *
3411 page_create_io(
3412 struct vnode *vp,
3413 u_offset_t off,
3414 uint_t bytes,
3415 uint_t flags,
3416 struct as *as,
3417 caddr_t vaddr,
3418 ddi_dma_attr_t *mattr) /* DMA memory attributes if any */
3419 {
3420 page_t *plist = NULL;
3421 uint_t plist_len = 0;
3422 pgcnt_t npages;
3423 page_t *npp = NULL;
3424 uint_t pages_req;
3425 page_t *pp;
3426 kmutex_t *phm = NULL;
3427 uint_t index;
3428
3429 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
3430 "page_create_start:vp %p off %llx bytes %u flags %x",
3431 vp, off, bytes, flags);
3432
3433 ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0);
3434
3435 pages_req = npages = mmu_btopr(bytes);
3436
3437 /*
3438 * Do the freemem and pcf accounting.
3439 */
3440 if (!page_create_wait(npages, flags)) {
3441 return (NULL);
3442 }
3443
3444 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
3445 "page_create_success:vp %p off %llx", vp, off);
3446
3447 /*
3448 * If satisfying this request has left us with too little
3449 * memory, start the wheels turning to get some back. The
3450 * first clause of the test prevents waking up the pageout
3451 * daemon in situations where it would decide that there's
3452 * nothing to do.
3453 */
3454 if (nscan < desscan && freemem < minfree) {
3455 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
3456 "pageout_cv_signal:freemem %ld", freemem);
3457 cv_signal(&proc_pageout->p_cv);
3458 }
3459
3460 if (flags & PG_PHYSCONTIG) {
3461
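		/*
		 * Grab the physically contiguous run(s) first; npages is
		 * updated with the count of pages still needed.
		 */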
3462 plist = page_get_contigpage(&npages, mattr, 1);
3463 if (plist == NULL) {
3464 page_create_putback(npages);
3465 return (NULL);
3466 }
3467
3468 pp = plist;
3469
3470 do {
3471 if (!page_hashin(pp, vp, off, NULL)) {
3472 panic("pg_creat_io: hashin failed %p %p %llx",
3473 (void *)pp, (void *)vp, off);
3474 }
3475 VM_STAT_ADD(page_create_new);
3476 off += MMU_PAGESIZE;
3477 PP_CLRFREE(pp);
3478 PP_CLRAGED(pp);
3479 page_set_props(pp, P_REF);
3480 pp = pp->p_next;
3481 } while (pp != plist);
3482
3483 if (!npages) {
3484 check_dma(mattr, plist, pages_req);
3485 return (plist);
3486 } else {
3487 vaddr += (pages_req - npages) << MMU_PAGESHIFT;
3488 }
3489
3490 /*
3491 * fall-thru:
3492 *
3493 * page_get_contigpage returns when npages <= sgllen.
3494 * Grab the rest of the non-contig pages below from anylist.
3495 */
3496 }
3497
3498 /*
3499 * Loop around collecting the requested number of pages.
3500 * Most of the time, we have to `create' a new page. With
3501 * this in mind, pull the page off the free list before
3502 * getting the hash lock. This will minimize the hash
3503 * lock hold time, nesting, and the like. If it turns
3504 * out we don't need the page, we put it back at the end.
3505 */
3506 while (npages--) {
3507 phm = NULL;
3508
3509 index = PAGE_HASH_FUNC(vp, off);
3510 top:
3511 ASSERT(phm == NULL);
3512 ASSERT(index == PAGE_HASH_FUNC(vp, off));
3513 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3514
3515 if (npp == NULL) {
3516 /*
3517			 * Try to get a page of any color either from
3518 * the freelist or from the cache list.
3519 */
3520 npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
3521 flags & ~PG_MATCH_COLOR, mattr, NULL);
3522 if (npp == NULL) {
3523 if (mattr == NULL) {
3524 /*
3525 * Not looking for a special page;
3526 * panic!
3527 */
3528 panic("no page found %d", (int)npages);
3529 }
3530 /*
3531 * No page found! This can happen
3532 * if we are looking for a page
3533 * within a specific memory range
3534 * for DMA purposes. If PG_WAIT is
3535 * specified then we wait for a
3536 * while and then try again. The
3537 * wait could be forever if we
3538 * don't get the page(s) we need.
3539 *
3540 * Note: XXX We really need a mechanism
3541 * to wait for pages in the desired
3542 * range. For now, we wait for any
3543 * pages and see if we can use it.
3544 */
3545
3546 if ((mattr != NULL) && (flags & PG_WAIT)) {
3547 delay(10);
3548 goto top;
3549 }
3550 goto fail; /* undo accounting stuff */
3551 }
3552
3553 if (PP_ISAGED(npp) == 0) {
3554 /*
3555 * Since this page came from the
3556 * cachelist, we must destroy the
3557 * old vnode association.
3558 */
3559 page_hashout(npp, (kmutex_t *)NULL);
3560 }
3561 }
3562
3563 /*
3564 * We own this page!
3565 */
3566 ASSERT(PAGE_EXCL(npp));
3567 ASSERT(npp->p_vnode == NULL);
3568 ASSERT(!hat_page_is_mapped(npp));
3569 PP_CLRFREE(npp);
3570 PP_CLRAGED(npp);
3571
3572 /*
3573		 * Here we have a page in our hot little mitts and are
3574 * just waiting to stuff it on the appropriate lists.
3575 * Get the mutex and check to see if it really does
3576 * not exist.
3577 */
3578 phm = PAGE_HASH_MUTEX(index);
3579 mutex_enter(phm);
3580 PAGE_HASH_SEARCH(index, pp, vp, off);
3581 if (pp == NULL) {
3582 VM_STAT_ADD(page_create_new);
3583 pp = npp;
3584 npp = NULL;
3585 if (!page_hashin(pp, vp, off, phm)) {
3586 /*
3587 * Since we hold the page hash mutex and
3588 * just searched for this page, page_hashin
3589 * had better not fail. If it does, that
3590				 * means some thread did not follow the
3591 * page hash mutex rules. Panic now and
3592 * get it over with. As usual, go down
3593 * holding all the locks.
3594 */
3595 ASSERT(MUTEX_HELD(phm));
3596 panic("page_create: hashin fail %p %p %llx %p",
3597 (void *)pp, (void *)vp, off, (void *)phm);
3598
3599 }
3600 ASSERT(MUTEX_HELD(phm));
3601 mutex_exit(phm);
3602 phm = NULL;
3603
3604 /*
3605 * Hat layer locking need not be done to set
3606 * the following bits since the page is not hashed
3607 * and was on the free list (i.e., had no mappings).
3608 *
3609 * Set the reference bit to protect
3610 * against immediate pageout
3611 *
3612 * XXXmh modify freelist code to set reference
3613 * bit so we don't have to do it here.
3614 */
3615 page_set_props(pp, P_REF);
3616 } else {
3617 ASSERT(MUTEX_HELD(phm));
3618 mutex_exit(phm);
3619 phm = NULL;
3620 /*
3621 * NOTE: This should not happen for pages associated
3622 * with kernel vnode 'kvp'.
3623 */
3624 /* XX64 - to debug why this happens! */
3625 ASSERT(!VN_ISKAS(vp));
3626 if (VN_ISKAS(vp))
3627 cmn_err(CE_NOTE,
3628 "page_create: page not expected "
3629 "in hash list for kernel vnode - pp 0x%p",
3630 (void *)pp);
3631 VM_STAT_ADD(page_create_exists);
3632 goto fail;
3633 }
3634
3635 /*
3636 * Got a page! It is locked. Acquire the i/o
3637 * lock since we are going to use the p_next and
3638 * p_prev fields to link the requested pages together.
3639 */
3640 page_io_lock(pp);
3641 page_add(&plist, pp);
3642 plist = plist->p_next;
3643 off += MMU_PAGESIZE;
3644 vaddr += MMU_PAGESIZE;
3645 }
3646
3647 check_dma(mattr, plist, pages_req);
3648 return (plist);
3649
3650 fail:
3651 if (npp != NULL) {
3652 /*
3653 * Did not need this page after all.
3654 * Put it back on the free list.
3655 */
3656 VM_STAT_ADD(page_create_putbacks);
3657 PP_SETFREE(npp);
3658 PP_SETAGED(npp);
3659 npp->p_offset = (u_offset_t)-1;
3660 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
3661 page_unlock(npp);
3662 }
3663
3664 /*
3665 * Give up the pages we already got.
3666 */
3667 while (plist != NULL) {
3668 pp = plist;
3669 page_sub(&plist, pp);
3670 page_io_unlock(pp);
3671 plist_len++;
3672 /*LINTED: constant in conditional ctx*/
3673 VN_DISPOSE(pp, B_INVAL, 0, kcred);
3674 }
3675
3676 /*
3677 * VN_DISPOSE does freemem accounting for the pages in plist
3678 * by calling page_free. So, we need to undo the pcf accounting
3679 * for only the remaining pages.
3680 */
3681 VM_STAT_ADD(page_create_putbacks);
3682 page_create_putback(pages_req - plist_len);
3683
3684 return (NULL);
3685 }
3686 #endif /* !__xpv */
3687
3688
3689 /*
3690 * Copy the data from the physical page represented by "frompp" to
3691 * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and
3692 * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt
3693 * level and no one sleeps with an active mapping there.
3694 *
3695 * Note that the ref/mod bits in the page_t's are not affected by
3696 * this operation, hence it is up to the caller to update them appropriately.
3697 */
3698 int
3699 ppcopy(page_t *frompp, page_t *topp)
3700 {
3701 caddr_t pp_addr1;
3702 caddr_t pp_addr2;
3703 hat_mempte_t pte1;
3704 hat_mempte_t pte2;
3705 kmutex_t *ppaddr_mutex;
3706 label_t ljb;
3707 int ret = 1;
3708
3709 ASSERT_STACK_ALIGNED();
3710 ASSERT(PAGE_LOCKED(frompp));
3711 ASSERT(PAGE_LOCKED(topp));
3712
3713 if (kpm_enable) {
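		/*
		 * With segkpm every page already has a permanent kernel
		 * mapping, so no temporary mapping is needed.
		 */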
3714 pp_addr1 = hat_kpm_page2va(frompp, 0);
3715 pp_addr2 = hat_kpm_page2va(topp, 0);
3716 kpreempt_disable();
3717 } else {
3718 /*
3719		 * disable preemption so that the CPU can't change
3720 */
3721 kpreempt_disable();
3722
3723 pp_addr1 = CPU->cpu_caddr1;
3724 pp_addr2 = CPU->cpu_caddr2;
3725 pte1 = CPU->cpu_caddr1pte;
3726 pte2 = CPU->cpu_caddr2pte;
3727
3728 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3729 mutex_enter(ppaddr_mutex);
3730
3731 hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
3732 PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
3733 hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
3734 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3735 HAT_LOAD_NOCONSIST);
3736 }
3737
3738 if (on_fault(&ljb)) {
3739 ret = 0;
3740 goto faulted;
3741 }
3742 if (use_sse_pagecopy)
3743 #ifdef __xpv
3744 page_copy_no_xmm(pp_addr2, pp_addr1);
3745 #else
3746 hwblkpagecopy(pp_addr1, pp_addr2);
3747 #endif
3748 else
3749 bcopy(pp_addr1, pp_addr2, PAGESIZE);
3750
3751 no_fault();
3752 faulted:
3753 if (!kpm_enable) {
3754 #ifdef __xpv
3755 /*
3756	 * We can't leave unused mappings lying about under the
3757 * hypervisor, so blow them away.
3758 */
3759 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
3760 UVMF_INVLPG | UVMF_LOCAL) < 0)
3761 panic("HYPERVISOR_update_va_mapping() failed");
3762 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3763 UVMF_INVLPG | UVMF_LOCAL) < 0)
3764 panic("HYPERVISOR_update_va_mapping() failed");
3765 #endif
3766 mutex_exit(ppaddr_mutex);
3767 }
3768 kpreempt_enable();
3769 return (ret);
3770 }
3771
3772 void
3773 pagezero(page_t *pp, uint_t off, uint_t len)
3774 {
3775 ASSERT(PAGE_LOCKED(pp));
3776 pfnzero(page_pptonum(pp), off, len);
3777 }
3778
3779 /*
3780 * Zero the physical page from off to off + len given by pfn
3781 * Zero the physical page given by pfn, from off to off + len,
3782 * without changing the reference and modified bits of the page.
3783 *
3784 * This uses CPU private page address #2; see ppcopy() for more info.
3785 */
3786 void
3787 pfnzero(pfn_t pfn, uint_t off, uint_t len)
3788 {
3789 caddr_t pp_addr2;
3790 hat_mempte_t pte2;
3791 kmutex_t *ppaddr_mutex = NULL;
3792
3793 ASSERT_STACK_ALIGNED();
3794 ASSERT(len <= MMU_PAGESIZE);
3795 ASSERT(off <= MMU_PAGESIZE);
3796 ASSERT(off + len <= MMU_PAGESIZE);
3797
3798 if (kpm_enable && !pfn_is_foreign(pfn)) {
3799 pp_addr2 = hat_kpm_pfn2va(pfn);
3800 kpreempt_disable();
3801 } else {
3802 kpreempt_disable();
3803
3804 pp_addr2 = CPU->cpu_caddr2;
3805 pte2 = CPU->cpu_caddr2pte;
3806
3807 ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
3808 mutex_enter(ppaddr_mutex);
3809
3810 hat_mempte_remap(pfn, pp_addr2, pte2,
3811 PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
3812 HAT_LOAD_NOCONSIST);
3813 }
3814
3815 if (use_sse_pagezero) {
3816 #ifdef __xpv
3817 uint_t rem;
3818
3819 /*
3820 * zero a byte at a time until properly aligned for
3821 * block_zero_no_xmm().
3822 */
3823 while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
3824 pp_addr2[off++] = 0;
3825
3826 /*
3827 * Now use faster block_zero_no_xmm() for any range
3828 * that is properly aligned and sized.
3829 */
3830 rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
3831 len -= rem;
3832 if (len != 0) {
3833 block_zero_no_xmm(pp_addr2 + off, len);
3834 off += len;
3835 }
3836
3837 /*
3838 * zero remainder with byte stores.
3839 */
3840 while (rem-- > 0)
3841 pp_addr2[off++] = 0;
3842 #else
3843 hwblkclr(pp_addr2 + off, len);
3844 #endif
3845 } else {
3846 bzero(pp_addr2 + off, len);
3847 }
3848
3849 if (!kpm_enable || pfn_is_foreign(pfn)) {
3850 #ifdef __xpv
3851 /*
3852 * On the hypervisor this page might get used for a page
3853 * table before any intervening change to this mapping,
3854 * so blow it away.
3855 */
3856 if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
3857 UVMF_INVLPG) < 0)
3858 panic("HYPERVISOR_update_va_mapping() failed");
3859 #endif
3860 mutex_exit(ppaddr_mutex);
3861 }
3862
3863 kpreempt_enable();
3864 }
3865
3866 /*
3867 * Platform-dependent page scrub call.
3868 */
3869 void
3870 pagescrub(page_t *pp, uint_t off, uint_t len)
3871 {
3872 /*
3873 * For now, we rely on the fact that pagezero() will
3874 * always clear UEs.
3875 */
3876 pagezero(pp, off, len);
3877 }
3878
3879 /*
3880 * Set up two private virtual addresses on a given CPU for use in ppcopy().
3881 */
3882 void
3883 setup_vaddr_for_ppcopy(struct cpu *cpup)
3884 {
3885 void *addr;
3886 hat_mempte_t pte_pa;
3887
3888 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3889 pte_pa = hat_mempte_setup(addr);
3890 cpup->cpu_caddr1 = addr;
3891 cpup->cpu_caddr1pte = pte_pa;
3892
3893 addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
3894 pte_pa = hat_mempte_setup(addr);
3895 cpup->cpu_caddr2 = addr;
3896 cpup->cpu_caddr2pte = pte_pa;
3897
3898 mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
3899 }
3900
3901 /*
3902 * Undo setup_vaddr_for_ppcopy
3903 */
3904 void
3905 teardown_vaddr_for_ppcopy(struct cpu *cpup)
3906 {
3907 mutex_destroy(&cpup->cpu_ppaddr_mutex);
3908
3909 hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
3910 cpup->cpu_caddr2pte = 0;
3911 vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
3912 cpup->cpu_caddr2 = 0;
3913
3914 hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
3915 cpup->cpu_caddr1pte = 0;
3916 vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
3917 cpup->cpu_caddr1 = 0;
3918 }
3919
3920 /*
3921 * Function for flushing D-cache when performing module relocations
3922 * to an alternate mapping. Unnecessary on Intel / AMD platforms.
3923 */
3924 void
3925 dcache_flushall()
3926 {}
3927
3928 size_t
3929 exec_get_spslew(void)
3930 {
3931 return (0);
3932 }
3933
3934 /*
3935 * Allocate a memory page. The argument 'seed' can be any pseudo-random
3936 * number to vary where the pages come from. This is quite a hacked up
3937 * method -- it works for now, but really needs to be fixed up a bit.
3938 *
3939 * We currently use page_create_va() on the kvp with fake offsets,
3940 * segments and virt address. This is pretty bogus, but was copied from the
3941 * old hat_i86.c code. A better approach would be to specify either mnode
3942 * random or mnode local and take a page from whatever color has the MOST
3943 * available - this would have a minimal impact on page coloring.
3944 */
3945 page_t *
3946 page_get_physical(uintptr_t seed)
3947 {
3948 page_t *pp;
3949 u_offset_t offset;
3950 static struct seg tmpseg;
3951 static uintptr_t ctr = 0;
3952
3953 /*
3954	 * This code is gross; we really need a simpler page allocator.
3955	 *
3956	 * We need to assign an offset for the page in order to call page_create_va().
3957	 * To avoid conflicts with other pages, we get creative with the offset.
3958	 * For 32 bits, we need an offset > 4Gig.
3959	 * For 64 bits, we need an offset somewhere in the VA hole.
3960 */
3961 offset = seed;
3962 if (offset > kernelbase)
3963 offset -= kernelbase;
3964 offset <<= MMU_PAGESHIFT;
3965 #if defined(__amd64)
3966 offset += mmu.hole_start; /* something in VA hole */
3967 #else
3968 offset += 1ULL << 40; /* something > 4 Gig */
3969 #endif
3970
3971 if (page_resv(1, KM_NOSLEEP) == 0)
3972 return (NULL);
3973
3974 #ifdef DEBUG
3975 pp = page_exists(&kvp, offset);
3976 if (pp != NULL)
3977 panic("page already exists %p", (void *)pp);
3978 #endif
3979
3980 pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
3981 &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */
3982 if (pp != NULL) {
3983 page_io_unlock(pp);
3984 page_downgrade(pp);
3985 }
3986 return (pp);
3987 }