1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2019 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright 2018 Joyent, Inc.  All rights reserved.
  31  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
  32  */
  33 
  34 /*
  35  * VM - Hardware Address Translation management for i386 and amd64
  36  *
  37  * Implementation of the interfaces described in <common/vm/hat.h>
  38  *
  39  * Nearly all the details of how the hardware is managed should not be
  40  * visible outside this layer except for misc. machine specific functions
  41  * that work in conjunction with this code.
  42  *
  43  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  44  */
  45 
  46 #include <sys/machparam.h>
  47 #include <sys/machsystm.h>
  48 #include <sys/mman.h>
  49 #include <sys/types.h>
  50 #include <sys/systm.h>
  51 #include <sys/cpuvar.h>
  52 #include <sys/thread.h>
  53 #include <sys/proc.h>
  54 #include <sys/cpu.h>
  55 #include <sys/kmem.h>
  56 #include <sys/disp.h>
  57 #include <sys/shm.h>
  58 #include <sys/sysmacros.h>
  59 #include <sys/machparam.h>
  60 #include <sys/vmem.h>
  61 #include <sys/vmsystm.h>
  62 #include <sys/promif.h>
  63 #include <sys/var.h>
  64 #include <sys/x86_archext.h>
  65 #include <sys/atomic.h>
  66 #include <sys/bitmap.h>
  67 #include <sys/controlregs.h>
  68 #include <sys/bootconf.h>
  69 #include <sys/bootsvcs.h>
  70 #include <sys/bootinfo.h>
  71 #include <sys/archsystm.h>
  72 
  73 #include <vm/seg_kmem.h>
  74 #include <vm/hat_i86.h>
  75 #include <vm/as.h>
  76 #include <vm/seg.h>
  77 #include <vm/page.h>
  78 #include <vm/seg_kp.h>
  79 #include <vm/seg_kpm.h>
  80 #include <vm/vm_dep.h>
  81 #ifdef __xpv
  82 #include <sys/hypervisor.h>
  83 #endif
  84 #include <vm/kboot_mmu.h>
  85 #include <vm/seg_spt.h>
  86 
  87 #include <sys/cmn_err.h>
  88 
  89 /*
  90  * Basic parameters for hat operation.
  91  */
  92 struct hat_mmu_info mmu;
  93 
  94 /*
  95  * The page that is the kernel's top level pagetable.
  96  *
  97  * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
  98  * on this 4K page for its top level page table. The remaining groups of
  99  * 4 entries are used for per processor copies of user VLP pagetables for
 100  * running threads.  See hat_switch() and reload_pae32() for details.
 101  *
 102  * vlp_page[0..3] - level==2 PTEs for kernel HAT
 103  * vlp_page[4..7] - level==2 PTEs for user thread on cpu 0
 * vlp_page[8..11] - level==2 PTEs for user thread on cpu 1
 105  * etc...
 106  */
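/*
 * For example, the copy for the user thread running on cpu N occupies
 * vlp_page[(N + 1) * VLP_NUM_PTES .. (N + 2) * VLP_NUM_PTES - 1], which is
 * the same (N + 1) * VLP_SIZE byte offset that hat_switch() adds to cr3 on
 * 32 bit PAE metal kernels.
 */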
 107 static x86pte_t *vlp_page;
 108 
 109 /*
 110  * forward declaration of internal utility routines
 111  */
 112 static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
 113         x86pte_t new);
 114 
 115 /*
 116  * The kernel address space exists in all HATs. To implement this the
 117  * kernel reserves a fixed number of entries in the topmost level(s) of page
 118  * tables. The values are setup during startup and then copied to every user
 119  * hat created by hat_alloc(). This means that kernelbase must be:
 120  *
 121  *        4Meg aligned for 32 bit kernels
 *	512Gig aligned for x86_64 64 bit kernels
 123  *
 124  * The hat_kernel_range_ts describe what needs to be copied from kernel hat
 125  * to each user hat.
 126  */
 127 typedef struct hat_kernel_range {
 128         level_t         hkr_level;
 129         uintptr_t       hkr_start_va;
 130         uintptr_t       hkr_end_va;     /* zero means to end of memory */
 131 } hat_kernel_range_t;
 132 #define NUM_KERNEL_RANGE 2
 133 static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE];
 134 static int num_kernel_ranges;
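/*
 * For example, a 64 bit metal kernel ends up with a single range: level 3
 * entries covering [kernelbase, end of memory).  hat_init_finish() fills in
 * kernel_ranges[] and hat_alloc() copies the corresponding kernel PTEs into
 * every new user hat.
 */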
 135 
 136 uint_t use_boot_reserve = 1;    /* cleared after early boot process */
 137 uint_t can_steal_post_boot = 0; /* set late in boot to enable stealing */
 138 
 139 /*
 140  * enable_1gpg: controls 1g page support for user applications.
 * By default, 1g pages are exported to user applications; enable_1gpg can
 * be set to 0 to prevent that.
 143  */
 144 int     enable_1gpg = 1;
 145 
 146 /*
 * AMD Shanghai processors provide better management of 1gb ptes in their
 * TLBs. By default, 1g page support is disabled for pre-Shanghai AMD
 * processors that don't have optimal tlb support for the 1g page size.
 150  * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal
 151  * processors.
 152  */
 153 int     chk_optimal_1gtlb = 1;
 154 
 155 
 156 #ifdef DEBUG
 157 uint_t  map1gcnt;
 158 #endif
 159 
 160 
 161 /*
 162  * A cpuset for all cpus. This is used for kernel address cross calls, since
 163  * the kernel addresses apply to all cpus.
 164  */
 165 cpuset_t khat_cpuset;
 166 
 167 /*
 168  * management stuff for hat structures
 169  */
 170 kmutex_t        hat_list_lock;
 171 kcondvar_t      hat_list_cv;
 172 kmem_cache_t    *hat_cache;
 173 kmem_cache_t    *hat_hash_cache;
 174 kmem_cache_t    *vlp_hash_cache;
 175 
 176 /*
 177  * Simple statistics
 178  */
 179 struct hatstats hatstat;
 180 
 181 /*
 182  * Some earlier hypervisor versions do not emulate cmpxchg of PTEs
 183  * correctly.  For such hypervisors we must set PT_USER for kernel
 184  * entries ourselves (normally the emulation would set PT_USER for
 185  * kernel entries and PT_USER|PT_GLOBAL for user entries).  pt_kern is
 186  * thus set appropriately.  Note that dboot/kbm is OK, as only the full
 187  * HAT uses cmpxchg() and the other paths (hypercall etc.) were never
 188  * incorrect.
 189  */
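/*
 * pt_kern is folded into mmu.pte_bits[] by mmu_init(), so the workaround is
 * picked up wherever those bits are used to construct new PTEs.
 */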
 190 int pt_kern;
 191 
 192 /*
 193  * useful stuff for atomic access/clearing/setting REF/MOD/RO bits in page_t's.
 194  */
 195 extern void atomic_orb(uchar_t *addr, uchar_t val);
 196 extern void atomic_andb(uchar_t *addr, uchar_t val);
 197 
 198 #ifndef __xpv
 199 extern pfn_t memseg_get_start(struct memseg *);
 200 #endif
 201 
 202 #define PP_GETRM(pp, rmmask)    (pp->p_nrm & rmmask)
 203 #define PP_ISMOD(pp)            PP_GETRM(pp, P_MOD)
 204 #define PP_ISREF(pp)            PP_GETRM(pp, P_REF)
 205 #define PP_ISRO(pp)             PP_GETRM(pp, P_RO)
 206 
 207 #define PP_SETRM(pp, rm)        atomic_orb(&(pp->p_nrm), rm)
 208 #define PP_SETMOD(pp)           PP_SETRM(pp, P_MOD)
 209 #define PP_SETREF(pp)           PP_SETRM(pp, P_REF)
 210 #define PP_SETRO(pp)            PP_SETRM(pp, P_RO)
 211 
 212 #define PP_CLRRM(pp, rm)        atomic_andb(&(pp->p_nrm), ~(rm))
 213 #define PP_CLRMOD(pp)           PP_CLRRM(pp, P_MOD)
 214 #define PP_CLRREF(pp)           PP_CLRRM(pp, P_REF)
 215 #define PP_CLRRO(pp)            PP_CLRRM(pp, P_RO)
 216 #define PP_CLRALL(pp)           PP_CLRRM(pp, P_MOD | P_REF | P_RO)
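/*
 * These work on the byte sized p_nrm field, so a single atomic byte
 * operation suffices; e.g. PP_SETRM(pp, P_REF | P_MOD) marks a page both
 * referenced and modified with one atomic_orb().
 */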
 217 
 218 /*
 219  * kmem cache constructor for struct hat
 220  */
 221 /*ARGSUSED*/
 222 static int
 223 hati_constructor(void *buf, void *handle, int kmflags)
 224 {
 225         hat_t   *hat = buf;
 226 
 227         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 228         bzero(hat->hat_pages_mapped,
 229             sizeof (pgcnt_t) * (mmu.max_page_level + 1));
 230         hat->hat_ism_pgcnt = 0;
 231         hat->hat_stats = 0;
 232         hat->hat_flags = 0;
 233         CPUSET_ZERO(hat->hat_cpus);
 234         hat->hat_htable = NULL;
 235         hat->hat_ht_hash = NULL;
 236         return (0);
 237 }
 238 
 239 /*
 240  * Allocate a hat structure for as. We also create the top level
 241  * htable and initialize it to contain the kernel hat entries.
 242  */
 243 hat_t *
 244 hat_alloc(struct as *as)
 245 {
 246         hat_t                   *hat;
 247         htable_t                *ht;    /* top level htable */
 248         uint_t                  use_vlp;
 249         uint_t                  r;
 250         hat_kernel_range_t      *rp;
 251         uintptr_t               va;
 252         uintptr_t               eva;
 253         uint_t                  start;
 254         uint_t                  cnt;
 255         htable_t                *src;
 256 
 257         /*
 258          * Once we start creating user process HATs we can enable
 259          * the htable_steal() code.
 260          */
 261         if (can_steal_post_boot == 0)
 262                 can_steal_post_boot = 1;
 263 
 264         ASSERT(AS_WRITE_HELD(as));
 265         hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
 266         hat->hat_as = as;
 267         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 268         ASSERT(hat->hat_flags == 0);
 269         hat->hat_unmaps = 0;
 270 
 271 #if defined(__xpv)
 272         /*
 273          * No VLP stuff on the hypervisor due to the 64-bit split top level
 274          * page tables.  On 32-bit it's not needed as the hypervisor takes
 275          * care of copying the top level PTEs to a below 4Gig page.
 276          */
 277         use_vlp = 0;
 278 #else   /* __xpv */
	/* 32 bit processes use a VLP style hat when running with PAE */
 280 #if defined(__amd64)
 281         use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
 282 #elif defined(__i386)
 283         use_vlp = mmu.pae_hat;
 284 #endif
 285 #endif  /* __xpv */
 286         if (use_vlp) {
 287                 hat->hat_flags = HAT_VLP;
 288                 bzero(hat->hat_vlp_ptes, VLP_SIZE);
 289         }
 290 
 291         /*
 292          * Allocate the htable hash
 293          */
 294         if ((hat->hat_flags & HAT_VLP)) {
 295                 hat->hat_num_hash = mmu.vlp_hash_cnt;
 296                 hat->hat_ht_hash = kmem_cache_alloc(vlp_hash_cache, KM_SLEEP);
 297         } else {
 298                 hat->hat_num_hash = mmu.hash_cnt;
 299                 hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
 300         }
 301         bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));
 302 
 303         /*
 304          * Initialize Kernel HAT entries at the top of the top level page
 305          * tables for the new hat.
 306          */
 307         hat->hat_htable = NULL;
 308         hat->hat_ht_cached = NULL;
 309         XPV_DISALLOW_MIGRATE();
 310         ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
 311         hat->hat_htable = ht;
 312 
 313 #if defined(__amd64)
 314         if (hat->hat_flags & HAT_VLP)
 315                 goto init_done;
 316 #endif
 317 
 318         for (r = 0; r < num_kernel_ranges; ++r) {
 319                 rp = &kernel_ranges[r];
 320                 for (va = rp->hkr_start_va; va != rp->hkr_end_va;
 321                     va += cnt * LEVEL_SIZE(rp->hkr_level)) {
 322 
 323                         if (rp->hkr_level == TOP_LEVEL(hat))
 324                                 ht = hat->hat_htable;
 325                         else
 326                                 ht = htable_create(hat, va, rp->hkr_level,
 327                                     NULL);
 328 
 329                         start = htable_va2entry(va, ht);
 330                         cnt = HTABLE_NUM_PTES(ht) - start;
 331                         eva = va +
 332                             ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level));
 333                         if (rp->hkr_end_va != 0 &&
 334                             (eva > rp->hkr_end_va || eva == 0))
 335                                 cnt = htable_va2entry(rp->hkr_end_va, ht) -
 336                                     start;
 337 
 338 #if defined(__i386) && !defined(__xpv)
 339                         if (ht->ht_flags & HTABLE_VLP) {
 340                                 bcopy(&vlp_page[start],
 341                                     &hat->hat_vlp_ptes[start],
 342                                     cnt * sizeof (x86pte_t));
 343                                 continue;
 344                         }
 345 #endif
 346                         src = htable_lookup(kas.a_hat, va, rp->hkr_level);
 347                         ASSERT(src != NULL);
 348                         x86pte_copy(src, ht, start, cnt);
 349                         htable_release(src);
 350                 }
 351         }
 352 
 353 init_done:
 354 
 355 #if defined(__xpv)
 356         /*
 357          * Pin top level page tables after initializing them
 358          */
 359         xen_pin(hat->hat_htable->ht_pfn, mmu.max_level);
 360 #if defined(__amd64)
 361         xen_pin(hat->hat_user_ptable, mmu.max_level);
 362 #endif
 363 #endif
 364         XPV_ALLOW_MIGRATE();
 365 
 366         /*
 367          * Put it at the start of the global list of all hats (used by stealing)
 368          *
 369          * kas.a_hat is not in the list but is instead used to find the
 370          * first and last items in the list.
 371          *
 372          * - kas.a_hat->hat_next points to the start of the user hats.
 373          *   The list ends where hat->hat_next == NULL
 374          *
 375          * - kas.a_hat->hat_prev points to the last of the user hats.
 376          *   The list begins where hat->hat_prev == NULL
 377          */
 378         mutex_enter(&hat_list_lock);
 379         hat->hat_prev = NULL;
 380         hat->hat_next = kas.a_hat->hat_next;
 381         if (hat->hat_next)
 382                 hat->hat_next->hat_prev = hat;
 383         else
 384                 kas.a_hat->hat_prev = hat;
 385         kas.a_hat->hat_next = hat;
 386         mutex_exit(&hat_list_lock);
 387 
 388         return (hat);
 389 }
 390 
 391 /*
 * The process has finished executing but the as has not been cleaned up yet.
 393  */
 394 /*ARGSUSED*/
 395 void
 396 hat_free_start(hat_t *hat)
 397 {
 398         ASSERT(AS_WRITE_HELD(hat->hat_as));
 399 
 400         /*
 401          * If the hat is currently a stealing victim, wait for the stealing
 402          * to finish.  Once we mark it as HAT_FREEING, htable_steal()
 403          * won't look at its pagetables anymore.
 404          */
 405         mutex_enter(&hat_list_lock);
 406         while ((hat->hat_flags & HAT_VICTIM) || (hat->hat_unmaps > 0))
 407                 cv_wait(&hat_list_cv, &hat_list_lock);
 408         hat->hat_flags |= HAT_FREEING;
 409         mutex_exit(&hat_list_lock);
 410 }
 411 
 412 /*
 413  * An address space is being destroyed, so we destroy the associated hat.
 414  */
 415 void
 416 hat_free_end(hat_t *hat)
 417 {
 418         kmem_cache_t *cache;
 419 
 420         ASSERT(hat->hat_flags & HAT_FREEING);
 421 
 422         /*
 423          * must not be running on the given hat
 424          */
 425         ASSERT(CPU->cpu_current_hat != hat);
 426 
 427         /*
 428          * Remove it from the list of HATs
 429          */
 430         mutex_enter(&hat_list_lock);
 431         if (hat->hat_prev)
 432                 hat->hat_prev->hat_next = hat->hat_next;
 433         else
 434                 kas.a_hat->hat_next = hat->hat_next;
 435         if (hat->hat_next)
 436                 hat->hat_next->hat_prev = hat->hat_prev;
 437         else
 438                 kas.a_hat->hat_prev = hat->hat_prev;
 439         mutex_exit(&hat_list_lock);
 440         hat->hat_next = hat->hat_prev = NULL;
 441 
 442 #if defined(__xpv)
 443         /*
 444          * On the hypervisor, unpin top level page table(s)
 445          */
 446         xen_unpin(hat->hat_htable->ht_pfn);
 447 #if defined(__amd64)
 448         xen_unpin(hat->hat_user_ptable);
 449 #endif
 450 #endif
 451 
 452         /*
 453          * Make a pass through the htables freeing them all up.
 454          */
 455         htable_purge_hat(hat);
 456 
 457         /*
 458          * Decide which kmem cache the hash table came from, then free it.
 459          */
 460         if (hat->hat_flags & HAT_VLP)
 461                 cache = vlp_hash_cache;
 462         else
 463                 cache = hat_hash_cache;
 464         kmem_cache_free(cache, hat->hat_ht_hash);
 465         hat->hat_ht_hash = NULL;
 466 
 467         hat->hat_flags = 0;
 468         kmem_cache_free(hat_cache, hat);
 469 }
 470 
 471 /*
 472  * round kernelbase down to a supported value to use for _userlimit
 473  *
 474  * userlimit must be aligned down to an entry in the top level htable.
 475  * The one exception is for 32 bit HAT's running PAE.
 476  */
 477 uintptr_t
 478 hat_kernelbase(uintptr_t va)
 479 {
 480 #if defined(__i386)
 481         va &= LEVEL_MASK(1);
 482 #endif
 483         if (IN_VA_HOLE(va))
 484                 panic("_userlimit %p will fall in VA hole\n", (void *)va);
 485         return (va);
 486 }
 487 
/*
 * Determine the largest pagetable level (page size) that mappings may use,
 * based on boot-time large page support and the tunables above.
 */
 491 static void
 492 set_max_page_level()
 493 {
 494         level_t lvl;
 495 
 496         if (!kbm_largepage_support) {
 497                 lvl = 0;
 498         } else {
 499                 if (is_x86_feature(x86_featureset, X86FSET_1GPG)) {
 500                         lvl = 2;
 501                         if (chk_optimal_1gtlb &&
 502                             cpuid_opteron_erratum(CPU, 6671130)) {
 503                                 lvl = 1;
 504                         }
 505                         if (plat_mnode_xcheck(LEVEL_SIZE(2) >>
 506                             LEVEL_SHIFT(0))) {
 507                                 lvl = 1;
 508                         }
 509                 } else {
 510                         lvl = 1;
 511                 }
 512         }
 513         mmu.max_page_level = lvl;
 514 
 515         if ((lvl == 2) && (enable_1gpg == 0))
 516                 mmu.umax_page_level = 1;
 517         else
 518                 mmu.umax_page_level = lvl;
 519 }
 520 
 521 /*
 522  * Initialize hat data structures based on processor MMU information.
 523  */
 524 void
 525 mmu_init(void)
 526 {
 527         uint_t max_htables;
 528         uint_t pa_bits;
 529         uint_t va_bits;
 530         int i;
 531 
 532         /*
	 * If the CPU enabled the page table global bit, use it for the kernel.
 534          * This is bit 7 in CR4 (PGE - Page Global Enable).
 535          */
 536         if (is_x86_feature(x86_featureset, X86FSET_PGE) &&
 537             (getcr4() & CR4_PGE) != 0)
 538                 mmu.pt_global = PT_GLOBAL;
 539 
 540         /*
 541          * Detect NX and PAE usage.
 542          */
 543         mmu.pae_hat = kbm_pae_support;
 544         if (kbm_nx_support)
 545                 mmu.pt_nx = PT_NX;
 546         else
 547                 mmu.pt_nx = 0;
 548 
 549         /*
 550          * Use CPU info to set various MMU parameters
 551          */
 552         cpuid_get_addrsize(CPU, &pa_bits, &va_bits);
 553 
 554         if (va_bits < sizeof (void *) * NBBY) {
 555                 mmu.hole_start = (1ul << (va_bits - 1));
 556                 mmu.hole_end = 0ul - mmu.hole_start - 1;
 557         } else {
 558                 mmu.hole_end = 0;
 559                 mmu.hole_start = mmu.hole_end - 1;
 560         }
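	/*
	 * With 48 implemented VA bits, for example, this computes a hole of
	 * [0x0000800000000000, 0xFFFF7FFFFFFFFFFF], i.e. the non-canonical
	 * region of the 64 bit address space.
	 */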
 561 #if defined(OPTERON_ERRATUM_121)
 562         /*
 563          * If erratum 121 has already been detected at this time, hole_start
 564          * contains the value to be subtracted from mmu.hole_start.
 565          */
 566         ASSERT(hole_start == 0 || opteron_erratum_121 != 0);
 567         hole_start = mmu.hole_start - hole_start;
 568 #else
 569         hole_start = mmu.hole_start;
 570 #endif
 571         hole_end = mmu.hole_end;
 572 
 573         mmu.highest_pfn = mmu_btop((1ull << pa_bits) - 1);
 574         if (mmu.pae_hat == 0 && pa_bits > 32)
 575                 mmu.highest_pfn = PFN_4G - 1;
 576 
 577         if (mmu.pae_hat) {
 578                 mmu.pte_size = 8;       /* 8 byte PTEs */
 579                 mmu.pte_size_shift = 3;
 580         } else {
 581                 mmu.pte_size = 4;       /* 4 byte PTEs */
 582                 mmu.pte_size_shift = 2;
 583         }
 584 
 585         if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE))
 586                 panic("Processor does not support PAE");
 587 
 588         if (!is_x86_feature(x86_featureset, X86FSET_CX8))
 589                 panic("Processor does not support cmpxchg8b instruction");
 590 
 591 #if defined(__amd64)
 592 
 593         mmu.num_level = 4;
 594         mmu.max_level = 3;
 595         mmu.ptes_per_table = 512;
 596         mmu.top_level_count = 512;
 597 
 598         mmu.level_shift[0] = 12;
 599         mmu.level_shift[1] = 21;
 600         mmu.level_shift[2] = 30;
 601         mmu.level_shift[3] = 39;
 602 
 603 #elif defined(__i386)
 604 
 605         if (mmu.pae_hat) {
 606                 mmu.num_level = 3;
 607                 mmu.max_level = 2;
 608                 mmu.ptes_per_table = 512;
 609                 mmu.top_level_count = 4;
 610 
 611                 mmu.level_shift[0] = 12;
 612                 mmu.level_shift[1] = 21;
 613                 mmu.level_shift[2] = 30;
 614 
 615         } else {
 616                 mmu.num_level = 2;
 617                 mmu.max_level = 1;
 618                 mmu.ptes_per_table = 1024;
 619                 mmu.top_level_count = 1024;
 620 
 621                 mmu.level_shift[0] = 12;
 622                 mmu.level_shift[1] = 22;
 623         }
 624 
 625 #endif  /* __i386 */
 626 
 627         for (i = 0; i < mmu.num_level; ++i) {
 628                 mmu.level_size[i] = 1UL << mmu.level_shift[i];
 629                 mmu.level_offset[i] = mmu.level_size[i] - 1;
 630                 mmu.level_mask[i] = ~mmu.level_offset[i];
 631         }
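	/*
	 * On amd64, for example, this yields mapping sizes of 4K, 2M and 1G,
	 * plus a 512G span per top level entry; a non-PAE 32 bit kernel gets
	 * 4K and 4M instead.
	 */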
 632 
 633         set_max_page_level();
 634 
 635         mmu_page_sizes = mmu.max_page_level + 1;
 636         mmu_exported_page_sizes = mmu.umax_page_level + 1;
 637 
 638         /* restrict legacy applications from using pagesizes 1g and above */
 639         mmu_legacy_page_sizes =
 640             (mmu_exported_page_sizes > 2) ? 2 : mmu_exported_page_sizes;
 641 
 642 
 643         for (i = 0; i <= mmu.max_page_level; ++i) {
 644                 mmu.pte_bits[i] = PT_VALID | pt_kern;
 645                 if (i > 0)
 646                         mmu.pte_bits[i] |= PT_PAGESIZE;
 647         }
 648 
 649         /*
	 * NOTE Legacy 32 bit PAE mode only has the PT_VALID bit at top level.
	 */
 652         for (i = 1; i < mmu.num_level; ++i)
 653                 mmu.ptp_bits[i] = PT_PTPBITS;
 654 
 655 #if defined(__i386)
 656         mmu.ptp_bits[2] = PT_VALID;
 657 #endif
 658 
 659         /*
 660          * Compute how many hash table entries to have per process for htables.
 661          * We start with 1 page's worth of entries.
 662          *
	 * If physical memory is small, reduce the amount needed to cover it.
 664          */
 665         max_htables = physmax / mmu.ptes_per_table;
 666         mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *);
 667         while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables)
 668                 mmu.hash_cnt >>= 1;
 669         mmu.vlp_hash_cnt = mmu.hash_cnt;
 670 
 671 #if defined(__amd64)
 672         /*
 673          * If running in 64 bits and physical memory is large,
 674          * increase the size of the cache to cover all of memory for
 675          * a 64 bit process.
 676          */
 677 #define HASH_MAX_LENGTH 4
 678         while (mmu.hash_cnt * HASH_MAX_LENGTH < max_htables)
 679                 mmu.hash_cnt <<= 1;
 680 #endif
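	/*
	 * For example, on amd64 the initial table is 4096 / 8 = 512 buckets.
	 * On a machine with 64 Gigabytes of memory, physmax is roughly 16M
	 * pages, max_htables is about 32K, and the loop above grows hash_cnt
	 * to 8K so that average hash chains stay near HASH_MAX_LENGTH.
	 */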
 681 }
 682 
 683 
 684 /*
 685  * initialize hat data structures
 686  */
 687 void
 688 hat_init()
 689 {
 690 #if defined(__i386)
 691         /*
 692          * _userlimit must be aligned correctly
 693          */
 694         if ((_userlimit & LEVEL_MASK(1)) != _userlimit) {
 695                 prom_printf("hat_init(): _userlimit=%p, not aligned at %p\n",
 696                     (void *)_userlimit, (void *)LEVEL_SIZE(1));
 697                 halt("hat_init(): Unable to continue");
 698         }
 699 #endif
 700 
 701         cv_init(&hat_list_cv, NULL, CV_DEFAULT, NULL);
 702 
 703         /*
 704          * initialize kmem caches
 705          */
 706         htable_init();
 707         hment_init();
 708 
 709         hat_cache = kmem_cache_create("hat_t",
 710             sizeof (hat_t), 0, hati_constructor, NULL, NULL,
 711             NULL, 0, 0);
 712 
 713         hat_hash_cache = kmem_cache_create("HatHash",
 714             mmu.hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
 715             NULL, 0, 0);
 716 
 717         /*
	 * VLP hats can use a smaller hash table size on large memory machines
 719          */
 720         if (mmu.hash_cnt == mmu.vlp_hash_cnt) {
 721                 vlp_hash_cache = hat_hash_cache;
 722         } else {
 723                 vlp_hash_cache = kmem_cache_create("HatVlpHash",
 724                     mmu.vlp_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
 725                     NULL, 0, 0);
 726         }
 727 
 728         /*
 729          * Set up the kernel's hat
 730          */
 731         AS_LOCK_ENTER(&kas, RW_WRITER);
 732         kas.a_hat = kmem_cache_alloc(hat_cache, KM_NOSLEEP);
 733         mutex_init(&kas.a_hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 734         kas.a_hat->hat_as = &kas;
 735         kas.a_hat->hat_flags = 0;
 736         AS_LOCK_EXIT(&kas);
 737 
 738         CPUSET_ZERO(khat_cpuset);
 739         CPUSET_ADD(khat_cpuset, CPU->cpu_id);
 740 
 741         /*
	 * The kernel hat's next pointer serves as the head of the hat list.
 743          * The kernel hat's prev pointer tracks the last hat on the list for
 744          * htable_steal() to use.
 745          */
 746         kas.a_hat->hat_next = NULL;
 747         kas.a_hat->hat_prev = NULL;
 748 
 749         /*
 750          * Allocate an htable hash bucket for the kernel
 751          * XX64 - tune for 64 bit procs
 752          */
 753         kas.a_hat->hat_num_hash = mmu.hash_cnt;
 754         kas.a_hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_NOSLEEP);
 755         bzero(kas.a_hat->hat_ht_hash, mmu.hash_cnt * sizeof (htable_t *));
 756 
 757         /*
 758          * zero out the top level and cached htable pointers
 759          */
 760         kas.a_hat->hat_ht_cached = NULL;
 761         kas.a_hat->hat_htable = NULL;
 762 
 763         /*
 764          * Pre-allocate hrm_hashtab before enabling the collection of
 765          * refmod statistics.  Allocating on the fly would mean us
 766          * running the risk of suffering recursive mutex enters or
 767          * deadlocks.
 768          */
 769         hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
 770             KM_SLEEP);
 771 }
 772 
 773 /*
 774  * Prepare CPU specific pagetables for VLP processes on 64 bit kernels.
 775  *
 776  * Each CPU has a set of 2 pagetables that are reused for any 32 bit
 777  * process it runs. They are the top level pagetable, hci_vlp_l3ptes, and
 778  * the next to top level table for the bottom 512 Gig, hci_vlp_l2ptes.
 779  */
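/*
 * hci_vlp_l3ptes[0] is pointed at hci_vlp_l2ptes (see the MAKEPTP() call
 * below), so hat_switch() only needs to copy a VLP hat's 4 hat_vlp_ptes
 * into hci_vlp_l2ptes and load hci_vlp_pfn into cr3.
 */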
 780 /*ARGSUSED*/
 781 static void
 782 hat_vlp_setup(struct cpu *cpu)
 783 {
 784 #if defined(__amd64) && !defined(__xpv)
 785         struct hat_cpu_info *hci = cpu->cpu_hat_info;
 786         pfn_t pfn;
 787 
 788         /*
	 * allocate the level==2 page table for the bottommost
 790          * 512Gig of address space (this is where 32 bit apps live)
 791          */
 792         ASSERT(hci != NULL);
 793         hci->hci_vlp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
 794 
 795         /*
 796          * Allocate a top level pagetable and copy the kernel's
 797          * entries into it. Then link in hci_vlp_l2ptes in the 1st entry.
 798          */
 799         hci->hci_vlp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
 800         hci->hci_vlp_pfn =
 801             hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l3ptes);
 802         ASSERT(hci->hci_vlp_pfn != PFN_INVALID);
 803         bcopy(vlp_page, hci->hci_vlp_l3ptes, MMU_PAGESIZE);
 804 
 805         pfn = hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l2ptes);
 806         ASSERT(pfn != PFN_INVALID);
 807         hci->hci_vlp_l3ptes[0] = MAKEPTP(pfn, 2);
 808 #endif /* __amd64 && !__xpv */
 809 }
 810 
 811 /*ARGSUSED*/
 812 static void
 813 hat_vlp_teardown(cpu_t *cpu)
 814 {
 815 #if defined(__amd64) && !defined(__xpv)
 816         struct hat_cpu_info *hci;
 817 
 818         if ((hci = cpu->cpu_hat_info) == NULL)
 819                 return;
 820         if (hci->hci_vlp_l2ptes)
 821                 kmem_free(hci->hci_vlp_l2ptes, MMU_PAGESIZE);
 822         if (hci->hci_vlp_l3ptes)
 823                 kmem_free(hci->hci_vlp_l3ptes, MMU_PAGESIZE);
 824 #endif
 825 }
 826 
 827 #define NEXT_HKR(r, l, s, e) {                  \
 828         kernel_ranges[r].hkr_level = l;         \
 829         kernel_ranges[r].hkr_start_va = s;      \
 830         kernel_ranges[r].hkr_end_va = e;        \
 831         ++r;                                    \
 832 }
 833 
 834 /*
 835  * Finish filling in the kernel hat.
 * Pre-fill all top level kernel page table entries for the kernel's
 * part of the address range.  From this point on we can't use any new
 * kernel large pages if they need PTEs at max_level.
 *
 * Also create the kmap mappings.
 841  */
 842 void
 843 hat_init_finish(void)
 844 {
 845         size_t          size;
 846         uint_t          r = 0;
 847         uintptr_t       va;
 848         hat_kernel_range_t *rp;
 849 
 850 
 851         /*
 852          * We are now effectively running on the kernel hat.
 853          * Clearing use_boot_reserve shuts off using the pre-allocated boot
 854          * reserve for all HAT allocations.  From here on, the reserves are
 855          * only used when avoiding recursion in kmem_alloc().
 856          */
 857         use_boot_reserve = 0;
 858         htable_adjust_reserve();
 859 
 860         /*
 861          * User HATs are initialized with copies of all kernel mappings in
 862          * higher level page tables. Ensure that those entries exist.
 863          */
 864 #if defined(__amd64)
 865 
 866         NEXT_HKR(r, 3, kernelbase, 0);
 867 #if defined(__xpv)
 868         NEXT_HKR(r, 3, HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_END);
 869 #endif
 870 
 871 #elif defined(__i386)
 872 
 873 #if !defined(__xpv)
 874         if (mmu.pae_hat) {
 875                 va = kernelbase;
 876                 if ((va & LEVEL_MASK(2)) != va) {
 877                         va = P2ROUNDUP(va, LEVEL_SIZE(2));
 878                         NEXT_HKR(r, 1, kernelbase, va);
 879                 }
 880                 if (va != 0)
 881                         NEXT_HKR(r, 2, va, 0);
 882         } else
 883 #endif /* __xpv */
 884                 NEXT_HKR(r, 1, kernelbase, 0);
 885 
 886 #endif /* __i386 */
 887 
 888         num_kernel_ranges = r;
 889 
 890         /*
 891          * Create all the kernel pagetables that will have entries
 892          * shared to user HATs.
 893          */
 894         for (r = 0; r < num_kernel_ranges; ++r) {
 895                 rp = &kernel_ranges[r];
 896                 for (va = rp->hkr_start_va; va != rp->hkr_end_va;
 897                     va += LEVEL_SIZE(rp->hkr_level)) {
 898                         htable_t *ht;
 899 
 900                         if (IN_HYPERVISOR_VA(va))
 901                                 continue;
 902 
 903                         /* can/must skip if a page mapping already exists */
 904                         if (rp->hkr_level <= mmu.max_page_level &&
 905                             (ht = htable_getpage(kas.a_hat, va, NULL)) !=
 906                             NULL) {
 907                                 htable_release(ht);
 908                                 continue;
 909                         }
 910 
 911                         (void) htable_create(kas.a_hat, va, rp->hkr_level - 1,
 912                             NULL);
 913                 }
 914         }
 915 
 916         /*
 917          * 32 bit PAE metal kernels use only 4 of the 512 entries in the
 918          * page holding the top level pagetable. We use the remainder for
 919          * the "per CPU" page tables for VLP processes.
 920          * Map the top level kernel pagetable into the kernel to make
	 * it easy to use bcopy to access these tables.
 922          */
 923         if (mmu.pae_hat) {
 924                 vlp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
 925                 hat_devload(kas.a_hat, (caddr_t)vlp_page, MMU_PAGESIZE,
 926                     kas.a_hat->hat_htable->ht_pfn,
 927 #if !defined(__xpv)
 928                     PROT_WRITE |
 929 #endif
 930                     PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
 931                     HAT_LOAD | HAT_LOAD_NOCONSIST);
 932         }
 933         hat_vlp_setup(CPU);
 934 
 935         /*
 936          * Create kmap (cached mappings of kernel PTEs)
 937          * for 32 bit we map from segmap_start .. ekernelheap
 938          * for 64 bit we map from segmap_start .. segmap_start + segmapsize;
 939          */
 940 #if defined(__i386)
 941         size = (uintptr_t)ekernelheap - segmap_start;
 942 #elif defined(__amd64)
 943         size = segmapsize;
 944 #endif
 945         hat_kmap_init((uintptr_t)segmap_start, size);
 946 }
 947 
 948 /*
 * In 32 bit PAE mode, PTEs are 64 bits, but ordinary atomic memory references
 950  * are 32 bit, so for safety we must use atomic_cas_64() to install these.
 951  */
 952 #ifdef __i386
 953 static void
 954 reload_pae32(hat_t *hat, cpu_t *cpu)
 955 {
 956         x86pte_t *src;
 957         x86pte_t *dest;
 958         x86pte_t pte;
 959         int i;
 960 
 961         /*
 962          * Load the 4 entries of the level 2 page table into this
 963          * cpu's range of the vlp_page and point cr3 at them.
 964          */
 965         ASSERT(mmu.pae_hat);
 966         src = hat->hat_vlp_ptes;
 967         dest = vlp_page + (cpu->cpu_id + 1) * VLP_NUM_PTES;
 968         for (i = 0; i < VLP_NUM_PTES; ++i) {
 969                 for (;;) {
 970                         pte = dest[i];
 971                         if (pte == src[i])
 972                                 break;
 973                         if (atomic_cas_64(dest + i, pte, src[i]) != src[i])
 974                                 break;
 975                 }
 976         }
 977 }
 978 #endif
 979 
 980 /*
 981  * Switch to a new active hat, maintaining bit masks to track active CPUs.
 982  *
 983  * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it
 984  * remains a 32-bit value.
 985  */
 986 void
 987 hat_switch(hat_t *hat)
 988 {
 989         uint64_t        newcr3;
 990         cpu_t           *cpu = CPU;
 991         hat_t           *old = cpu->cpu_current_hat;
 992 
 993         /*
 994          * set up this information first, so we don't miss any cross calls
 995          */
 996         if (old != NULL) {
 997                 if (old == hat)
 998                         return;
 999                 if (old != kas.a_hat)
1000                         CPUSET_ATOMIC_DEL(old->hat_cpus, cpu->cpu_id);
1001         }
1002 
1003         /*
1004          * Add this CPU to the active set for this HAT.
1005          */
1006         if (hat != kas.a_hat) {
1007                 CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id);
1008         }
1009         cpu->cpu_current_hat = hat;
1010 
1011         /*
1012          * now go ahead and load cr3
1013          */
1014         if (hat->hat_flags & HAT_VLP) {
1015 #if defined(__amd64)
1016                 x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;
1017 
1018                 VLP_COPY(hat->hat_vlp_ptes, vlpptep);
1019                 newcr3 = MAKECR3(cpu->cpu_hat_info->hci_vlp_pfn);
1020 #elif defined(__i386)
1021                 reload_pae32(hat, cpu);
1022                 newcr3 = MAKECR3(kas.a_hat->hat_htable->ht_pfn) +
1023                     (cpu->cpu_id + 1) * VLP_SIZE;
1024 #endif
1025         } else {
1026                 newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn);
1027         }
1028 #ifdef __xpv
1029         {
1030                 struct mmuext_op t[2];
1031                 uint_t retcnt;
1032                 uint_t opcnt = 1;
1033 
1034                 t[0].cmd = MMUEXT_NEW_BASEPTR;
1035                 t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
1036 #if defined(__amd64)
1037                 /*
1038                  * There's an interesting problem here, as to what to
1039                  * actually specify when switching to the kernel hat.
1040                  * For now we'll reuse the kernel hat again.
1041                  */
1042                 t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
1043                 if (hat == kas.a_hat)
1044                         t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
1045                 else
1046                         t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
1047                 ++opcnt;
1048 #endif  /* __amd64 */
1049                 if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
1050                         panic("HYPERVISOR_mmu_update() failed");
1051                 ASSERT(retcnt == opcnt);
1052 
1053         }
1054 #else
1055         setcr3(newcr3);
1056 #endif
1057         ASSERT(cpu == CPU);
1058 }
1059 
1060 /*
1061  * Utility to return a valid x86pte_t from protections, pfn, and level number
1062  */
1063 static x86pte_t
1064 hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags)
1065 {
1066         x86pte_t        pte;
1067         uint_t          cache_attr = attr & HAT_ORDER_MASK;
1068 
1069         pte = MAKEPTE(pfn, level);
1070 
1071         if (attr & PROT_WRITE)
1072                 PTE_SET(pte, PT_WRITABLE);
1073 
1074         if (attr & PROT_USER)
1075                 PTE_SET(pte, PT_USER);
1076 
1077         if (!(attr & PROT_EXEC))
1078                 PTE_SET(pte, mmu.pt_nx);
1079 
1080         /*
	 * Set the software bits used to track ref/mod syncs and hments.
1082          * If not using REF/MOD, set them to avoid h/w rewriting PTEs.
1083          */
1084         if (flags & HAT_LOAD_NOCONSIST)
1085                 PTE_SET(pte, PT_NOCONSIST | PT_REF | PT_MOD);
1086         else if (attr & HAT_NOSYNC)
1087                 PTE_SET(pte, PT_NOSYNC | PT_REF | PT_MOD);
1088 
1089         /*
	 * Set the caching attributes in the PTE. The combinations
1091          * of attributes are poorly defined, so we pay attention
1092          * to them in the given order.
1093          *
1094          * The test for HAT_STRICTORDER is different because it's defined
	 * as "0" - which was a stupid thing to do, but it is too late to change!
1096          */
1097         if (cache_attr == HAT_STRICTORDER) {
1098                 PTE_SET(pte, PT_NOCACHE);
1099         /*LINTED [Lint hates empty ifs, but it's the obvious way to do this] */
1100         } else if (cache_attr & (HAT_UNORDERED_OK | HAT_STORECACHING_OK)) {
1101                 /* nothing to set */;
1102         } else if (cache_attr & (HAT_MERGING_OK | HAT_LOADCACHING_OK)) {
1103                 PTE_SET(pte, PT_NOCACHE);
1104                 if (is_x86_feature(x86_featureset, X86FSET_PAT))
1105                         PTE_SET(pte, (level == 0) ? PT_PAT_4K : PT_PAT_LARGE);
1106                 else
1107                         PTE_SET(pte, PT_WRITETHRU);
1108         } else {
1109                 panic("hati_mkpte(): bad caching attributes: %x\n", cache_attr);
1110         }
1111 
1112         return (pte);
1113 }
1114 
1115 /*
1116  * Duplicate address translations of the parent to the child.
1117  * This function really isn't used anymore.
1118  */
1119 /*ARGSUSED*/
1120 int
1121 hat_dup(hat_t *old, hat_t *new, caddr_t addr, size_t len, uint_t flag)
1122 {
1123         ASSERT((uintptr_t)addr < kernelbase);
1124         ASSERT(new != kas.a_hat);
1125         ASSERT(old != kas.a_hat);
1126         return (0);
1127 }
1128 
1129 /*
1130  * Allocate any hat resources required for a process being swapped in.
1131  */
1132 /*ARGSUSED*/
1133 void
1134 hat_swapin(hat_t *hat)
1135 {
1136         /* do nothing - we let everything fault back in */
1137 }
1138 
1139 /*
1140  * Unload all translations associated with an address space of a process
1141  * that is being swapped out.
1142  */
1143 void
1144 hat_swapout(hat_t *hat)
1145 {
1146         uintptr_t       vaddr = (uintptr_t)0;
1147         uintptr_t       eaddr = _userlimit;
1148         htable_t        *ht = NULL;
1149         level_t         l;
1150 
1151         XPV_DISALLOW_MIGRATE();
1152         /*
1153          * We can't just call hat_unload(hat, 0, _userlimit...)  here, because
1154          * seg_spt and shared pagetables can't be swapped out.
1155          * Take a look at segspt_shmswapout() - it's a big no-op.
1156          *
1157          * Instead we'll walk through all the address space and unload
1158          * any mappings which we are sure are not shared, not locked.
1159          */
1160         ASSERT(IS_PAGEALIGNED(vaddr));
1161         ASSERT(IS_PAGEALIGNED(eaddr));
1162         ASSERT(AS_LOCK_HELD(hat->hat_as));
1163         if ((uintptr_t)hat->hat_as->a_userlimit < eaddr)
1164                 eaddr = (uintptr_t)hat->hat_as->a_userlimit;
1165 
1166         while (vaddr < eaddr) {
1167                 (void) htable_walk(hat, &ht, &vaddr, eaddr);
1168                 if (ht == NULL)
1169                         break;
1170 
1171                 ASSERT(!IN_VA_HOLE(vaddr));
1172 
1173                 /*
1174                  * If the page table is shared skip its entire range.
1175                  */
1176                 l = ht->ht_level;
1177                 if (ht->ht_flags & HTABLE_SHARED_PFN) {
1178                         vaddr = ht->ht_vaddr + LEVEL_SIZE(l + 1);
1179                         htable_release(ht);
1180                         ht = NULL;
1181                         continue;
1182                 }
1183 
1184                 /*
1185                  * If the page table has no locked entries, unload this one.
1186                  */
1187                 if (ht->ht_lock_cnt == 0)
1188                         hat_unload(hat, (caddr_t)vaddr, LEVEL_SIZE(l),
1189                             HAT_UNLOAD_UNMAP);
1190 
1191                 /*
1192                  * If we have a level 0 page table with locked entries,
1193                  * skip the entire page table, otherwise skip just one entry.
1194                  */
1195                 if (ht->ht_lock_cnt > 0 && l == 0)
1196                         vaddr = ht->ht_vaddr + LEVEL_SIZE(1);
1197                 else
1198                         vaddr += LEVEL_SIZE(l);
1199         }
1200         if (ht)
1201                 htable_release(ht);
1202 
1203         /*
1204          * We're in swapout because the system is low on memory, so
1205          * go back and flush all the htables off the cached list.
1206          */
1207         htable_purge_hat(hat);
1208         XPV_ALLOW_MIGRATE();
1209 }
1210 
1211 /*
1212  * returns number of bytes that have valid mappings in hat.
1213  */
1214 size_t
1215 hat_get_mapped_size(hat_t *hat)
1216 {
1217         size_t total = 0;
1218         int l;
1219 
1220         for (l = 0; l <= mmu.max_page_level; l++)
1221                 total += (hat->hat_pages_mapped[l] << LEVEL_SHIFT(l));
1222         total += hat->hat_ism_pgcnt;
1223 
1224         return (total);
1225 }
1226 
1227 /*
1228  * enable/disable collection of stats for hat.
1229  */
1230 int
1231 hat_stats_enable(hat_t *hat)
1232 {
1233         atomic_inc_32(&hat->hat_stats);
1234         return (1);
1235 }
1236 
1237 void
1238 hat_stats_disable(hat_t *hat)
1239 {
1240         atomic_dec_32(&hat->hat_stats);
1241 }
1242 
1243 /*
1244  * Utility to sync the ref/mod bits from a page table entry to the page_t
1245  * We must be holding the mapping list lock when this is called.
1246  */
1247 static void
1248 hati_sync_pte_to_page(page_t *pp, x86pte_t pte, level_t level)
1249 {
1250         uint_t  rm = 0;
1251         pgcnt_t pgcnt;
1252 
1253         if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC)
1254                 return;
1255 
1256         if (PTE_GET(pte, PT_REF))
1257                 rm |= P_REF;
1258 
1259         if (PTE_GET(pte, PT_MOD))
1260                 rm |= P_MOD;
1261 
1262         if (rm == 0)
1263                 return;
1264 
1265         /*
1266          * sync to all constituent pages of a large page
1267          */
1268         ASSERT(x86_hm_held(pp));
1269         pgcnt = page_get_pagecnt(level);
1270         ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt));
1271         for (; pgcnt > 0; --pgcnt) {
1272                 /*
1273                  * hat_page_demote() can't decrease
1274                  * pszc below this mapping size
1275                  * since this large mapping existed after we
1276                  * took mlist lock.
1277                  */
1278                 ASSERT(pp->p_szc >= level);
1279                 hat_page_setattr(pp, rm);
1280                 ++pp;
1281         }
1282 }
1283 
1284 /*
 * This is the set of PTE bits for PFN, permissions and caching
1286  * that are allowed to change on a HAT_LOAD_REMAP
1287  */
1288 #define PT_REMAP_BITS                                                   \
1289         (PT_PADDR | PT_NX | PT_WRITABLE | PT_WRITETHRU |                \
1290         PT_NOCACHE | PT_PAT_4K | PT_PAT_LARGE | PT_IGNORE | PT_REF | PT_MOD)
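/*
 * hati_pte_map() panics if a remap changes any PTE bit outside this set;
 * in particular the pfn (PT_PADDR) may only change when both HAT_LOAD_REMAP
 * and HAT_LOAD_NOCONSIST are specified.
 */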
1291 
1292 #define REMAPASSERT(EX) if (!(EX)) panic("hati_pte_map: " #EX)
1293 /*
1294  * Do the low-level work to get a mapping entered into a HAT's pagetables
1295  * and in the mapping list of the associated page_t.
1296  */
1297 static int
1298 hati_pte_map(
1299         htable_t        *ht,
1300         uint_t          entry,
1301         page_t          *pp,
1302         x86pte_t        pte,
1303         int             flags,
1304         void            *pte_ptr)
1305 {
1306         hat_t           *hat = ht->ht_hat;
1307         x86pte_t        old_pte;
1308         level_t         l = ht->ht_level;
1309         hment_t         *hm;
1310         uint_t          is_consist;
1311         uint_t          is_locked;
1312         int             rv = 0;
1313 
1314         /*
1315          * Is this a consistent (ie. need mapping list lock) mapping?
1316          */
1317         is_consist = (pp != NULL && (flags & HAT_LOAD_NOCONSIST) == 0);
1318 
1319         /*
1320          * Track locked mapping count in the htable.  Do this first,
1321          * as we track locking even if there already is a mapping present.
1322          */
1323         is_locked = (flags & HAT_LOAD_LOCK) != 0 && hat != kas.a_hat;
1324         if (is_locked)
1325                 HTABLE_LOCK_INC(ht);
1326 
1327         /*
1328          * Acquire the page's mapping list lock and get an hment to use.
1329          * Note that hment_prepare() might return NULL.
1330          */
1331         if (is_consist) {
1332                 x86_hm_enter(pp);
1333                 hm = hment_prepare(ht, entry, pp);
1334         }
1335 
1336         /*
1337          * Set the new pte, retrieving the old one at the same time.
1338          */
1339         old_pte = x86pte_set(ht, entry, pte, pte_ptr);
1340 
1341         /*
1342          * Did we get a large page / page table collision?
1343          */
1344         if (old_pte == LPAGE_ERROR) {
1345                 if (is_locked)
1346                         HTABLE_LOCK_DEC(ht);
1347                 rv = -1;
1348                 goto done;
1349         }
1350 
1351         /*
1352          * If the mapping didn't change there is nothing more to do.
1353          */
1354         if (PTE_EQUIV(pte, old_pte))
1355                 goto done;
1356 
1357         /*
1358          * Install a new mapping in the page's mapping list
1359          */
1360         if (!PTE_ISVALID(old_pte)) {
1361                 if (is_consist) {
1362                         hment_assign(ht, entry, pp, hm);
1363                         x86_hm_exit(pp);
1364                 } else {
1365                         ASSERT(flags & HAT_LOAD_NOCONSIST);
1366                 }
1367 #if defined(__amd64)
1368                 if (ht->ht_flags & HTABLE_VLP) {
1369                         cpu_t *cpu = CPU;
1370                         x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;
1371                         VLP_COPY(hat->hat_vlp_ptes, vlpptep);
1372                 }
1373 #endif
1374                 HTABLE_INC(ht->ht_valid_cnt);
1375                 PGCNT_INC(hat, l);
1376                 return (rv);
1377         }
1378 
1379         /*
1380          * Remap's are more complicated:
1381          *  - HAT_LOAD_REMAP must be specified if changing the pfn.
1382          *    We also require that NOCONSIST be specified.
1383          *  - Otherwise only permission or caching bits may change.
1384          */
1385         if (!PTE_ISPAGE(old_pte, l))
1386                 panic("non-null/page mapping pte=" FMT_PTE, old_pte);
1387 
1388         if (PTE2PFN(old_pte, l) != PTE2PFN(pte, l)) {
1389                 REMAPASSERT(flags & HAT_LOAD_REMAP);
1390                 REMAPASSERT(flags & HAT_LOAD_NOCONSIST);
1391                 REMAPASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST);
1392                 REMAPASSERT(pf_is_memory(PTE2PFN(old_pte, l)) ==
1393                     pf_is_memory(PTE2PFN(pte, l)));
1394                 REMAPASSERT(!is_consist);
1395         }
1396 
1397         /*
	 * We only let remaps change certain bits in the PTE.
1399          */
1400         if (PTE_GET(old_pte, ~PT_REMAP_BITS) != PTE_GET(pte, ~PT_REMAP_BITS))
1401                 panic("remap bits changed: old_pte="FMT_PTE", pte="FMT_PTE"\n",
1402                     old_pte, pte);
1403 
1404         /*
1405          * We don't create any mapping list entries on a remap, so release
1406          * any allocated hment after we drop the mapping list lock.
1407          */
1408 done:
1409         if (is_consist) {
1410                 x86_hm_exit(pp);
1411                 if (hm != NULL)
1412                         hment_free(hm);
1413         }
1414         return (rv);
1415 }
1416 
1417 /*
1418  * Internal routine to load a single page table entry. This only fails if
1419  * we attempt to overwrite a page table link with a large page.
1420  */
1421 static int
1422 hati_load_common(
1423         hat_t           *hat,
1424         uintptr_t       va,
1425         page_t          *pp,
1426         uint_t          attr,
1427         uint_t          flags,
1428         level_t         level,
1429         pfn_t           pfn)
1430 {
1431         htable_t        *ht;
1432         uint_t          entry;
1433         x86pte_t        pte;
1434         int             rv = 0;
1435 
1436         /*
1437          * The number 16 is arbitrary and here to catch a recursion problem
1438          * early before we blow out the kernel stack.
1439          */
1440         ++curthread->t_hatdepth;
1441         ASSERT(curthread->t_hatdepth < 16);
1442 
1443         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
1444 
1445         if (flags & HAT_LOAD_SHARE)
1446                 hat->hat_flags |= HAT_SHARED;
1447 
1448         /*
1449          * Find the page table that maps this page if it already exists.
1450          */
1451         ht = htable_lookup(hat, va, level);
1452 
1453         /*
1454          * We must have HAT_LOAD_NOCONSIST if page_t is NULL.
1455          */
1456         if (pp == NULL)
1457                 flags |= HAT_LOAD_NOCONSIST;
1458 
1459         if (ht == NULL) {
1460                 ht = htable_create(hat, va, level, NULL);
1461                 ASSERT(ht != NULL);
1462         }
1463         entry = htable_va2entry(va, ht);
1464 
1465         /*
1466          * a bunch of paranoid error checking
1467          */
1468         ASSERT(ht->ht_busy > 0);
1469         if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht))
1470                 panic("hati_load_common: bad htable %p, va %p",
1471                     (void *)ht, (void *)va);
1472         ASSERT(ht->ht_level == level);
1473 
1474         /*
1475          * construct the new PTE
1476          */
1477         if (hat == kas.a_hat)
1478                 attr &= ~PROT_USER;
1479         pte = hati_mkpte(pfn, attr, level, flags);
1480         if (hat == kas.a_hat && va >= kernelbase)
1481                 PTE_SET(pte, mmu.pt_global);
1482 
1483         /*
1484          * establish the mapping
1485          */
1486         rv = hati_pte_map(ht, entry, pp, pte, flags, NULL);
1487 
1488         /*
1489          * release the htable and any reserves
1490          */
1491         htable_release(ht);
1492         --curthread->t_hatdepth;
1493         return (rv);
1494 }
1495 
1496 /*
1497  * special case of hat_memload to deal with some kernel addrs for performance
1498  */
1499 static void
1500 hat_kmap_load(
1501         caddr_t         addr,
1502         page_t          *pp,
1503         uint_t          attr,
1504         uint_t          flags)
1505 {
1506         uintptr_t       va = (uintptr_t)addr;
1507         x86pte_t        pte;
1508         pfn_t           pfn = page_pptonum(pp);
1509         pgcnt_t         pg_off = mmu_btop(va - mmu.kmap_addr);
1510         htable_t        *ht;
1511         uint_t          entry;
1512         void            *pte_ptr;
1513 
1514         /*
1515          * construct the requested PTE
1516          */
1517         attr &= ~PROT_USER;
1518         attr |= HAT_STORECACHING_OK;
1519         pte = hati_mkpte(pfn, attr, 0, flags);
1520         PTE_SET(pte, mmu.pt_global);
1521 
1522         /*
1523          * Figure out the pte_ptr and htable and use common code to finish up
1524          */
1525         if (mmu.pae_hat)
1526                 pte_ptr = mmu.kmap_ptes + pg_off;
1527         else
1528                 pte_ptr = (x86pte32_t *)mmu.kmap_ptes + pg_off;
1529         ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr) >>
1530             LEVEL_SHIFT(1)];
1531         entry = htable_va2entry(va, ht);
1532         ++curthread->t_hatdepth;
1533         ASSERT(curthread->t_hatdepth < 16);
1534         (void) hati_pte_map(ht, entry, pp, pte, flags, pte_ptr);
1535         --curthread->t_hatdepth;
1536 }
1537 
1538 /*
1539  * hat_memload() - load a translation to the given page struct
1540  *
1541  * Flags for hat_memload/hat_devload/hat_*attr.
1542  *
1543  *      HAT_LOAD        Default flags to load a translation to the page.
1544  *
1545  *      HAT_LOAD_LOCK   Lock down mapping resources; hat_map(), hat_memload(),
1546  *                      and hat_devload().
1547  *
1548  *      HAT_LOAD_NOCONSIST Do not add mapping to page_t mapping list.
1549  *                      sets PT_NOCONSIST
1550  *
1551  *      HAT_LOAD_SHARE  A flag to hat_memload() to indicate h/w page tables
 *			that map some user pages (not kas) are shared by more
1553  *                      than one process (eg. ISM).
1554  *
1555  *      HAT_LOAD_REMAP  Reload a valid pte with a different page frame.
1556  *
1557  *      HAT_NO_KALLOC   Do not kmem_alloc while creating the mapping; at this
1558  *                      point, it's setting up mapping to allocate internal
1559  *                      hat layer data structures.  This flag forces hat layer
1560  *                      to tap its reserves in order to prevent infinite
1561  *                      recursion.
1562  *
1563  * The following is a protection attribute (like PROT_READ, etc.)
1564  *
1565  *      HAT_NOSYNC      set PT_NOSYNC - this mapping's ref/mod bits
1566  *                      are never cleared.
1567  *
1568  * Installing new valid PTE's and creation of the mapping list
1569  * entry are controlled under the same lock. It's derived from the
1570  * page_t being mapped.
1571  */
1572 static uint_t supported_memload_flags =
1573         HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_ADV | HAT_LOAD_NOCONSIST |
1574         HAT_LOAD_SHARE | HAT_NO_KALLOC | HAT_LOAD_REMAP | HAT_LOAD_TEXT;
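
     /*
      * Illustrative use only (the 'as' and 'pp' here are hypothetical; real
      * callers are segment drivers):
      *
      *     hat_memload(as->a_hat, addr, pp,
      *         PROT_READ | PROT_WRITE | HAT_NOSYNC,
      *         HAT_LOAD | HAT_LOAD_LOCK);
      *
      * loads a locked, writable mapping to pp at addr whose ref/mod bits are
      * never cleared (HAT_NOSYNC), combining the attr and flag values
      * documented above.
      */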
1575 
1576 void
1577 hat_memload(
1578         hat_t           *hat,
1579         caddr_t         addr,
1580         page_t          *pp,
1581         uint_t          attr,
1582         uint_t          flags)
1583 {
1584         uintptr_t       va = (uintptr_t)addr;
1585         level_t         level = 0;
1586         pfn_t           pfn = page_pptonum(pp);
1587 
1588         XPV_DISALLOW_MIGRATE();
1589         ASSERT(IS_PAGEALIGNED(va));
1590         ASSERT(hat == kas.a_hat || va < _userlimit);
1591         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
1592         ASSERT((flags & supported_memload_flags) == flags);
1593 
1594         ASSERT(!IN_VA_HOLE(va));
1595         ASSERT(!PP_ISFREE(pp));
1596 
1597         /*
1598          * kernel address special case for performance.
1599          */
1600         if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
1601                 ASSERT(hat == kas.a_hat);
1602                 hat_kmap_load(addr, pp, attr, flags);
1603                 XPV_ALLOW_MIGRATE();
1604                 return;
1605         }
1606 
1607         /*
1608          * This is used for memory with normal caching enabled, so
1609          * always set HAT_STORECACHING_OK.
1610          */
1611         attr |= HAT_STORECACHING_OK;
1612         if (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0)
1613                 panic("unexpected hati_load_common() failure");
1614         XPV_ALLOW_MIGRATE();
1615 }
1616 
1617 /* ARGSUSED */
1618 void
1619 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
1620     uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
1621 {
1622         hat_memload(hat, addr, pp, attr, flags);
1623 }
1624 
1625 /*
1626  * Load the given array of page structs using large pages when possible
1627  */
1628 void
1629 hat_memload_array(
1630         hat_t           *hat,
1631         caddr_t         addr,
1632         size_t          len,
1633         page_t          **pages,
1634         uint_t          attr,
1635         uint_t          flags)
1636 {
1637         uintptr_t       va = (uintptr_t)addr;
1638         uintptr_t       eaddr = va + len;
1639         level_t         level;
1640         size_t          pgsize;
1641         pgcnt_t         pgindx = 0;
1642         pfn_t           pfn;
1643         pgcnt_t         i;
1644 
1645         XPV_DISALLOW_MIGRATE();
1646         ASSERT(IS_PAGEALIGNED(va));
1647         ASSERT(hat == kas.a_hat || va + len <= _userlimit);
1648         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
1649         ASSERT((flags & supported_memload_flags) == flags);
1650 
1651         /*
1652          * memload is used for memory with full caching enabled, so
1653          * set HAT_STORECACHING_OK.
1654          */
1655         attr |= HAT_STORECACHING_OK;
1656 
1657         /*
1658          * handle all pages using largest possible pagesize
1659          */
1660         while (va < eaddr) {
1661                 /*
1662                  * decide what level of mapping to use (i.e. pagesize)
1663                  */
1664                 pfn = page_pptonum(pages[pgindx]);
1665                 for (level = mmu.max_page_level; ; --level) {
1666                         pgsize = LEVEL_SIZE(level);
1667                         if (level == 0)
1668                                 break;
1669 
1670                         if (!IS_P2ALIGNED(va, pgsize) ||
1671                             (eaddr - va) < pgsize ||
1672                             !IS_P2ALIGNED(pfn_to_pa(pfn), pgsize))
1673                                 continue;
1674 
1675                         /*
1676                          * To use a large mapping of this size, all the
1677                          * pages we are passed must be sequential subpages
1678                          * of the large page.
1679                          * hat_page_demote() can't change p_szc because
1680                          * all pages are locked.
1681                          */
1682                         if (pages[pgindx]->p_szc >= level) {
1683                                 for (i = 0; i < mmu_btop(pgsize); ++i) {
1684                                         if (pfn + i !=
1685                                             page_pptonum(pages[pgindx + i]))
1686                                                 break;
1687                                         ASSERT(pages[pgindx + i]->p_szc >=
1688                                             level);
1689                                         ASSERT(pages[pgindx] + i ==
1690                                             pages[pgindx + i]);
1691                                 }
1692                                 if (i == mmu_btop(pgsize)) {
1693 #ifdef DEBUG
1694                                         if (level == 2)
1695                                                 map1gcnt++;
1696 #endif
1697                                         break;
1698                                 }
1699                         }
1700                 }
1701 
1702                 /*
1703                  * Load this page mapping. If the load fails, try a smaller
1704                  * pagesize.
1705                  */
1706                 ASSERT(!IN_VA_HOLE(va));
1707                 while (hati_load_common(hat, va, pages[pgindx], attr,
1708                     flags, level, pfn) != 0) {
1709                         if (level == 0)
1710                                 panic("unexpected hati_load_common() failure");
1711                         --level;
1712                         pgsize = LEVEL_SIZE(level);
1713                 }
1714 
1715                 /*
1716                  * move to next page
1717                  */
1718                 va += pgsize;
1719                 pgindx += mmu_btop(pgsize);
1720         }
1721         XPV_ALLOW_MIGRATE();
1722 }
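
     /*
      * A concrete instance of the level selection above (illustrative,
      * assuming amd64 with 2MB level 1 pages, i.e. LEVEL_SIZE(1) == 2MB):
      * a single 2MB mapping is used for a chunk only when
      *
      *     - va is 2MB aligned and at least 2MB remains before eaddr,
      *     - pfn_to_pa(pfn) is 2MB aligned,
      *     - pages[pgindx]->p_szc is at least 1, and
      *     - pages[pgindx .. pgindx + 511] really are the 512 consecutive
      *       subpages of that large page.
      *
      * If any test fails, the chunk falls back to 4K (level 0) mappings.
      */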
1723 
1724 /* ARGSUSED */
1725 void
1726 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
1727     struct page **pps, uint_t attr, uint_t flags,
1728     hat_region_cookie_t rcookie)
1729 {
1730         hat_memload_array(hat, addr, len, pps, attr, flags);
1731 }
1732 
1733 /*
1734  * void hat_devload(hat, addr, len, pfn, attr, flags)
1735  *      load/lock the given page frame number
1736  *
1737  * Advisory ordering attributes. Apply only to device mappings.
1738  *
1739  * HAT_STRICTORDER: the CPU must issue the references in order, as the
1740  *      programmer specified.  This is the default.
1741  * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds
1742  *      of reordering; store or load with store or load).
1743  * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores
1744  *      to consecutive locations (for example, turn two consecutive byte
1745  *      stores into one halfword store), and it may batch individual loads
1746  *      (for example, turn two consecutive byte loads into one halfword load).
1747  *      This also implies re-ordering.
1748  * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it
1749  *      until another store occurs.  The default is to fetch new data
1750  *      on every load.  This also implies merging.
1751  * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to
1752  *      the device (perhaps with other data) at a later time.  The default is
1753  *      to push the data right away.  This also implies load caching.
1754  *
1755  * Equivalent of hat_memload(), but can be used for device memory where
1756  * there are no page_t's and we support additional flags (write merging, etc.).
1757  * Note that we can have large page mappings with this interface.
1758  */
1759 int supported_devload_flags = HAT_LOAD | HAT_LOAD_LOCK |
1760         HAT_LOAD_NOCONSIST | HAT_STRICTORDER | HAT_UNORDERED_OK |
1761         HAT_MERGING_OK | HAT_LOADCACHING_OK | HAT_STORECACHING_OK;
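
     /*
      * Illustrative (hypothetical) call, mapping one page of device registers
      * that must not be cached or reordered; the pfn would come from the
      * device's DDI mapping setup:
      *
      *     hat_devload(kas.a_hat, kva, MMU_PAGESIZE, pfn,
      *         PROT_READ | PROT_WRITE | HAT_STRICTORDER,
      *         HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
      *
      * Since such a pfn is not memory, the code below forces
      * HAT_LOAD_NOCONSIST anyway and never references a page_t for it.
      */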
1762 
1763 void
1764 hat_devload(
1765         hat_t           *hat,
1766         caddr_t         addr,
1767         size_t          len,
1768         pfn_t           pfn,
1769         uint_t          attr,
1770         int             flags)
1771 {
1772         uintptr_t       va = ALIGN2PAGE(addr);
1773         uintptr_t       eva = va + len;
1774         level_t         level;
1775         size_t          pgsize;
1776         page_t          *pp;
1777         int             f;      /* per PTE copy of flags  - maybe modified */
1778         uint_t          a;      /* per PTE copy of attr */
1779 
1780         XPV_DISALLOW_MIGRATE();
1781         ASSERT(IS_PAGEALIGNED(va));
1782         ASSERT(hat == kas.a_hat || eva <= _userlimit);
1783         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
1784         ASSERT((flags & supported_devload_flags) == flags);
1785 
1786         /*
1787          * handle all pages
1788          */
1789         while (va < eva) {
1790 
1791                 /*
1792                  * decide what level of mapping to use (i.e. pagesize)
1793                  */
1794                 for (level = mmu.max_page_level; ; --level) {
1795                         pgsize = LEVEL_SIZE(level);
1796                         if (level == 0)
1797                                 break;
1798                         if (IS_P2ALIGNED(va, pgsize) &&
1799                             (eva - va) >= pgsize &&
1800                             IS_P2ALIGNED(pfn, mmu_btop(pgsize))) {
1801 #ifdef DEBUG
1802                                 if (level == 2)
1803                                         map1gcnt++;
1804 #endif
1805                                 break;
1806                         }
1807                 }
1808 
1809                 /*
1810                  * If this is just memory then allow caching (this happens
1811                  * for the nucleus pages) - though HAT_PLAT_NOCACHE can be used
1812                  * to override that. If we don't have a page_t then make sure
1813                  * NOCONSIST is set.
1814                  */
1815                 a = attr;
1816                 f = flags;
1817                 if (!pf_is_memory(pfn))
1818                         f |= HAT_LOAD_NOCONSIST;
1819                 else if (!(a & HAT_PLAT_NOCACHE))
1820                         a |= HAT_STORECACHING_OK;
1821 
1822                 if (f & HAT_LOAD_NOCONSIST)
1823                         pp = NULL;
1824                 else
1825                         pp = page_numtopp_nolock(pfn);
1826 
1827                 /*
1828                  * Check to make sure we are really trying to map a valid
1829                  * memory page. A caller wishing to intentionally map
1830                  * free memory pages will have passed the HAT_LOAD_NOCONSIST
1831                  * flag, in which case pp will be NULL.
1832                  */
1833                 if (pp != NULL) {
1834                         if (PP_ISFREE(pp)) {
1835                                 panic("hat_devload: loading "
1836                                     "a mapping to free page %p", (void *)pp);
1837                         }
1838 
1839                         if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
1840                                 panic("hat_devload: loading a mapping "
1841                                     "to an unlocked page %p",
1842                                     (void *)pp);
1843                         }
1844                 }
1845 
1846                 /*
1847                  * load this page mapping
1848                  */
1849                 ASSERT(!IN_VA_HOLE(va));
1850                 while (hati_load_common(hat, va, pp, a, f, level, pfn) != 0) {
1851                         if (level == 0)
1852                                 panic("unexpected hati_load_common() failure");
1853                         --level;
1854                         pgsize = LEVEL_SIZE(level);
1855                 }
1856 
1857                 /*
1858                  * move to next page
1859                  */
1860                 va += pgsize;
1861                 pfn += mmu_btop(pgsize);
1862         }
1863         XPV_ALLOW_MIGRATE();
1864 }
1865 
1866 /*
1867  * void hat_unlock(hat, addr, len)
1868  *      unlock the mappings to a given range of addresses
1869  *
1870  * Locks are tracked by ht_lock_cnt in the htable.
1871  */
1872 void
1873 hat_unlock(hat_t *hat, caddr_t addr, size_t len)
1874 {
1875         uintptr_t       vaddr = (uintptr_t)addr;
1876         uintptr_t       eaddr = vaddr + len;
1877         htable_t        *ht = NULL;
1878 
1879         /*
1880          * kernel entries are always locked; we don't track lock counts for them
1881          */
1882         ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
1883         ASSERT(IS_PAGEALIGNED(vaddr));
1884         ASSERT(IS_PAGEALIGNED(eaddr));
1885         if (hat == kas.a_hat)
1886                 return;
1887         if (eaddr > _userlimit)
1888                 panic("hat_unlock() address out of range - above _userlimit");
1889 
1890         XPV_DISALLOW_MIGRATE();
1891         ASSERT(AS_LOCK_HELD(hat->hat_as));
1892         while (vaddr < eaddr) {
1893                 (void) htable_walk(hat, &ht, &vaddr, eaddr);
1894                 if (ht == NULL)
1895                         break;
1896 
1897                 ASSERT(!IN_VA_HOLE(vaddr));
1898 
1899                 if (ht->ht_lock_cnt < 1)
1900                         panic("hat_unlock(): lock_cnt < 1, "
1901                             "htable=%p, vaddr=%p\n", (void *)ht, (void *)vaddr);
1902                 HTABLE_LOCK_DEC(ht);
1903 
1904                 vaddr += LEVEL_SIZE(ht->ht_level);
1905         }
1906         if (ht)
1907                 htable_release(ht);
1908         XPV_ALLOW_MIGRATE();
1909 }
1910 
1911 /* ARGSUSED */
1912 void
1913 hat_unlock_region(struct hat *hat, caddr_t addr, size_t len,
1914     hat_region_cookie_t rcookie)
1915 {
1916         panic("No shared region support on x86");
1917 }
1918 
1919 #if !defined(__xpv)
1920 /*
1921  * Cross call service routine to demap a virtual page on
1922  * the current CPU, or to flush all mappings in the TLB.
1923  */
1924 /*ARGSUSED*/
1925 static int
1926 hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
1927 {
1928         hat_t   *hat = (hat_t *)a1;
1929         caddr_t addr = (caddr_t)a2;
1930         size_t len = (size_t)a3;
1931 
1932         /*
1933          * If the target hat isn't the kernel and this CPU isn't operating
1934          * in the target hat, we can ignore the cross call.
1935          */
1936         if (hat != kas.a_hat && hat != CPU->cpu_current_hat)
1937                 return (0);
1938 
1939         /*
1940          * For a normal address, we flush a range of contiguous mappings
1941          */
1942         if ((uintptr_t)addr != DEMAP_ALL_ADDR) {
1943                 for (size_t i = 0; i < len; i += MMU_PAGESIZE)
1944                         mmu_tlbflush_entry(addr + i);
1945                 return (0);
1946         }
1947 
1948         /*
1949          * Otherwise we reload cr3 to effect a complete TLB flush.
1950          *
1951  * A reload of cr3 for a VLP process also means we must recopy the
1952  * pte values from the struct hat.
1953          */
1954         if (hat->hat_flags & HAT_VLP) {
1955 #if defined(__amd64)
1956                 x86pte_t *vlpptep = CPU->cpu_hat_info->hci_vlp_l2ptes;
1957 
1958                 VLP_COPY(hat->hat_vlp_ptes, vlpptep);
1959 #elif defined(__i386)
1960                 reload_pae32(hat, CPU);
1961 #endif
1962         }
1963         reload_cr3();
1964         return (0);
1965 }
1966 
1967 /*
1968  * Flush all TLB entries, including global (i.e. kernel) ones.
1969  */
1970 static void
1971 flush_all_tlb_entries(void)
1972 {
1973         ulong_t cr4 = getcr4();
1974 
1975         if (cr4 & CR4_PGE) {
1976                 setcr4(cr4 & ~(ulong_t)CR4_PGE);
1977                 setcr4(cr4);
1978 
1979                 /*
1980                  * 32 bit PAE also always needs a reload_cr3()
1981                  */
1982                 if (mmu.max_level == 2)
1983                         reload_cr3();
1984         } else {
1985                 reload_cr3();
1986         }
1987 }
1988 
1989 #define TLB_CPU_HALTED  (01ul)
1990 #define TLB_INVAL_ALL   (02ul)
1991 #define CAS_TLB_INFO(cpu, old, new)     \
1992         atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new))
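
     /*
      * The two bits above implement a small handshake between an idle CPU and
      * a CPU doing a TLB shootdown; roughly (see tlb_going_idle(),
      * tlb_service() and hat_tlb_inval_range()):
      *
      *     idle CPU:       sets TLB_CPU_HALTED in mcpu_tlb_info, then halts
      *     shooting CPU:   sees TLB_CPU_HALTED, CASes in TLB_INVAL_ALL and
      *                     drops that CPU from the cross call set
      *     idle CPU:       on wakeup, tlb_service() atomically clears the
      *                     word and, if TLB_INVAL_ALL was set, flushes its
      *                     entire TLB
      */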
1993 
1994 /*
1995  * Record that a CPU is going idle
1996  */
1997 void
1998 tlb_going_idle(void)
1999 {
2000         atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, TLB_CPU_HALTED);
2001 }
2002 
2003 /*
2004  * Service a delayed TLB flush when coming out of being idle.
2005  * This is called from the cpu idle notification with interrupts disabled.
2006  */
2007 void
2008 tlb_service(void)
2009 {
2010         ulong_t tlb_info;
2011         ulong_t found;
2012 
2013         /*
2014          * We only have to do something if coming out of being idle.
2015          */
2016         tlb_info = CPU->cpu_m.mcpu_tlb_info;
2017         if (tlb_info & TLB_CPU_HALTED) {
2018                 ASSERT(CPU->cpu_current_hat == kas.a_hat);
2019 
2020                 /*
2021                  * Atomic clear and fetch of old state.
2022                  */
2023                 while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) {
2024                         ASSERT(found & TLB_CPU_HALTED);
2025                         tlb_info = found;
2026                         SMT_PAUSE();
2027                 }
2028                 if (tlb_info & TLB_INVAL_ALL)
2029                         flush_all_tlb_entries();
2030         }
2031 }
2032 #endif /* !__xpv */
2033 
2034 /*
2035  * Internal routine to do cross calls to invalidate a range of pages on
2036  * all CPUs using a given hat.
2037  */
2038 void
2039 hat_tlb_inval_range(hat_t *hat, uintptr_t va, size_t len)
2040 {
2041         extern int      flushes_require_xcalls; /* from mp_startup.c */
2042         cpuset_t        justme;
2043         cpuset_t        cpus_to_shootdown;
2044 #ifndef __xpv
2045         cpuset_t        check_cpus;
2046         cpu_t           *cpup;
2047         int             c;
2048 #endif
2049 
2050         /*
2051          * If the hat is being destroyed, there are no more users, so
2052          * demap need not do anything.
2053          */
2054         if (hat->hat_flags & HAT_FREEING)
2055                 return;
2056 
2057         /*
2058          * If demapping from a shared pagetable, we best demap the
2059          * entire set of user TLBs, since we don't know what addresses
2060          * these were shared at.
2061          */
2062         if (hat->hat_flags & HAT_SHARED) {
2063                 hat = kas.a_hat;
2064                 va = DEMAP_ALL_ADDR;
2065         }
2066 
2067         /*
2068          * if not running with multiple CPUs, don't use cross calls
2069          */
2070         if (panicstr || !flushes_require_xcalls) {
2071 #ifdef __xpv
2072                 if (va == DEMAP_ALL_ADDR) {
2073                         xen_flush_tlb();
2074                 } else {
2075                         for (size_t i = 0; i < len; i += MMU_PAGESIZE)
2076                                 xen_flush_va((caddr_t)(va + i));
2077                 }
2078 #else
2079                 (void) hati_demap_func((xc_arg_t)hat,
2080                     (xc_arg_t)va, (xc_arg_t)len);
2081 #endif
2082                 return;
2083         }
2084 
2085 
2086         /*
2087          * Determine CPUs to shootdown. Kernel changes always do all CPUs.
2088          * Otherwise it's just CPUs currently executing in this hat.
2089          */
2090         kpreempt_disable();
2091         CPUSET_ONLY(justme, CPU->cpu_id);
2092         if (hat == kas.a_hat)
2093                 cpus_to_shootdown = khat_cpuset;
2094         else
2095                 cpus_to_shootdown = hat->hat_cpus;
2096 
2097 #ifndef __xpv
2098         /*
2099          * If any CPUs in the set are idle, just request a delayed flush
2100          * and avoid waking them up.
2101          */
2102         check_cpus = cpus_to_shootdown;
2103         for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) {
2104                 ulong_t tlb_info;
2105 
2106                 if (!CPU_IN_SET(check_cpus, c))
2107                         continue;
2108                 CPUSET_DEL(check_cpus, c);
2109                 cpup = cpu[c];
2110                 if (cpup == NULL)
2111                         continue;
2112 
2113                 tlb_info = cpup->cpu_m.mcpu_tlb_info;
2114                 while (tlb_info == TLB_CPU_HALTED) {
2115                         (void) CAS_TLB_INFO(cpup, TLB_CPU_HALTED,
2116                             TLB_CPU_HALTED | TLB_INVAL_ALL);
2117                         SMT_PAUSE();
2118                         tlb_info = cpup->cpu_m.mcpu_tlb_info;
2119                 }
2120                 if (tlb_info == (TLB_CPU_HALTED | TLB_INVAL_ALL)) {
2121                         HATSTAT_INC(hs_tlb_inval_delayed);
2122                         CPUSET_DEL(cpus_to_shootdown, c);
2123                 }
2124         }
2125 #endif
2126 
2127         if (CPUSET_ISNULL(cpus_to_shootdown) ||
2128             CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {
2129 
2130 #ifdef __xpv
2131                 if (va == DEMAP_ALL_ADDR) {
2132                         xen_flush_tlb();
2133                 } else {
2134                         for (size_t i = 0; i < len; i += MMU_PAGESIZE)
2135                                 xen_flush_va((caddr_t)(va + i));
2136                 }
2137 #else
2138                 (void) hati_demap_func((xc_arg_t)hat,
2139                     (xc_arg_t)va, (xc_arg_t)len);
2140 #endif
2141 
2142         } else {
2143 
2144                 CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id);
2145 #ifdef __xpv
2146                 if (va == DEMAP_ALL_ADDR) {
2147                         xen_gflush_tlb(cpus_to_shootdown);
2148                 } else {
2149                         for (size_t i = 0; i < len; i += MMU_PAGESIZE) {
2150                                 xen_gflush_va((caddr_t)(va + i),
2151                                     cpus_to_shootdown);
2152                         }
2153                 }
2154 #else
2155                 xc_call((xc_arg_t)hat, (xc_arg_t)va, (xc_arg_t)len,
2156                     CPUSET2BV(cpus_to_shootdown), hati_demap_func);
2157 #endif
2158 
2159         }
2160         kpreempt_enable();
2161 }
2162 
2163 void
2164 hat_tlb_inval(hat_t *hat, uintptr_t va)
2165 {
2166         hat_tlb_inval_range(hat, va, MMU_PAGESIZE);
2167 }
2168 
2169 /*
2170  * Interior routine for HAT_UNLOADs from hat_unload_callback(),
2171  * hat_kmap_unload() OR from hat_steal() code.  This routine doesn't
2172  * handle releasing of the htables.
2173  */
2174 void
2175 hat_pte_unmap(
2176         htable_t        *ht,
2177         uint_t          entry,
2178         uint_t          flags,
2179         x86pte_t        old_pte,
2180         void            *pte_ptr,
2181         boolean_t       tlb)
2182 {
2183         hat_t           *hat = ht->ht_hat;
2184         hment_t         *hm = NULL;
2185         page_t          *pp = NULL;
2186         level_t         l = ht->ht_level;
2187         pfn_t           pfn;
2188 
2189         /*
2190          * We always track the locking counts, even if nothing is unmapped
2191          */
2192         if ((flags & HAT_UNLOAD_UNLOCK) != 0 && hat != kas.a_hat) {
2193                 ASSERT(ht->ht_lock_cnt > 0);
2194                 HTABLE_LOCK_DEC(ht);
2195         }
2196 
2197         /*
2198          * Figure out which page's mapping list lock to acquire using the PFN
2199          * passed in the "old" PTE. We then attempt to invalidate the PTE.
2200          * If another thread, probably a hat_pageunload(), has asynchronously
2201          * unmapped/remapped this address, we'll loop here.
2202          */
2203         ASSERT(ht->ht_busy > 0);
2204         while (PTE_ISVALID(old_pte)) {
2205                 pfn = PTE2PFN(old_pte, l);
2206                 if (PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST) {
2207                         pp = NULL;
2208                 } else {
2209 #ifdef __xpv
2210                         if (pfn == PFN_INVALID)
2211                                 panic("Invalid PFN, but not PT_NOCONSIST");
2212 #endif
2213                         pp = page_numtopp_nolock(pfn);
2214                         if (pp == NULL) {
2215                                 panic("no page_t, not NOCONSIST: old_pte="
2216                                     FMT_PTE " ht=%lx entry=0x%x pte_ptr=%lx",
2217                                     old_pte, (uintptr_t)ht, entry,
2218                                     (uintptr_t)pte_ptr);
2219                         }
2220                         x86_hm_enter(pp);
2221                 }
2222 
2223                 old_pte = x86pte_inval(ht, entry, old_pte, pte_ptr, tlb);
2224 
2225                 /*
2226                  * If the page hadn't changed, we've unmapped it and can proceed
2227                  */
2228                 if (PTE_ISVALID(old_pte) && PTE2PFN(old_pte, l) == pfn)
2229                         break;
2230 
2231                 /*
2232                  * Otherwise, we'll have to retry with the current old_pte.
2233                  * Drop the hment lock, since the pfn may have changed.
2234                  */
2235                 if (pp != NULL) {
2236                         x86_hm_exit(pp);
2237                         pp = NULL;
2238                 } else {
2239                         ASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST);
2240                 }
2241         }
2242 
2243         /*
2244          * If the old mapping wasn't valid, there's nothing more to do
2245          */
2246         if (!PTE_ISVALID(old_pte)) {
2247                 if (pp != NULL)
2248                         x86_hm_exit(pp);
2249                 return;
2250         }
2251 
2252         /*
2253          * Take care of syncing any MOD/REF bits and removing the hment.
2254          */
2255         if (pp != NULL) {
2256                 if (!(flags & HAT_UNLOAD_NOSYNC))
2257                         hati_sync_pte_to_page(pp, old_pte, l);
2258                 hm = hment_remove(pp, ht, entry);
2259                 x86_hm_exit(pp);
2260                 if (hm != NULL)
2261                         hment_free(hm);
2262         }
2263 
2264         /*
2265          * Handle bookkeeping in the htable and hat
2266          */
2267         ASSERT(ht->ht_valid_cnt > 0);
2268         HTABLE_DEC(ht->ht_valid_cnt);
2269         PGCNT_DEC(hat, l);
2270 }
2271 
2272 /*
2273  * very cheap unload implementation to special case some kernel addresses
2274  */
2275 static void
2276 hat_kmap_unload(caddr_t addr, size_t len, uint_t flags)
2277 {
2278         uintptr_t       va = (uintptr_t)addr;
2279         uintptr_t       eva = va + len;
2280         pgcnt_t         pg_index;
2281         htable_t        *ht;
2282         uint_t          entry;
2283         x86pte_t        *pte_ptr;
2284         x86pte_t        old_pte;
2285 
2286         for (; va < eva; va += MMU_PAGESIZE) {
2287                 /*
2288                  * Get the PTE
2289                  */
2290                 pg_index = mmu_btop(va - mmu.kmap_addr);
2291                 pte_ptr = PT_INDEX_PTR(mmu.kmap_ptes, pg_index);
2292                 old_pte = GET_PTE(pte_ptr);
2293 
2294                 /*
2295                  * get the htable / entry
2296                  */
2297                 ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr)
2298                     >> LEVEL_SHIFT(1)];
2299                 entry = htable_va2entry(va, ht);
2300 
2301                 /*
2302                  * use mostly common code to unmap it.
2303                  */
2304                 hat_pte_unmap(ht, entry, flags, old_pte, pte_ptr, B_TRUE);
2305         }
2306 }
2307 
2308 
2309 /*
2310  * unload a range of virtual address space (no callback)
2311  */
2312 void
2313 hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2314 {
2315         uintptr_t va = (uintptr_t)addr;
2316 
2317         XPV_DISALLOW_MIGRATE();
2318         ASSERT(hat == kas.a_hat || va + len <= _userlimit);
2319 
2320         /*
2321          * special case for performance.
2322          */
2323         if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
2324                 ASSERT(hat == kas.a_hat);
2325                 hat_kmap_unload(addr, len, flags);
2326         } else {
2327                 hat_unload_callback(hat, addr, len, flags, NULL);
2328         }
2329         XPV_ALLOW_MIGRATE();
2330 }
2331 
2332 /*
2333  * Do the callbacks for ranges being unloaded.
2334  */
2335 typedef struct range_info {
2336         uintptr_t       rng_va;
2337         ulong_t         rng_cnt;
2338         level_t         rng_level;
2339 } range_info_t;
2340 
2341 /*
2342  * Invalidate the TLB and perform the callback to the upper level VM system
2343  * for the specified ranges of contiguous pages.
2344  */
2345 static void
2346 handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, range_info_t *range)
2347 {
2348         while (cnt > 0) {
2349                 size_t len;
2350 
2351                 --cnt;
2352                 len = range[cnt].rng_cnt << LEVEL_SHIFT(range[cnt].rng_level);
2353                 hat_tlb_inval_range(hat, (uintptr_t)range[cnt].rng_va, len);
2354 
2355                 if (cb != NULL) {
2356                         cb->hcb_start_addr = (caddr_t)range[cnt].rng_va;
2357                         cb->hcb_end_addr = cb->hcb_start_addr;
2358                         cb->hcb_end_addr += len;
2359                         cb->hcb_function(cb);
2360                 }
2361         }
2362 }
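
     /*
      * For example (illustrative, assuming 2MB level 1 pages): a range_info_t
      * with rng_cnt == 3 and rng_level == 1 describes three contiguous 2MB
      * mappings, so len above is 3 << LEVEL_SHIFT(1) == 6MB and a single
      * hat_tlb_inval_range()/callback pair covers all of it.
      */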
2363 
2364 /*
2365  * Unload a given range of addresses (has optional callback)
2366  *
2367  * Flags:
2368  * define       HAT_UNLOAD              0x00
2369  * define       HAT_UNLOAD_NOSYNC       0x02
2370  * define       HAT_UNLOAD_UNLOCK       0x04
2371  * define       HAT_UNLOAD_OTHER        0x08 - not used
2372  * define       HAT_UNLOAD_UNMAP        0x10 - same as HAT_UNLOAD
2373  */
2374 #define MAX_UNLOAD_CNT (8)
2375 void
2376 hat_unload_callback(
2377         hat_t           *hat,
2378         caddr_t         addr,
2379         size_t          len,
2380         uint_t          flags,
2381         hat_callback_t  *cb)
2382 {
2383         uintptr_t       vaddr = (uintptr_t)addr;
2384         uintptr_t       eaddr = vaddr + len;
2385         htable_t        *ht = NULL;
2386         uint_t          entry;
2387         uintptr_t       contig_va = (uintptr_t)-1L;
2388         range_info_t    r[MAX_UNLOAD_CNT];
2389         uint_t          r_cnt = 0;
2390         x86pte_t        old_pte;
2391 
2392         XPV_DISALLOW_MIGRATE();
2393         ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
2394         ASSERT(IS_PAGEALIGNED(vaddr));
2395         ASSERT(IS_PAGEALIGNED(eaddr));
2396 
2397         /*
2398          * Special case a single page being unloaded for speed. This happens
2399          * quite frequently, e.g. COW faults after a fork().
2400          */
2401         if (cb == NULL && len == MMU_PAGESIZE) {
2402                 ht = htable_getpte(hat, vaddr, &entry, &old_pte, 0);
2403                 if (ht != NULL) {
2404                         if (PTE_ISVALID(old_pte)) {
2405                                 hat_pte_unmap(ht, entry, flags, old_pte,
2406                                     NULL, B_TRUE);
2407                         }
2408                         htable_release(ht);
2409                 }
2410                 XPV_ALLOW_MIGRATE();
2411                 return;
2412         }
2413 
2414         while (vaddr < eaddr) {
2415                 old_pte = htable_walk(hat, &ht, &vaddr, eaddr);
2416                 if (ht == NULL)
2417                         break;
2418 
2419                 ASSERT(!IN_VA_HOLE(vaddr));
2420 
2421                 if (vaddr < (uintptr_t)addr)
2422                         panic("hat_unload_callback(): unmap inside large page");
2423 
2424                 /*
2425                  * We'll do the callbacks for contiguous ranges
2426                  */
2427                 if (vaddr != contig_va ||
2428                     (r_cnt > 0 && r[r_cnt - 1].rng_level != ht->ht_level)) {
2429                         if (r_cnt == MAX_UNLOAD_CNT) {
2430                                 handle_ranges(hat, cb, r_cnt, r);
2431                                 r_cnt = 0;
2432                         }
2433                         r[r_cnt].rng_va = vaddr;
2434                         r[r_cnt].rng_cnt = 0;
2435                         r[r_cnt].rng_level = ht->ht_level;
2436                         ++r_cnt;
2437                 }
2438 
2439                 /*
2440                  * Unload one mapping (for a single page) from the page tables.
2441                  * Note that we do not remove the mapping from the TLB yet,
2442                  * as indicated by the tlb=B_FALSE argument to hat_pte_unmap().
2443                  * handle_ranges() will clear the TLB entries with one call to
2444                  * hat_tlb_inval_range() per contiguous range.  This is
2445                  * safe because the page cannot be reused until the
2446                  * callback is made (or we return).
2447                  */
2448                 entry = htable_va2entry(vaddr, ht);
2449                 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE);
2450                 ASSERT(ht->ht_level <= mmu.max_page_level);
2451                 vaddr += LEVEL_SIZE(ht->ht_level);
2452                 contig_va = vaddr;
2453                 ++r[r_cnt - 1].rng_cnt;
2454         }
2455         if (ht)
2456                 htable_release(ht);
2457 
2458         /*
2459          * handle last range for callbacks
2460          */
2461         if (r_cnt > 0)
2462                 handle_ranges(hat, cb, r_cnt, r);
2463         XPV_ALLOW_MIGRATE();
2464 }
2465 
2466 /*
2467  * Flush the TLB for the local CPU.
2468  * Invoked from a slave CPU during panic() dumps.
2469  */
2470 void
2471 hat_flush(void)
2472 {
2473 #ifdef __xpv
2474         xen_flush_tlb();
2475 #else
2476         flush_all_tlb_entries();
2477 #endif
2478 }
2479 
2480 /*
2481  * synchronize mapping with software data structures
2482  *
2483  * This interface is currently only used by the working set monitor
2484  * driver.
2485  */
2486 /*ARGSUSED*/
2487 void
2488 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2489 {
2490         uintptr_t       vaddr = (uintptr_t)addr;
2491         uintptr_t       eaddr = vaddr + len;
2492         htable_t        *ht = NULL;
2493         uint_t          entry;
2494         x86pte_t        pte;
2495         x86pte_t        save_pte;
2496         x86pte_t        new;
2497         page_t          *pp;
2498 
2499         ASSERT(!IN_VA_HOLE(vaddr));
2500         ASSERT(IS_PAGEALIGNED(vaddr));
2501         ASSERT(IS_PAGEALIGNED(eaddr));
2502         ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
2503 
2504         XPV_DISALLOW_MIGRATE();
2505         for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) {
2506 try_again:
2507                 pte = htable_walk(hat, &ht, &vaddr, eaddr);
2508                 if (ht == NULL)
2509                         break;
2510                 entry = htable_va2entry(vaddr, ht);
2511 
2512                 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC ||
2513                     PTE_GET(pte, PT_REF | PT_MOD) == 0)
2514                         continue;
2515 
2516                 /*
2517                  * We need to acquire the mapping list lock to protect
2518                  * against hat_pageunload(), hat_unload(), etc.
2519                  */
2520                 pp = page_numtopp_nolock(PTE2PFN(pte, ht->ht_level));
2521                 if (pp == NULL)
2522                         break;
2523                 x86_hm_enter(pp);
2524                 save_pte = pte;
2525                 pte = x86pte_get(ht, entry);
2526                 if (pte != save_pte) {
2527                         x86_hm_exit(pp);
2528                         goto try_again;
2529                 }
2530                 if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC ||
2531                     PTE_GET(pte, PT_REF | PT_MOD) == 0) {
2532                         x86_hm_exit(pp);
2533                         continue;
2534                 }
2535 
2536                 /*
2537                  * Need to clear ref or mod bits. We may compete with
2538                  * hardware updating the R/M bits and have to try again.
2539                  */
2540                 if (flags == HAT_SYNC_ZERORM) {
2541                         new = pte;
2542                         PTE_CLR(new, PT_REF | PT_MOD);
2543                         pte = hati_update_pte(ht, entry, pte, new);
2544                         if (pte != 0) {
2545                                 x86_hm_exit(pp);
2546                                 goto try_again;
2547                         }
2548                 } else {
2549                         /*
2550                          * sync the PTE to the page_t
2551                          */
2552                         hati_sync_pte_to_page(pp, save_pte, ht->ht_level);
2553                 }
2554                 x86_hm_exit(pp);
2555         }
2556         if (ht)
2557                 htable_release(ht);
2558         XPV_ALLOW_MIGRATE();
2559 }
2560 
2561 /*
2562  * void hat_map(hat, addr, len, flags)
2563  */
2564 /*ARGSUSED*/
2565 void
2566 hat_map(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2567 {
2568         /* does nothing */
2569 }
2570 
2571 /*
2572  * uint_t hat_getattr(hat, addr, *attr)
2573  *      returns attr for <hat,addr> in *attr.  returns 0 if there was a
2574  *      mapping and *attr is valid, nonzero if there was no mapping and
2575  *      *attr is not valid.
2576  */
2577 uint_t
2578 hat_getattr(hat_t *hat, caddr_t addr, uint_t *attr)
2579 {
2580         uintptr_t       vaddr = ALIGN2PAGE(addr);
2581         htable_t        *ht = NULL;
2582         x86pte_t        pte;
2583 
2584         ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
2585 
2586         if (IN_VA_HOLE(vaddr))
2587                 return ((uint_t)-1);
2588 
2589         ht = htable_getpte(hat, vaddr, NULL, &pte, mmu.max_page_level);
2590         if (ht == NULL)
2591                 return ((uint_t)-1);
2592 
2593         if (!PTE_ISVALID(pte) || !PTE_ISPAGE(pte, ht->ht_level)) {
2594                 htable_release(ht);
2595                 return ((uint_t)-1);
2596         }
2597 
2598         *attr = PROT_READ;
2599         if (PTE_GET(pte, PT_WRITABLE))
2600                 *attr |= PROT_WRITE;
2601         if (PTE_GET(pte, PT_USER))
2602                 *attr |= PROT_USER;
2603         if (!PTE_GET(pte, mmu.pt_nx))
2604                 *attr |= PROT_EXEC;
2605         if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC)
2606                 *attr |= HAT_NOSYNC;
2607         htable_release(ht);
2608         return (0);
2609 }
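
     /*
      * Illustrative (hypothetical) use:
      *
      *     uint_t attr;
      *
      *     if (hat_getattr(as->a_hat, addr, &attr) == 0 && (attr & PROT_WRITE))
      *             ... addr is currently mapped writable ...
      *
      * Only PROT_READ/WRITE/EXEC, PROT_USER and HAT_NOSYNC are reported;
      * caching attributes are not.
      */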
2610 
2611 /*
2612  * hat_updateattr() applies the given attribute change to an existing mapping
2613  */
2614 #define HAT_LOAD_ATTR           1
2615 #define HAT_SET_ATTR            2
2616 #define HAT_CLR_ATTR            3
2617 
2618 static void
2619 hat_updateattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr, int what)
2620 {
2621         uintptr_t       vaddr = (uintptr_t)addr;
2622         uintptr_t       eaddr = (uintptr_t)addr + len;
2623         htable_t        *ht = NULL;
2624         uint_t          entry;
2625         x86pte_t        oldpte, newpte;
2626         page_t          *pp;
2627 
2628         XPV_DISALLOW_MIGRATE();
2629         ASSERT(IS_PAGEALIGNED(vaddr));
2630         ASSERT(IS_PAGEALIGNED(eaddr));
2631         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
2632         for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) {
2633 try_again:
2634                 oldpte = htable_walk(hat, &ht, &vaddr, eaddr);
2635                 if (ht == NULL)
2636                         break;
2637                 if (PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOCONSIST)
2638                         continue;
2639 
2640                 pp = page_numtopp_nolock(PTE2PFN(oldpte, ht->ht_level));
2641                 if (pp == NULL)
2642                         continue;
2643                 x86_hm_enter(pp);
2644 
2645                 newpte = oldpte;
2646                 /*
2647                  * We found a page table entry in the desired range;
2648                  * figure out the new attributes.
2649                  */
2650                 if (what == HAT_SET_ATTR || what == HAT_LOAD_ATTR) {
2651                         if ((attr & PROT_WRITE) &&
2652                             !PTE_GET(oldpte, PT_WRITABLE))
2653                                 newpte |= PT_WRITABLE;
2654 
2655                         if ((attr & HAT_NOSYNC) &&
2656                             PTE_GET(oldpte, PT_SOFTWARE) < PT_NOSYNC)
2657                                 newpte |= PT_NOSYNC;
2658 
2659                         if ((attr & PROT_EXEC) && PTE_GET(oldpte, mmu.pt_nx))
2660                                 newpte &= ~mmu.pt_nx;
2661                 }
2662 
2663                 if (what == HAT_LOAD_ATTR) {
2664                         if (!(attr & PROT_WRITE) &&
2665                             PTE_GET(oldpte, PT_WRITABLE))
2666                                 newpte &= ~PT_WRITABLE;
2667 
2668                         if (!(attr & HAT_NOSYNC) &&
2669                             PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC)
2670                                 newpte &= ~PT_SOFTWARE;
2671 
2672                         if (!(attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx))
2673                                 newpte |= mmu.pt_nx;
2674                 }
2675 
2676                 if (what == HAT_CLR_ATTR) {
2677                         if ((attr & PROT_WRITE) && PTE_GET(oldpte, PT_WRITABLE))
2678                                 newpte &= ~PT_WRITABLE;
2679 
2680                         if ((attr & HAT_NOSYNC) &&
2681                             PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC)
2682                                 newpte &= ~PT_SOFTWARE;
2683 
2684                         if ((attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx))
2685                                 newpte |= mmu.pt_nx;
2686                 }
2687 
2688                 /*
2689                  * Ensure NOSYNC/NOCONSIST mappings have REF and MOD set.
2690                  * x86pte_set() depends on this.
2691                  */
2692                 if (PTE_GET(newpte, PT_SOFTWARE) >= PT_NOSYNC)
2693                         newpte |= PT_REF | PT_MOD;
2694 
2695                 /*
2696                  * What about PROT_READ or others? This code only handles:
2697                  * EXEC, WRITE, NOSYNC.
2698                  */
2699 
2700                 /*
2701                  * If new PTE really changed, update the table.
2702                  */
2703                 if (newpte != oldpte) {
2704                         entry = htable_va2entry(vaddr, ht);
2705                         oldpte = hati_update_pte(ht, entry, oldpte, newpte);
2706                         if (oldpte != 0) {
2707                                 x86_hm_exit(pp);
2708                                 goto try_again;
2709                         }
2710                 }
2711                 x86_hm_exit(pp);
2712         }
2713         if (ht)
2714                 htable_release(ht);
2715         XPV_ALLOW_MIGRATE();
2716 }
2717 
2718 /*
2719  * Various wrappers for hat_updateattr()
2720  */
2721 void
2722 hat_setattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr)
2723 {
2724         ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
2725         hat_updateattr(hat, addr, len, attr, HAT_SET_ATTR);
2726 }
2727 
2728 void
2729 hat_clrattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr)
2730 {
2731         ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
2732         hat_updateattr(hat, addr, len, attr, HAT_CLR_ATTR);
2733 }
2734 
2735 void
2736 hat_chgattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr)
2737 {
2738         ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
2739         hat_updateattr(hat, addr, len, attr, HAT_LOAD_ATTR);
2740 }
2741 
2742 void
2743 hat_chgprot(hat_t *hat, caddr_t addr, size_t len, uint_t vprot)
2744 {
2745         ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
2746         hat_updateattr(hat, addr, len, vprot & HAT_PROT_MASK, HAT_LOAD_ATTR);
2747 }
2748 
2749 /*
2750  * ssize_t hat_getpagesize(hat, addr)
2751  *      returns pagesize in bytes for <hat, addr>. returns -1 if there is
2752  *      no mapping. This is an advisory call.
2753  */
2754 ssize_t
2755 hat_getpagesize(hat_t *hat, caddr_t addr)
2756 {
2757         uintptr_t       vaddr = ALIGN2PAGE(addr);
2758         htable_t        *ht;
2759         size_t          pagesize;
2760 
2761         ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
2762         if (IN_VA_HOLE(vaddr))
2763                 return (-1);
2764         ht = htable_getpage(hat, vaddr, NULL);
2765         if (ht == NULL)
2766                 return (-1);
2767         pagesize = LEVEL_SIZE(ht->ht_level);
2768         htable_release(ht);
2769         return (pagesize);
2770 }
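
     /*
      * For example (illustrative): hat_getpagesize(as->a_hat, addr) returns
      * 4096 for a level 0 mapping, LEVEL_SIZE(1) (2MB on amd64) for a large
      * page, and -1 if nothing is mapped at addr.  Being advisory, the answer
      * can be stale as soon as it is returned.
      */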
2771 
2772 
2773 
2774 /*
2775  * pfn_t hat_getpfnum(hat, addr)
2776  *      returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid.
2777  */
2778 pfn_t
2779 hat_getpfnum(hat_t *hat, caddr_t addr)
2780 {
2781         uintptr_t       vaddr = ALIGN2PAGE(addr);
2782         htable_t        *ht;
2783         uint_t          entry;
2784         pfn_t           pfn = PFN_INVALID;
2785 
2786         ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
2787         if (khat_running == 0)
2788                 return (PFN_INVALID);
2789 
2790         if (IN_VA_HOLE(vaddr))
2791                 return (PFN_INVALID);
2792 
2793         XPV_DISALLOW_MIGRATE();
2794         /*
2795          * A very common use of hat_getpfnum() is from the DDI for kernel pages.
2796          * Use the kmap_ptes (which also covers the 32 bit heap) to speed
2797          * this up.
2798          */
2799         if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) {
2800                 x86pte_t pte;
2801                 pgcnt_t pg_index;
2802 
2803                 pg_index = mmu_btop(vaddr - mmu.kmap_addr);
2804                 pte = GET_PTE(PT_INDEX_PTR(mmu.kmap_ptes, pg_index));
2805                 if (PTE_ISVALID(pte))
2806                         /*LINTED [use of constant 0 causes a lint warning] */
2807                         pfn = PTE2PFN(pte, 0);
2808                 XPV_ALLOW_MIGRATE();
2809                 return (pfn);
2810         }
2811 
2812         ht = htable_getpage(hat, vaddr, &entry);
2813         if (ht == NULL) {
2814                 XPV_ALLOW_MIGRATE();
2815                 return (PFN_INVALID);
2816         }
2817         ASSERT(vaddr >= ht->ht_vaddr);
2818         ASSERT(vaddr <= HTABLE_LAST_PAGE(ht));
2819         pfn = PTE2PFN(x86pte_get(ht, entry), ht->ht_level);
2820         if (ht->ht_level > 0)
2821                 pfn += mmu_btop(vaddr & LEVEL_OFFSET(ht->ht_level));
2822         htable_release(ht);
2823         XPV_ALLOW_MIGRATE();
2824         return (pfn);
2825 }
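
     /*
      * The large page adjustment above, with illustrative numbers: for a 2MB
      * (level 1) mapping with base pfn P, a vaddr 0x3000 bytes into the page
      * gives
      *
      *     pfn = P + mmu_btop(vaddr & LEVEL_OFFSET(1)) = P + 3
      *
      * i.e. the pfn of the 4K subpage that actually covers vaddr.
      */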
2826 
2827 /*
2828  * int hat_probe(hat, addr)
2829  *      return 0 if no valid mapping is present.  A faster version
2830  *      of hat_getattr() on certain architectures.
2831  */
2832 int
2833 hat_probe(hat_t *hat, caddr_t addr)
2834 {
2835         uintptr_t       vaddr = ALIGN2PAGE(addr);
2836         uint_t          entry;
2837         htable_t        *ht;
2838         pgcnt_t         pg_off;
2839 
2840         ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
2841         ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
2842         if (IN_VA_HOLE(vaddr))
2843                 return (0);
2844 
2845         /*
2846          * Most common use of hat_probe is from segmap. We special case it
2847          * for performance.
2848          */
2849         if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) {
2850                 pg_off = mmu_btop(vaddr - mmu.kmap_addr);
2851                 if (mmu.pae_hat)
2852                         return (PTE_ISVALID(mmu.kmap_ptes[pg_off]));
2853                 else
2854                         return (PTE_ISVALID(
2855                             ((x86pte32_t *)mmu.kmap_ptes)[pg_off]));
2856         }
2857 
2858         ht = htable_getpage(hat, vaddr, &entry);
2859         htable_release(ht);
2860         return (ht != NULL);
2861 }
2862 
2863 /*
2864  * Find out if the segment for hat_share()/hat_unshare() is DISM or locked ISM.
2865  */
2866 static int
2867 is_it_dism(hat_t *hat, caddr_t va)
2868 {
2869         struct seg *seg;
2870         struct shm_data *shmd;
2871         struct spt_data *sptd;
2872 
2873         seg = as_findseg(hat->hat_as, va, 0);
2874         ASSERT(seg != NULL);
2875         ASSERT(seg->s_base <= va);
2876         shmd = (struct shm_data *)seg->s_data;
2877         ASSERT(shmd != NULL);
2878         sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2879         ASSERT(sptd != NULL);
2880         if (sptd->spt_flags & SHM_PAGEABLE)
2881                 return (1);
2882         return (0);
2883 }
2884 
2885 /*
2886  * Simple implementation of ISM. hat_share() is similar to hat_memload_array(),
2887  * except that we use the ism_hat's existing mappings to determine the pages
2888  * and protections to use for this hat. If we find a full properly aligned
2889  * and sized pagetable, we will attempt to share the pagetable itself.
2890  */
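     /*
      * Roughly, the loop below shares a pagetable outright only when all of
      * the following hold (otherwise it falls back to copying individual
      * mappings via hati_load_common()):
      *
      *     - the table is not the top level table
      *     - for DISM, only level 0 tables are candidates
      *     - the ISM address is the table's base address and the local
      *       address is aligned on a LEVEL_SIZE(l + 1) boundary
      *     - at least LEVEL_SIZE(l + 1) of the ISM range remains
      *     - every entry in the ISM table is a leaf PTE
      */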
2891 /*ARGSUSED*/
2892 int
2893 hat_share(
2894         hat_t           *hat,
2895         caddr_t         addr,
2896         hat_t           *ism_hat,
2897         caddr_t         src_addr,
2898         size_t          len,    /* almost useless value, see below.. */
2899         uint_t          ismszc)
2900 {
2901         uintptr_t       vaddr_start = (uintptr_t)addr;
2902         uintptr_t       vaddr;
2903         uintptr_t       eaddr = vaddr_start + len;
2904         uintptr_t       ism_addr_start = (uintptr_t)src_addr;
2905         uintptr_t       ism_addr = ism_addr_start;
2906         uintptr_t       e_ism_addr = ism_addr + len;
2907         htable_t        *ism_ht = NULL;
2908         htable_t        *ht;
2909         x86pte_t        pte;
2910         page_t          *pp;
2911         pfn_t           pfn;
2912         level_t         l;
2913         pgcnt_t         pgcnt;
2914         uint_t          prot;
2915         int             is_dism;
2916         int             flags;
2917 
2918         /*
2919          * We might be asked to share an empty DISM hat by as_dup()
2920          */
2921         ASSERT(hat != kas.a_hat);
2922         ASSERT(eaddr <= _userlimit);
2923         if (!(ism_hat->hat_flags & HAT_SHARED)) {
2924                 ASSERT(hat_get_mapped_size(ism_hat) == 0);
2925                 return (0);
2926         }
2927         XPV_DISALLOW_MIGRATE();
2928 
2929         /*
2930          * The SPT segment driver often passes us a size larger than there are
2931          * valid mappings. That's because it rounds the segment size up to a
2932          * large pagesize, even if the actual memory mapped by ism_hat is less.
2933          */
2934         ASSERT(IS_PAGEALIGNED(vaddr_start));
2935         ASSERT(IS_PAGEALIGNED(ism_addr_start));
2936         ASSERT(ism_hat->hat_flags & HAT_SHARED);
2937         is_dism = is_it_dism(hat, addr);
2938         while (ism_addr < e_ism_addr) {
2939                 /*
2940                  * use htable_walk to get the next valid ISM mapping
2941                  */
2942                 pte = htable_walk(ism_hat, &ism_ht, &ism_addr, e_ism_addr);
2943                 if (ism_ht == NULL)
2944                         break;
2945 
2946                 /*
2947                  * First check to see if we already share the page table.
2948                  */
2949                 l = ism_ht->ht_level;
2950                 vaddr = vaddr_start + (ism_addr - ism_addr_start);
2951                 ht = htable_lookup(hat, vaddr, l);
2952                 if (ht != NULL) {
2953                         if (ht->ht_flags & HTABLE_SHARED_PFN)
2954                                 goto shared;
2955                         htable_release(ht);
2956                         goto not_shared;
2957                 }
2958 
2959                 /*
2960                  * Can't ever share the top level table.
2961                  */
2962                 if (l == mmu.max_level)
2963                         goto not_shared;
2964 
2965                 /*
2966                  * Avoid level mismatches later due to DISM faults.
2967                  */
2968                 if (is_dism && l > 0)
2969                         goto not_shared;
2970 
2971                 /*
2972                  * addresses and lengths must align
2973                  * table must be fully populated
2974                  * no lower level page tables
2975                  */
2976                 if (ism_addr != ism_ht->ht_vaddr ||
2977                     (vaddr & LEVEL_OFFSET(l + 1)) != 0)
2978                         goto not_shared;
2979 
2980                 /*
2981                  * The range of address space must cover a full table.
2982                  */
2983                 if (e_ism_addr - ism_addr < LEVEL_SIZE(l + 1))
2984                         goto not_shared;
2985 
2986                 /*
2987                  * All entries in the ISM page table must be leaf PTEs.
2988                  */
2989                 if (l > 0) {
2990                         int e;
2991 
2992                         /*
2993                          * We know the 0th is from htable_walk() above.
2994                          */
2995                         for (e = 1; e < HTABLE_NUM_PTES(ism_ht); ++e) {
2996                                 x86pte_t pte;
2997                                 pte = x86pte_get(ism_ht, e);
2998                                 if (!PTE_ISPAGE(pte, l))
2999                                         goto not_shared;
3000                         }
3001                 }
3002 
3003                 /*
3004                  * share the page table
3005                  */
3006                 ht = htable_create(hat, vaddr, l, ism_ht);
3007 shared:
3008                 ASSERT(ht->ht_flags & HTABLE_SHARED_PFN);
3009                 ASSERT(ht->ht_shares == ism_ht);
3010                 hat->hat_ism_pgcnt +=
3011                     (ism_ht->ht_valid_cnt - ht->ht_valid_cnt) <<
3012                     (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT);
3013                 ht->ht_valid_cnt = ism_ht->ht_valid_cnt;
3014                 htable_release(ht);
3015                 ism_addr = ism_ht->ht_vaddr + LEVEL_SIZE(l + 1);
3016                 htable_release(ism_ht);
3017                 ism_ht = NULL;
3018                 continue;
3019 
3020 not_shared:
3021                 /*
3022                  * Unable to share the page table. Instead we will
3023                  * create new mappings from the values in the ISM mappings.
3024                  * Figure out what size (level) of mappings to use.
3025                  */
3026                 for (l = ism_ht->ht_level; l > 0; --l) {
3027                         if (LEVEL_SIZE(l) <= eaddr - vaddr &&
3028                             (vaddr & LEVEL_OFFSET(l)) == 0)
3029                                 break;
3030                 }
3031 
3032                 /*
3033                  * The ISM mapping might be larger than the share area,
3034                  * so be careful to truncate it if needed.
3035                  */
3036                 if (eaddr - vaddr >= LEVEL_SIZE(ism_ht->ht_level)) {
3037                         pgcnt = mmu_btop(LEVEL_SIZE(ism_ht->ht_level));
3038                 } else {
3039                         pgcnt = mmu_btop(eaddr - vaddr);
3040                         l = 0;
3041                 }
3042 
3043                 pfn = PTE2PFN(pte, ism_ht->ht_level);
3044                 ASSERT(pfn != PFN_INVALID);
3045                 while (pgcnt > 0) {
3046                         /*
3047                          * Make a new pte for the PFN for this level.
3048                          * Copy protections for the pte from the ISM pte.
3049                          */
3050                         pp = page_numtopp_nolock(pfn);
3051                         ASSERT(pp != NULL);
3052 
3053                         prot = PROT_USER | PROT_READ | HAT_UNORDERED_OK;
3054                         if (PTE_GET(pte, PT_WRITABLE))
3055                                 prot |= PROT_WRITE;
3056                         if (!PTE_GET(pte, PT_NX))
3057                                 prot |= PROT_EXEC;
3058 
3059                         flags = HAT_LOAD;
3060                         if (!is_dism)
3061                                 flags |= HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST;
3062                         while (hati_load_common(hat, vaddr, pp, prot, flags,
3063                             l, pfn) != 0) {
3064                                 if (l == 0)
3065                                         panic("hati_load_common() failure");
3066                                 --l;
3067                         }
3068 
3069                         vaddr += LEVEL_SIZE(l);
3070                         ism_addr += LEVEL_SIZE(l);
3071                         pfn += mmu_btop(LEVEL_SIZE(l));
3072                         pgcnt -= mmu_btop(LEVEL_SIZE(l));
3073                 }
3074         }
3075         if (ism_ht != NULL)
3076                 htable_release(ism_ht);
3077         XPV_ALLOW_MIGRATE();
3078         return (0);
3079 }
3080 
3081 
3082 /*
3083  * hat_unshare() is similar to hat_unload_callback(), but
3084  * we have to look for empty shared pagetables. Note that
3085  * hat_unshare() is always invoked against an entire segment.
3086  */
3087 /*ARGSUSED*/
3088 void
3089 hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
3090 {
3091         uint64_t        vaddr = (uintptr_t)addr;
3092         uintptr_t       eaddr = vaddr + len;
3093         htable_t        *ht = NULL;
3094         uint_t          need_demaps = 0;
3095         int             flags = HAT_UNLOAD_UNMAP;
3096         level_t         l;
3097 
3098         ASSERT(hat != kas.a_hat);
3099         ASSERT(eaddr <= _userlimit);
3100         ASSERT(IS_PAGEALIGNED(vaddr));
3101         ASSERT(IS_PAGEALIGNED(eaddr));
3102         XPV_DISALLOW_MIGRATE();
3103 
3104         /*
3105          * First go through and remove any shared pagetables.
3106          *
3107          * Note that it's ok to delay the TLB shootdown till the entire range is
3108          * finished, because if hat_pageunload() were to unload a shared
3109          * pagetable page, its hat_tlb_inval() will do a global TLB invalidate.
3110          */
3111         l = mmu.max_page_level;
3112         if (l == mmu.max_level)
3113                 --l;
3114         for (; l >= 0; --l) {
3115                 for (vaddr = (uintptr_t)addr; vaddr < eaddr;
3116                     vaddr = (vaddr & LEVEL_MASK(l + 1)) + LEVEL_SIZE(l + 1)) {
3117                         ASSERT(!IN_VA_HOLE(vaddr));
3118                         /*
3119                          * find a pagetable that maps the current address
3120                          */
3121                         ht = htable_lookup(hat, vaddr, l);
3122                         if (ht == NULL)
3123                                 continue;
3124                         if (ht->ht_flags & HTABLE_SHARED_PFN) {
3125                                 /*
3126                                  * clear page count, set valid_cnt to 0,
3127                                  * let htable_release() finish the job
3128                                  */
3129                                 hat->hat_ism_pgcnt -= ht->ht_valid_cnt <<
3130                                     (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT);
3131                                 ht->ht_valid_cnt = 0;
3132                                 need_demaps = 1;
3133                         }
3134                         htable_release(ht);
3135                 }
3136         }
3137 
3138         /*
3139          * flush the TLBs - since we're probably dealing with MANY mappings
3140          * we do just one CR3 reload.
3141          */
3142         if (!(hat->hat_flags & HAT_FREEING) && need_demaps)
3143                 hat_tlb_inval(hat, DEMAP_ALL_ADDR);
3144 
3145         /*
3146          * Now go back and clean up any unaligned mappings that
3147          * couldn't share pagetables.
3148          */
3149         if (!is_it_dism(hat, addr))
3150                 flags |= HAT_UNLOAD_UNLOCK;
3151         hat_unload(hat, addr, len, flags);
3152         XPV_ALLOW_MIGRATE();
3153 }
3154 
3155 
3156 /*
3157  * hat_reserve() does nothing
3158  */
3159 /*ARGSUSED*/
3160 void
3161 hat_reserve(struct as *as, caddr_t addr, size_t len)
3162 {
3163 }
3164 
3165 
3166 /*
3167  * Called when all mappings to a page should have write permission removed.
3168  * Mostly stolen from hat_pagesync()
3169  */
3170 static void
3171 hati_page_clrwrt(struct page *pp)
3172 {
3173         hment_t         *hm = NULL;
3174         htable_t        *ht;
3175         uint_t          entry;
3176         x86pte_t        old;
3177         x86pte_t        new;
3178         uint_t          pszc = 0;
3179 
3180         XPV_DISALLOW_MIGRATE();
3181 next_size:
3182         /*
3183          * walk thru the mapping list clearing write permission
3184          */
3185         x86_hm_enter(pp);
3186         while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) {
3187                 if (ht->ht_level < pszc)
3188                         continue;
3189                 old = x86pte_get(ht, entry);
3190 
3191                 for (;;) {
3192                         /*
3193                          * Is this mapping of interest?
3194                          */
3195                         if (PTE2PFN(old, ht->ht_level) != pp->p_pagenum ||
3196                             PTE_GET(old, PT_WRITABLE) == 0)
3197                                 break;
3198 
3199                         /*
3200                          * Clear ref/mod writable bits. This requires cross
3201                          * calls to ensure any executing TLBs see cleared bits.
3202                          */
3203                         new = old;
3204                         PTE_CLR(new, PT_REF | PT_MOD | PT_WRITABLE);
3205                         old = hati_update_pte(ht, entry, old, new);
3206                         if (old != 0)
3207                                 continue;
3208 
3209                         break;
3210                 }
3211         }
3212         x86_hm_exit(pp);
3213         while (pszc < pp->p_szc) {
3214                 page_t *tpp;
3215                 pszc++;
3216                 tpp = PP_GROUPLEADER(pp, pszc);
3217                 if (pp != tpp) {
3218                         pp = tpp;
3219                         goto next_size;
3220                 }
3221         }
3222         XPV_ALLOW_MIGRATE();
3223 }
3224 
3225 /*
3226  * void hat_page_setattr(pp, flag)
3227  * void hat_page_clrattr(pp, flag)
3228  *      used to set/clr ref/mod bits.
3229  */
3230 void
3231 hat_page_setattr(struct page *pp, uint_t flag)
3232 {
3233         vnode_t         *vp = pp->p_vnode;
3234         kmutex_t        *vphm = NULL;
3235         page_t          **listp;
3236         int             noshuffle;
3237 
3238         noshuffle = flag & P_NSH;
3239         flag &= ~P_NSH;
3240 
3241         if (PP_GETRM(pp, flag) == flag)
3242                 return;
3243 
3244         if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
3245             !noshuffle) {
3246                 vphm = page_vnode_mutex(vp);
3247                 mutex_enter(vphm);
3248         }
3249 
3250         PP_SETRM(pp, flag);
3251 
3252         if (vphm != NULL) {
3253 
3254                 /*
3255                  * Some file systems examine v_pages for NULL without
3256                  * grabbing the vphm mutex. Must not let it become NULL when
3257                  * pp is the only page on the list.
3258                  */
3259                 if (pp->p_vpnext != pp) {
3260                         page_vpsub(&vp->v_pages, pp);
3261                         if (vp->v_pages != NULL)
3262                                 listp = &vp->v_pages->p_vpprev->p_vpnext;
3263                         else
3264                                 listp = &vp->v_pages;
3265                         page_vpadd(listp, pp);
3266                 }
3267                 mutex_exit(vphm);
3268         }
3269 }
3270 
3271 void
3272 hat_page_clrattr(struct page *pp, uint_t flag)
3273 {
3274         vnode_t         *vp = pp->p_vnode;
3275         ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
3276 
3277         /*
3278          * Caller is expected to hold page's io lock for VMODSORT to work
3279          * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
3280          * bit is cleared.
3281          * We don't assert this, to avoid tripping some existing third-party
3282          * code. The dirty page is moved back to the top of the v_pages list
3283          * after I/O is done in pvn_write_done().
3284          */
3285         PP_CLRRM(pp, flag);
3286 
3287         if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
3288 
3289                 /*
3290                  * VMODSORT works by removing write permissions and getting
3291                  * a fault when a page is made dirty. At this point
3292                  * we need to remove write permission from all mappings
3293                  * to this page.
3294                  */
3295                 hati_page_clrwrt(pp);
3296         }
3297 }
3298 
3299 /*
3300  *      If flag is specified, returns 0 if attribute is disabled
3301  *      and nonzero if enabled.  If flag specifies multiple attributes
3302  *      then returns 0 if ALL attributes are disabled.  This is an advisory
3303  *      call.
3304  */
3305 uint_t
3306 hat_page_getattr(struct page *pp, uint_t flag)
3307 {
3308         return (PP_GETRM(pp, flag));
3309 }
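
     /*
      * Usage sketch (the caller logic shown is hypothetical): because this is
      * advisory and only reads the ref/mod bits already cached in the page_t,
      * a caller that needs up-to-date hardware state should hat_pagesync()
      * the page first and then test the bits, e.g.:
      *
      *	if (hat_page_getattr(pp, P_MOD | P_REF) & P_MOD)
      *		...treat pp as dirty...
      */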
3310 
3311 /*
3312  * common code used by hat_pageunload() and hment_steal()
3313  */
3314 hment_t *
3315 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
3316 {
3317         x86pte_t old_pte;
3318         pfn_t pfn = pp->p_pagenum;
3319         hment_t *hm;
3320         hat_t *hat = ht->ht_hat;
3321 
3322         /*
3323          * There is a race between this function and the freeing of a HAT
3324          * whose owning process is exiting; process exit code ignores htable
3325          * reference counts.
3326          * If the HAT is already freeing (HAT_FREEING) no-op this function.
3327          * Otherwise increment hat_unmaps to block the hat from being free'd
3328          * until this function completes.
3329          */
3330         mutex_enter(&hat_list_lock);
3331         if (hat->hat_flags & HAT_FREEING) {
3332                 mutex_exit(&hat_list_lock);
3333                 x86_hm_exit(pp);
3334                 return (NULL);
3335         }
3336         ++(hat->hat_unmaps);
3337         mutex_exit(&hat_list_lock);
3338 
3339         /*
3340          * We need to acquire a hold on the htable in order to
3341          * do the invalidate. We know the htable must exist, since
3342          * unmaps don't release the htable until after removing any
3343          * hment. Having x86_hm_enter() keeps that from proceeding.
3344          */
3345         htable_acquire(ht);
3346 
3347         /*
3348          * Invalidate the PTE and remove the hment.
3349          */
3350         old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE);
3351         if (PTE2PFN(old_pte, ht->ht_level) != pfn) {
3352                 panic("x86pte_inval() failure found PTE = " FMT_PTE
3353                     " pfn being unmapped is %lx ht=0x%lx entry=0x%x",
3354                     old_pte, pfn, (uintptr_t)ht, entry);
3355         }
3356 
3357         /*
3358          * Clean up all the htable information for this mapping
3359          */
3360         ASSERT(ht->ht_valid_cnt > 0);
3361         HTABLE_DEC(ht->ht_valid_cnt);
3362         PGCNT_DEC(ht->ht_hat, ht->ht_level);
3363 
3364         /*
3365          * sync ref/mod bits to the page_t
3366          */
3367         if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC)
3368                 hati_sync_pte_to_page(pp, old_pte, ht->ht_level);
3369 
3370         /*
3371          * Remove the mapping list entry for this page.
3372          */
3373         hm = hment_remove(pp, ht, entry);
3374 
3375         /*
3376          * drop the mapping list lock so that we might free the hment and htable
3377          */
3378         x86_hm_exit(pp);
3379         htable_release(ht);
3380 
3381         mutex_enter(&hat_list_lock);
3382         --(hat->hat_unmaps);
3383         cv_broadcast(&hat_list_cv);
3384         mutex_exit(&hat_list_lock);
3385         return (hm);
3386 }
3387 
3388 extern int      vpm_enable;
3389 /*
3390  * Unload all translations to a page. If the page is a subpage of a large
3391  * page, the large page mappings are also removed.
3392  *
3393  * The forceflag is unused.
3394  */
3395 
3396 /*ARGSUSED*/
3397 static int
3398 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
3399 {
3400         page_t          *cur_pp = pp;
3401         hment_t         *hm;
3402         hment_t         *prev;
3403         htable_t        *ht;
3404         uint_t          entry;
3405         level_t         level;
3406 
3407         XPV_DISALLOW_MIGRATE();
3408 
3409         /*
3410          * prevent recursion due to kmem_free()
3411          */
3412         ++curthread->t_hatdepth;
3413         ASSERT(curthread->t_hatdepth < 16);
3414 
3415 #if defined(__amd64)
3416         /*
3417          * clear the vpm ref.
3418          */
3419         if (vpm_enable) {
3420                 pp->p_vpmref = 0;
3421         }
3422 #endif
3423         /*
3424          * The loop with next_size handles pages with multiple pagesize mappings
3425          */
3426 next_size:
3427         for (;;) {
3428 
3429                 /*
3430                  * Get a mapping list entry
3431                  */
3432                 x86_hm_enter(cur_pp);
3433                 for (prev = NULL; ; prev = hm) {
3434                         hm = hment_walk(cur_pp, &ht, &entry, prev);
3435                         if (hm == NULL) {
3436                                 x86_hm_exit(cur_pp);
3437 
3438                                 /*
3439                                  * If not part of a larger page, we're done.
3440                                  */
3441                                 if (cur_pp->p_szc <= pg_szcd) {
3442                                         ASSERT(curthread->t_hatdepth > 0);
3443                                         --curthread->t_hatdepth;
3444                                         XPV_ALLOW_MIGRATE();
3445                                         return (0);
3446                                 }
3447 
3448                                 /*
3449                                  * Else check the next larger page size.
3450                                  * hat_page_demote() may decrease p_szc
3451                                  * but that's OK; we'll just take an extra
3452                                  * trip, discover there are no larger mappings,
3453                                  * and return.
3454                                  */
3455                                 ++pg_szcd;
3456                                 cur_pp = PP_GROUPLEADER(cur_pp, pg_szcd);
3457                                 goto next_size;
3458                         }
3459 
3460                         /*
3461                          * If this mapping size matches, remove it.
3462                          */
3463                         level = ht->ht_level;
3464                         if (level == pg_szcd)
3465                                 break;
3466                 }
3467 
3468                 /*
3469                  * Remove the mapping list entry for this page.
3470                  * Note this does the x86_hm_exit() for us.
3471                  */
3472                 hm = hati_page_unmap(cur_pp, ht, entry);
3473                 if (hm != NULL)
3474                         hment_free(hm);
3475         }
3476 }
3477 
3478 int
3479 hat_pageunload(struct page *pp, uint_t forceflag)
3480 {
3481         ASSERT(PAGE_EXCL(pp));
3482         return (hati_pageunload(pp, 0, forceflag));
3483 }
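
     /*
      * Minimal usage sketch: a caller that holds pp locked EXCL and is about
      * to repurpose or free it strips all remaining translations first
      * (the forceflag is ignored on x86):
      *
      *	ASSERT(PAGE_EXCL(pp));
      *	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
      */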
3484 
3485 /*
3486  * Unload all large mappings to pp and reduce by 1 p_szc field of every large
3487  * page level that included pp.
3488  *
3489  * pp must be locked EXCL. Even though no other constituent pages are locked
3490  * it's legal to unload large mappings to pp because all constituent pages of
3491  * large locked mappings have to be locked SHARED.  Therefore, if we have an
3492  * EXCL lock on one of the constituent pages, none of the large mappings to
3493  * pp are locked.
3494  *
3495  * Change (always decrease) p_szc field starting from the last constituent
3496  * page and ending with root constituent page so that root's pszc always shows
3497  * the area where hat_page_demote() may be active.
3498  *
3499  * This mechanism is only used for file system pages where it's not always
3500  * possible to get EXCL locks on all constituent pages to demote the size code
3501  * (as is done for anonymous or kernel large pages).
3502  */
3503 void
3504 hat_page_demote(page_t *pp)
3505 {
3506         uint_t          pszc;
3507         uint_t          rszc;
3508         uint_t          szc;
3509         page_t          *rootpp;
3510         page_t          *firstpp;
3511         page_t          *lastpp;
3512         pgcnt_t         pgcnt;
3513 
3514         ASSERT(PAGE_EXCL(pp));
3515         ASSERT(!PP_ISFREE(pp));
3516         ASSERT(page_szc_lock_assert(pp));
3517 
3518         if (pp->p_szc == 0)
3519                 return;
3520 
3521         rootpp = PP_GROUPLEADER(pp, 1);
3522         (void) hati_pageunload(rootpp, 1, HAT_FORCE_PGUNLOAD);
3523 
3524         /*
3525          * all large mappings to pp are gone
3526          * and no new ones can be set up since pp is locked exclusively.
3527          *
3528          * Lock the root to make sure there's only one hat_page_demote()
3529          * outstanding within the area of this root's pszc.
3530          *
3531          * Second potential hat_page_demote() is already eliminated by upper
3532          * VM layer via page_szc_lock() but we don't rely on it and use our
3533          * own locking (so that upper layer locking can be changed without
3534          * assumptions that hat depends on upper layer VM to prevent multiple
3535          * hat_page_demote() to be issued simultaneously to the same large
3536          * page).
3537          */
3538 again:
3539         pszc = pp->p_szc;
3540         if (pszc == 0)
3541                 return;
3542         rootpp = PP_GROUPLEADER(pp, pszc);
3543         x86_hm_enter(rootpp);
3544         /*
3545          * If root's p_szc is different from pszc we raced with another
3546          * hat_page_demote().  Drop the lock and try to find the root again.
3547          * If root's p_szc is greater than pszc, a previous hat_page_demote()
3548          * is not done yet.  Take and release the mlist lock of root's root to
3549          * wait for the previous hat_page_demote() to complete.
3550          */
3551         if ((rszc = rootpp->p_szc) != pszc) {
3552                 x86_hm_exit(rootpp);
3553                 if (rszc > pszc) {
3554                         /* p_szc of a locked non free page can't increase */
3555                         ASSERT(pp != rootpp);
3556 
3557                         rootpp = PP_GROUPLEADER(rootpp, rszc);
3558                         x86_hm_enter(rootpp);
3559                         x86_hm_exit(rootpp);
3560                 }
3561                 goto again;
3562         }
3563         ASSERT(pp->p_szc == pszc);
3564 
3565         /*
3566          * Decrement by 1 p_szc of every constituent page of a region that
3567          * covered pp. For example if original szc is 3 it gets changed to 2
3568          * everywhere except in region 2 that covered pp. Region 2 that
3569          * covered pp gets demoted to 1 everywhere except in region 1 that
3570          * covered pp. The region 1 that covered pp is demoted to region
3571          * 0. It's done this way because from region 3 we removed level 3
3572          * mappings, from region 2 that covered pp we removed level 2 mappings
3573          * and from region 1 that covered pp we removed level 1 mappings.  All
3574          * changes are done from high pfn's to low pfn's so that roots
3575          * are changed last allowing one to know the largest region where
3576          * hat_page_demote() is still active by only looking at the root page.
3577          *
3578          * This algorithm is implemented in 2 while loops. First loop changes
3579          * p_szc of pages to the right of pp's level 1 region and second
3580          * loop changes p_szc of pages of level 1 region that covers pp
3581          * and all pages to the left of level 1 region that covers pp.
3582          * In the first loop p_szc keeps dropping with every iteration
3583          * and in the second loop it keeps increasing with every iteration.
3584          *
3585          * First loop description: Demote pages to the right of pp outside of
3586          * level 1 region that covers pp.  In every iteration of the while
3587          * loop below find the last page of szc region and the first page of
3588          * (szc - 1) region that is immediately to the right of (szc - 1)
3589          * region that covers pp.  From last such page to first such page
3590          * change every page's szc to szc - 1. Decrement szc and continue
3591          * looping until szc is 1. If pp belongs to the last (szc - 1) region
3592          * of szc region skip to the next iteration.
3593          */
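             /*
              * Concrete illustration (for, say, pszc == 2): after the two loops
              * below, every constituent page of the level 2 region that covered
              * pp ends up with p_szc == 1, except the pages of the level 1
              * region that covered pp, which end up with p_szc == 0.  Since the
              * changes run from high pfn's to low pfn's, each region's root
              * page is the last one updated.
              */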
3594         szc = pszc;
3595         while (szc > 1) {
3596                 lastpp = PP_GROUPLEADER(pp, szc);
3597                 pgcnt = page_get_pagecnt(szc);
3598                 lastpp += pgcnt - 1;
3599                 firstpp = PP_GROUPLEADER(pp, (szc - 1));
3600                 pgcnt = page_get_pagecnt(szc - 1);
3601                 if (lastpp - firstpp < pgcnt) {
3602                         szc--;
3603                         continue;
3604                 }
3605                 firstpp += pgcnt;
3606                 while (lastpp != firstpp) {
3607                         ASSERT(lastpp->p_szc == pszc);
3608                         lastpp->p_szc = szc - 1;
3609                         lastpp--;
3610                 }
3611                 firstpp->p_szc = szc - 1;
3612                 szc--;
3613         }
3614 
3615         /*
3616          * Second loop description:
3617          * First iteration changes p_szc to 0 of every
3618          * page of level 1 region that covers pp.
3619          * Subsequent iterations find last page of szc region
3620          * immediately to the left of szc region that covered pp
3621          * and first page of (szc + 1) region that covers pp.
3622          * From last to first page change p_szc of every page to szc.
3623          * Increment szc and continue looping until szc is pszc.
3624          * If pp belongs to the first szc region of the (szc + 1) region,
3625          * skip to the next iteration.
3626          *
3627          */
3628         szc = 0;
3629         while (szc < pszc) {
3630                 firstpp = PP_GROUPLEADER(pp, (szc + 1));
3631                 if (szc == 0) {
3632                         pgcnt = page_get_pagecnt(1);
3633                         lastpp = firstpp + (pgcnt - 1);
3634                 } else {
3635                         lastpp = PP_GROUPLEADER(pp, szc);
3636                         if (firstpp == lastpp) {
3637                                 szc++;
3638                                 continue;
3639                         }
3640                         lastpp--;
3641                         pgcnt = page_get_pagecnt(szc);
3642                 }
3643                 while (lastpp != firstpp) {
3644                         ASSERT(lastpp->p_szc == pszc);
3645                         lastpp->p_szc = szc;
3646                         lastpp--;
3647                 }
3648                 firstpp->p_szc = szc;
3649                 if (firstpp == rootpp)
3650                         break;
3651                 szc++;
3652         }
3653         x86_hm_exit(rootpp);
3654 }
3655 
3656 /*
3657  * Get hw stats from hardware into the page struct and reset hw stats.
3658  * Returns the attributes of the page.
3659  * Flags for hat_pagesync, hat_getstat, hat_sync
3660  *
3661  * define       HAT_SYNC_ZERORM         0x01
3662  *
3663  * Additional flags for hat_pagesync
3664  *
3665  * define       HAT_SYNC_STOPON_REF     0x02
3666  * define       HAT_SYNC_STOPON_MOD     0x04
3667  * define       HAT_SYNC_STOPON_RM      0x06
3668  * define       HAT_SYNC_STOPON_SHARED  0x08
3669  */
3670 uint_t
3671 hat_pagesync(struct page *pp, uint_t flags)
3672 {
3673         hment_t         *hm = NULL;
3674         htable_t        *ht;
3675         uint_t          entry;
3676         x86pte_t        old, save_old;
3677         x86pte_t        new;
3678         uchar_t         nrmbits = P_REF|P_MOD|P_RO;
3679         extern ulong_t  po_share;
3680         page_t          *save_pp = pp;
3681         uint_t          pszc = 0;
3682 
3683         ASSERT(PAGE_LOCKED(pp) || panicstr);
3684 
3685         if (PP_ISRO(pp) && (flags & HAT_SYNC_STOPON_MOD))
3686                 return (pp->p_nrm & nrmbits);
3687 
3688         if ((flags & HAT_SYNC_ZERORM) == 0) {
3689 
3690                 if ((flags & HAT_SYNC_STOPON_REF) != 0 && PP_ISREF(pp))
3691                         return (pp->p_nrm & nrmbits);
3692 
3693                 if ((flags & HAT_SYNC_STOPON_MOD) != 0 && PP_ISMOD(pp))
3694                         return (pp->p_nrm & nrmbits);
3695 
3696                 if ((flags & HAT_SYNC_STOPON_SHARED) != 0 &&
3697                     hat_page_getshare(pp) > po_share) {
3698                         if (PP_ISRO(pp))
3699                                 PP_SETREF(pp);
3700                         return (pp->p_nrm & nrmbits);
3701                 }
3702         }
3703 
3704         XPV_DISALLOW_MIGRATE();
3705 next_size:
3706         /*
3707          * walk thru the mapping list syncing (and clearing) ref/mod bits.
3708          */
3709         x86_hm_enter(pp);
3710         while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) {
3711                 if (ht->ht_level < pszc)
3712                         continue;
3713                 old = x86pte_get(ht, entry);
3714 try_again:
3715 
3716                 ASSERT(PTE2PFN(old, ht->ht_level) == pp->p_pagenum);
3717 
3718                 if (PTE_GET(old, PT_REF | PT_MOD) == 0)
3719                         continue;
3720 
3721                 save_old = old;
3722                 if ((flags & HAT_SYNC_ZERORM) != 0) {
3723 
3724                         /*
3725                          * Need to clear ref or mod bits. Need to demap
3726                          * to make sure any executing TLBs see cleared bits.
3727                          */
3728                         new = old;
3729                         PTE_CLR(new, PT_REF | PT_MOD);
3730                         old = hati_update_pte(ht, entry, old, new);
3731                         if (old != 0)
3732                                 goto try_again;
3733 
3734                         old = save_old;
3735                 }
3736 
3737                 /*
3738                  * Sync the PTE
3739                  */
3740                 if (!(flags & HAT_SYNC_ZERORM) &&
3741                     PTE_GET(old, PT_SOFTWARE) <= PT_NOSYNC)
3742                         hati_sync_pte_to_page(pp, old, ht->ht_level);
3743 
3744                 /*
3745                  * can stop short if we found a ref'd or mod'd page
3746                  */
3747                 if (((flags & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
3748                     ((flags & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp))) {
3749                         x86_hm_exit(pp);
3750                         goto done;
3751                 }
3752         }
3753         x86_hm_exit(pp);
3754         while (pszc < pp->p_szc) {
3755                 page_t *tpp;
3756                 pszc++;
3757                 tpp = PP_GROUPLEADER(pp, pszc);
3758                 if (pp != tpp) {
3759                         pp = tpp;
3760                         goto next_size;
3761                 }
3762         }
3763 done:
3764         XPV_ALLOW_MIGRATE();
3765         return (save_pp->p_nrm & nrmbits);
3766 }
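
     /*
      * Usage sketch (hypothetical caller): a pageout-style consumer that only
      * wants to know whether the page is dirty, and wants the mapping walk to
      * stop as soon as a modified mapping is seen, can do:
      *
      *	if (hat_pagesync(pp, HAT_SYNC_STOPON_MOD) & P_MOD)
      *		...schedule pp for writeback...
      *
      * Adding HAT_SYNC_ZERORM would additionally clear the hardware ref/mod
      * bits in each mapping as it is visited.
      */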
3767 
3768 /*
3769  * returns approx number of mappings to this pp.  A return of 0 implies
3770  * there are no mappings to the page.
3771  */
3772 ulong_t
3773 hat_page_getshare(page_t *pp)
3774 {
3775         uint_t cnt;
3776         cnt = hment_mapcnt(pp);
3777 #if defined(__amd64)
3778         if (vpm_enable && pp->p_vpmref) {
3779                 cnt += 1;
3780         }
3781 #endif
3782         return (cnt);
3783 }
3784 
3785 /*
3786  * Return 1 if the number of mappings exceeds sh_thresh. Return 0
3787  * otherwise.
3788  */
3789 int
3790 hat_page_checkshare(page_t *pp, ulong_t sh_thresh)
3791 {
3792         return (hat_page_getshare(pp) > sh_thresh);
3793 }
3794 
3795 /*
3796  * hat_softlock isn't supported anymore
3797  */
3798 /*ARGSUSED*/
3799 faultcode_t
3800 hat_softlock(
3801         hat_t *hat,
3802         caddr_t addr,
3803         size_t *len,
3804         struct page **page_array,
3805         uint_t flags)
3806 {
3807         return (FC_NOSUPPORT);
3808 }
3809 
3810 
3811 
3812 /*
3813  * Routine to expose supported HAT features to platform independent code.
3814  */
3815 /*ARGSUSED*/
3816 int
3817 hat_supported(enum hat_features feature, void *arg)
3818 {
3819         switch (feature) {
3820 
3821         case HAT_SHARED_PT:     /* this is really ISM */
3822                 return (1);
3823 
3824         case HAT_DYNAMIC_ISM_UNMAP:
3825                 return (0);
3826 
3827         case HAT_VMODSORT:
3828                 return (1);
3829 
3830         case HAT_SHARED_REGIONS:
3831                 return (0);
3832 
3833         default:
3834                 panic("hat_supported() - unknown feature");
3835         }
3836         return (0);
3837 }
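
     /*
      * Platform independent code gates optional behavior on these answers;
      * a sketch (the surrounding logic is hypothetical):
      *
      *	if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
      *		...allow partial unmapping of an ISM segment...
      *	else
      *		...only whole-segment unmaps are possible...
      */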
3838 
3839 /*
3840  * Called when a thread is exiting and has been switched to the kernel AS
3841  */
3842 void
3843 hat_thread_exit(kthread_t *thd)
3844 {
3845         ASSERT(thd->t_procp->p_as == &kas);
3846         XPV_DISALLOW_MIGRATE();
3847         hat_switch(thd->t_procp->p_as->a_hat);
3848         XPV_ALLOW_MIGRATE();
3849 }
3850 
3851 /*
3852  * Setup the given brand new hat structure as the new HAT on this cpu's mmu.
3853  */
3854 /*ARGSUSED*/
3855 void
3856 hat_setup(hat_t *hat, int flags)
3857 {
3858         XPV_DISALLOW_MIGRATE();
3859         kpreempt_disable();
3860 
3861         hat_switch(hat);
3862 
3863         kpreempt_enable();
3864         XPV_ALLOW_MIGRATE();
3865 }
3866 
3867 /*
3868  * Prepare for a CPU private mapping for the given address.
3869  *
3870  * The address can only be used from a single CPU and can be remapped
3871  * using hat_mempte_remap().  Return the physical address of the PTE.
3872  *
3873  * We do the htable_create() if necessary and increment the valid count so
3874  * the htable can't disappear.  The returned hat_mempte_t is the physical
3875  * address of the PTE, which hat_mempte_remap() later uses to update it.
3876  */
3877 hat_mempte_t
3878 hat_mempte_setup(caddr_t addr)
3879 {
3880         uintptr_t       va = (uintptr_t)addr;
3881         htable_t        *ht;
3882         uint_t          entry;
3883         x86pte_t        oldpte;
3884         hat_mempte_t    p;
3885 
3886         ASSERT(IS_PAGEALIGNED(va));
3887         ASSERT(!IN_VA_HOLE(va));
3888         ++curthread->t_hatdepth;
3889         XPV_DISALLOW_MIGRATE();
3890         ht = htable_getpte(kas.a_hat, va, &entry, &oldpte, 0);
3891         if (ht == NULL) {
3892                 ht = htable_create(kas.a_hat, va, 0, NULL);
3893                 entry = htable_va2entry(va, ht);
3894                 ASSERT(ht->ht_level == 0);
3895                 oldpte = x86pte_get(ht, entry);
3896         }
3897         if (PTE_ISVALID(oldpte))
3898                 panic("hat_mempte_setup(): address already mapped "
3899                     "ht=%p, entry=%d, pte=" FMT_PTE, (void *)ht, entry, oldpte);
3900 
3901         /*
3902          * increment ht_valid_cnt so that the pagetable can't disappear
3903          */
3904         HTABLE_INC(ht->ht_valid_cnt);
3905 
3906         /*
3907          * return the PTE physical address to the caller.
3908          */
3909         htable_release(ht);
3910         XPV_ALLOW_MIGRATE();
3911         p = PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry);
3912         --curthread->t_hatdepth;
3913         return (p);
3914 }
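
     /*
      * Lifecycle sketch (va, pfn, and the attr/flag values here are only
      * illustrative):
      *
      *	hat_mempte_t pte_pa = hat_mempte_setup(va);
      *
      *	kpreempt_disable();
      *	hat_mempte_remap(pfn, va, pte_pa,
      *	    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
      *	...access the page through va on this CPU only...
      *	kpreempt_enable();
      *
      *	hat_mempte_release(va, pte_pa);
      */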
3915 
3916 /*
3917  * Release a CPU private mapping for the given address.
3918  * We decrement the htable valid count so it might be destroyed.
3919  */
3920 /*ARGSUSED1*/
3921 void
3922 hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
3923 {
3924         htable_t        *ht;
3925 
3926         XPV_DISALLOW_MIGRATE();
3927         /*
3928          * invalidate any left over mapping and decrement the htable valid count
3929          */
3930 #ifdef __xpv
3931         if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0,
3932             UVMF_INVLPG | UVMF_LOCAL))
3933                 panic("HYPERVISOR_update_va_mapping() failed");
3934 #else
3935         {
3936                 x86pte_t *pteptr;
3937 
3938                 pteptr = x86pte_mapin(mmu_btop(pte_pa),
3939                     (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
3940                 if (mmu.pae_hat)
3941                         *pteptr = 0;
3942                 else
3943                         *(x86pte32_t *)pteptr = 0;
3944                 mmu_tlbflush_entry(addr);
3945                 x86pte_mapout();
3946         }
3947 #endif
3948 
3949         ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0);
3950         if (ht == NULL)
3951                 panic("hat_mempte_release(): invalid address");
3952         ASSERT(ht->ht_level == 0);
3953         HTABLE_DEC(ht->ht_valid_cnt);
3954         htable_release(ht);
3955         XPV_ALLOW_MIGRATE();
3956 }
3957 
3958 /*
3959  * Apply a temporary CPU private mapping to a page. We flush the TLB only
3960  * on this CPU, so this ought to be called with preemption disabled.
3961  */
3962 void
3963 hat_mempte_remap(
3964         pfn_t           pfn,
3965         caddr_t         addr,
3966         hat_mempte_t    pte_pa,
3967         uint_t          attr,
3968         uint_t          flags)
3969 {
3970         uintptr_t       va = (uintptr_t)addr;
3971         x86pte_t        pte;
3972 
3973         /*
3974          * Remap the given PTE to the new page's PFN. Invalidate only
3975          * on this CPU.
3976          */
3977 #ifdef DEBUG
3978         htable_t        *ht;
3979         uint_t          entry;
3980 
3981         ASSERT(IS_PAGEALIGNED(va));
3982         ASSERT(!IN_VA_HOLE(va));
3983         ht = htable_getpte(kas.a_hat, va, &entry, NULL, 0);
3984         ASSERT(ht != NULL);
3985         ASSERT(ht->ht_level == 0);
3986         ASSERT(ht->ht_valid_cnt > 0);
3987         ASSERT(ht->ht_pfn == mmu_btop(pte_pa));
3988         htable_release(ht);
3989 #endif
3990         XPV_DISALLOW_MIGRATE();
3991         pte = hati_mkpte(pfn, attr, 0, flags);
3992 #ifdef __xpv
3993         if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL))
3994                 panic("HYPERVISOR_update_va_mapping() failed");
3995 #else
3996         {
3997                 x86pte_t *pteptr;
3998 
3999                 pteptr = x86pte_mapin(mmu_btop(pte_pa),
4000                     (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
4001                 if (mmu.pae_hat)
4002                         *(x86pte_t *)pteptr = pte;
4003                 else
4004                         *(x86pte32_t *)pteptr = (x86pte32_t)pte;
4005                 mmu_tlbflush_entry(addr);
4006                 x86pte_mapout();
4007         }
4008 #endif
4009         XPV_ALLOW_MIGRATE();
4010 }
4011 
4012 
4013 
4014 /*
4015  * Hat locking functions
4016  * XXX - these two functions are currently being used by hatstats;
4017  *      they can be removed by using a per-as mutex for hatstats.
4018  */
4019 void
4020 hat_enter(hat_t *hat)
4021 {
4022         mutex_enter(&hat->hat_mutex);
4023 }
4024 
4025 void
4026 hat_exit(hat_t *hat)
4027 {
4028         mutex_exit(&hat->hat_mutex);
4029 }
4030 
4031 /*
4032  * HAT part of cpu initialization.
4033  */
4034 void
4035 hat_cpu_online(struct cpu *cpup)
4036 {
4037         if (cpup != CPU) {
4038                 x86pte_cpu_init(cpup);
4039                 hat_vlp_setup(cpup);
4040         }
4041         CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id);
4042 }
4043 
4044 /*
4045  * HAT part of cpu deletion.
4046  * (currently, we only call this after the cpu is safely passivated.)
4047  */
4048 void
4049 hat_cpu_offline(struct cpu *cpup)
4050 {
4051         ASSERT(cpup != CPU);
4052 
4053         CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id);
4054         hat_vlp_teardown(cpup);
4055         x86pte_cpu_fini(cpup);
4056 }
4057 
4058 /*
4059  * Function called after all CPUs are brought online.
4060  * Used to remove low address boot mappings.
4061  */
4062 void
4063 clear_boot_mappings(uintptr_t low, uintptr_t high)
4064 {
4065         uintptr_t vaddr = low;
4066         htable_t *ht = NULL;
4067         level_t level;
4068         uint_t entry;
4069         x86pte_t pte;
4070 
4071         /*
4072          * On the 1st CPU we can unload the prom mappings; basically we blow
4073          * away all virtual mappings under _userlimit.
4074          */
4075         while (vaddr < high) {
4076                 pte = htable_walk(kas.a_hat, &ht, &vaddr, high);
4077                 if (ht == NULL)
4078                         break;
4079 
4080                 level = ht->ht_level;
4081                 entry = htable_va2entry(vaddr, ht);
4082                 ASSERT(level <= mmu.max_page_level);
4083                 ASSERT(PTE_ISPAGE(pte, level));
4084 
4085                 /*
4086                  * Unload the mapping from the page tables.
4087                  */
4088                 (void) x86pte_inval(ht, entry, 0, NULL, B_TRUE);
4089                 ASSERT(ht->ht_valid_cnt > 0);
4090                 HTABLE_DEC(ht->ht_valid_cnt);
4091                 PGCNT_DEC(ht->ht_hat, ht->ht_level);
4092 
4093                 vaddr += LEVEL_SIZE(ht->ht_level);
4094         }
4095         if (ht)
4096                 htable_release(ht);
4097 }
4098 
4099 /*
4100  * Atomically update a new translation for a single page.  If the
4101  * currently installed PTE doesn't match the value we expect to find,
4102  * it's not updated and we return the PTE we found.
4103  *
4104  * If activating nosync or NOWRITE and the page was modified we need to sync
4105  * with the page_t. Also sync with page_t if clearing ref/mod bits.
4106  */
4107 static x86pte_t
4108 hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, x86pte_t new)
4109 {
4110         page_t          *pp;
4111         uint_t          rm = 0;
4112         x86pte_t        replaced;
4113 
4114         if (PTE_GET(expected, PT_SOFTWARE) < PT_NOSYNC &&
4115             PTE_GET(expected, PT_MOD | PT_REF) &&
4116             (PTE_GET(new, PT_NOSYNC) || !PTE_GET(new, PT_WRITABLE) ||
4117             !PTE_GET(new, PT_MOD | PT_REF))) {
4118 
4119                 ASSERT(!pfn_is_foreign(PTE2PFN(expected, ht->ht_level)));
4120                 pp = page_numtopp_nolock(PTE2PFN(expected, ht->ht_level));
4121                 ASSERT(pp != NULL);
4122                 if (PTE_GET(expected, PT_MOD))
4123                         rm |= P_MOD;
4124                 if (PTE_GET(expected, PT_REF))
4125                         rm |= P_REF;
4126                 PTE_CLR(new, PT_MOD | PT_REF);
4127         }
4128 
4129         replaced = x86pte_update(ht, entry, expected, new);
4130         if (replaced != expected)
4131                 return (replaced);
4132 
4133         if (rm) {
4134                 /*
4135                  * sync to all constituent pages of a large page
4136                  */
4137                 pgcnt_t pgcnt = page_get_pagecnt(ht->ht_level);
4138                 ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt));
4139                 while (pgcnt-- > 0) {
4140                         /*
4141                          * hat_page_demote() can't decrease
4142                          * pszc below this mapping size
4143                          * since large mapping existed after we
4144                          * took mlist lock.
4145                          */
4146                         ASSERT(pp->p_szc >= ht->ht_level);
4147                         hat_page_setattr(pp, rm);
4148                         ++pp;
4149                 }
4150         }
4151 
4152         return (0);
4153 }
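
     /*
      * Callers use this in a compare-and-swap style retry loop; a condensed
      * sketch of the pattern used by hati_page_clrwrt() and hat_pagesync():
      *
      *	old = x86pte_get(ht, entry);
      *	for (;;) {
      *		new = old;
      *		PTE_CLR(new, PT_REF | PT_MOD);
      *		old = hati_update_pte(ht, entry, old, new);
      *		if (old == 0)
      *			break;
      *		...revalidate old (same pfn? still of interest?)...
      *	}
      */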
4154 
4155 /* ARGSUSED */
4156 void
4157 hat_join_srd(struct hat *hat, vnode_t *evp)
4158 {
4159 }
4160 
4161 /* ARGSUSED */
4162 hat_region_cookie_t
4163 hat_join_region(struct hat *hat,
4164     caddr_t r_saddr,
4165     size_t r_size,
4166     void *r_obj,
4167     u_offset_t r_objoff,
4168     uchar_t r_perm,
4169     uchar_t r_pgszc,
4170     hat_rgn_cb_func_t r_cb_function,
4171     uint_t flags)
4172 {
4173         panic("No shared region support on x86");
4174         return (HAT_INVALID_REGION_COOKIE);
4175 }
4176 
4177 /* ARGSUSED */
4178 void
4179 hat_leave_region(struct hat *hat, hat_region_cookie_t rcookie, uint_t flags)
4180 {
4181         panic("No shared region support on x86");
4182 }
4183 
4184 /* ARGSUSED */
4185 void
4186 hat_dup_region(struct hat *hat, hat_region_cookie_t rcookie)
4187 {
4188         panic("No shared region support on x86");
4189 }
4190 
4191 
4192 /*
4193  * Kernel Physical Mapping (kpm) facility
4194  *
4195  * Most of the routines needed to support segkpm are almost no-ops on the
4196  * x86 platform.  We map in the entire segment when it is created and leave
4197  * it mapped in, so there is no additional work required to set up and tear
4198  * down individual mappings.  All of these routines were created to support
4199  * SPARC platforms that have to avoid aliasing in their virtually indexed
4200  * caches.
4201  *
4202  * Most of the routines have sanity checks in them (e.g. verifying that the
4203  * passed-in page is locked).  We don't actually care about most of these
4204  * checks on x86, but we leave them in place to identify problems in the
4205  * upper levels.
4206  */
4207 
4208 /*
4209  * Map in a locked page and return the vaddr.
4210  */
4211 /*ARGSUSED*/
4212 caddr_t
4213 hat_kpm_mapin(struct page *pp, struct kpme *kpme)
4214 {
4215         caddr_t         vaddr;
4216 
4217 #ifdef DEBUG
4218         if (kpm_enable == 0) {
4219                 cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set\n");
4220                 return ((caddr_t)NULL);
4221         }
4222 
4223         if (pp == NULL || PAGE_LOCKED(pp) == 0) {
4224                 cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked\n");
4225                 return ((caddr_t)NULL);
4226         }
4227 #endif
4228 
4229         vaddr = hat_kpm_page2va(pp, 1);
4230 
4231         return (vaddr);
4232 }
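
     /*
      * Typical usage sketch: since segkpm keeps all of physical memory
      * mapped, mapin/mapout only compute the address:
      *
      *	caddr_t va = hat_kpm_mapin(pp, NULL);
      *	bzero(va, MMU_PAGESIZE);
      *	hat_kpm_mapout(pp, NULL, va);
      *
      * (pp must be locked, per the DEBUG checks above; the kpme argument is
      * ignored on x86.)
      */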
4233 
4234 /*
4235  * Mapout a locked page.
4236  */
4237 /*ARGSUSED*/
4238 void
4239 hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr)
4240 {
4241 #ifdef DEBUG
4242         if (kpm_enable == 0) {
4243                 cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set\n");
4244                 return;
4245         }
4246 
4247         if (IS_KPM_ADDR(vaddr) == 0) {
4248                 cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address\n");
4249                 return;
4250         }
4251 
4252         if (pp == NULL || PAGE_LOCKED(pp) == 0) {
4253                 cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked\n");
4254                 return;
4255         }
4256 #endif
4257 }
4258 
4259 /*
4260  * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
4261  * memory addresses that are not described by a page_t.  It can
4262  * also be used for normal pages that are not locked, but beware
4263  * this is dangerous - no locking is performed, so the identity of
4264  * the page could change.  hat_kpm_mapin_pfn is not supported when
4265  * vac_colors > 1, because the chosen va depends on the page identity,
4266  * which could change.
4267  * The caller must only pass pfn's for valid physical addresses; violation
4268  * of this rule will cause panic.
4269  */
4270 caddr_t
4271 hat_kpm_mapin_pfn(pfn_t pfn)
4272 {
4273         caddr_t paddr, vaddr;
4274 
4275         if (kpm_enable == 0)
4276                 return ((caddr_t)NULL);
4277 
4278         paddr = (caddr_t)ptob(pfn);
4279         vaddr = (uintptr_t)kpm_vbase + paddr;
4280 
4281         return ((caddr_t)vaddr);
4282 }
4283 
4284 /*ARGSUSED*/
4285 void
4286 hat_kpm_mapout_pfn(pfn_t pfn)
4287 {
4288         /* empty */
4289 }
4290 
4291 /*
4292  * Return the kpm virtual address for a specific pfn
4293  */
4294 caddr_t
4295 hat_kpm_pfn2va(pfn_t pfn)
4296 {
4297         uintptr_t vaddr = (uintptr_t)kpm_vbase + mmu_ptob(pfn);
4298 
4299         ASSERT(!pfn_is_foreign(pfn));
4300         return ((caddr_t)vaddr);
4301 }
4302 
4303 /*
4304  * Return the kpm virtual address for the page at pp.
4305  */
4306 /*ARGSUSED*/
4307 caddr_t
4308 hat_kpm_page2va(struct page *pp, int checkswap)
4309 {
4310         return (hat_kpm_pfn2va(pp->p_pagenum));
4311 }
4312 
4313 /*
4314  * Return the page frame number for the kpm virtual address vaddr.
4315  */
4316 pfn_t
4317 hat_kpm_va2pfn(caddr_t vaddr)
4318 {
4319         pfn_t           pfn;
4320 
4321         ASSERT(IS_KPM_ADDR(vaddr));
4322 
4323         pfn = (pfn_t)btop(vaddr - kpm_vbase);
4324 
4325         return (pfn);
4326 }
4327 
4328 
4329 /*
4330  * Return the page for the kpm virtual address vaddr.
4331  */
4332 page_t *
4333 hat_kpm_vaddr2page(caddr_t vaddr)
4334 {
4335         pfn_t           pfn;
4336 
4337         ASSERT(IS_KPM_ADDR(vaddr));
4338 
4339         pfn = hat_kpm_va2pfn(vaddr);
4340 
4341         return (page_numtopp_nolock(pfn));
4342 }
4343 
4344 /*
4345  * hat_kpm_fault is called from segkpm_fault when we take a page fault on a
4346  * KPM page.  This should never happen on x86
4347  */
4348 int
4349 hat_kpm_fault(hat_t *hat, caddr_t vaddr)
4350 {
4351         panic("pagefault in seg_kpm.  hat: 0x%p  vaddr: 0x%p",
4352             (void *)hat, (void *)vaddr);
4353 
4354         return (0);
4355 }
4356 
4357 /*ARGSUSED*/
4358 void
4359 hat_kpm_mseghash_clear(int nentries)
4360 {}
4361 
4362 /*ARGSUSED*/
4363 void
4364 hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp)
4365 {}
4366 
4367 #ifndef __xpv
4368 void
4369 hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs,
4370         offset_t kpm_pages_off)
4371 {
4372         _NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off));
4373         pfn_t base, end;
4374 
4375         /*
4376          * kphysm_add_memory_dynamic() does not set nkpmpgs
4377          * when page_t memory is externally allocated.  That
4378          * code must properly calculate nkpmpgs in all cases
4379          * if nkpmpgs needs to be used at some point.
4380          */
4381 
4382         /*
4383          * The meta (page_t) pages for dynamically added memory are allocated
4384          * either from the incoming memory itself or from existing memory.
4385          * In the former case the base of the incoming pages will be different
4386          * than the base of the dynamic segment so call memseg_get_start() to
4387          * get the actual base of the incoming memory for each case.
4388          */
4389 
4390         base = memseg_get_start(msp);
4391         end = msp->pages_end;
4392 
4393         hat_devload(kas.a_hat, kpm_vbase + mmu_ptob(base),
4394             mmu_ptob(end - base), base, PROT_READ | PROT_WRITE,
4395             HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
4396 }
4397 
4398 void
4399 hat_kpm_addmem_mseg_insert(struct memseg *msp)
4400 {
4401         _NOTE(ARGUNUSED(msp));
4402 }
4403 
4404 void
4405 hat_kpm_addmem_memsegs_update(struct memseg *msp)
4406 {
4407         _NOTE(ARGUNUSED(msp));
4408 }
4409 
4410 /*
4411  * Return end of metadata for an already setup memseg.
4412  * X86 platforms don't need per-page meta data to support kpm.
4413  */
4414 caddr_t
4415 hat_kpm_mseg_reuse(struct memseg *msp)
4416 {
4417         return ((caddr_t)msp->epages);
4418 }
4419 
4420 void
4421 hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp)
4422 {
4423         _NOTE(ARGUNUSED(msp, mspp));
4424         ASSERT(0);
4425 }
4426 
4427 void
4428 hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp,
4429         struct memseg *lo, struct memseg *mid, struct memseg *hi)
4430 {
4431         _NOTE(ARGUNUSED(msp, mspp, lo, mid, hi));
4432         ASSERT(0);
4433 }
4434 
4435 /*
4436  * Walk the memsegs chain, applying func to each memseg span.
4437  */
4438 void
4439 hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg)
4440 {
4441         pfn_t   pbase, pend;
4442         void    *base;
4443         size_t  size;
4444         struct memseg *msp;
4445 
4446         for (msp = memsegs; msp; msp = msp->next) {
4447                 pbase = msp->pages_base;
4448                 pend = msp->pages_end;
4449                 base = ptob(pbase) + kpm_vbase;
4450                 size = ptob(pend - pbase);
4451                 func(arg, base, size);
4452         }
4453 }
4454 
4455 #else   /* __xpv */
4456 
4457 /*
4458  * There are specific Hypervisor calls to establish and remove mappings
4459  * to grant table references and the privcmd driver. We have to ensure
4460  * that a page table actually exists.
4461  */
4462 void
4463 hat_prepare_mapping(hat_t *hat, caddr_t addr, uint64_t *pte_ma)
4464 {
4465         maddr_t base_ma;
4466         htable_t *ht;
4467         uint_t entry;
4468 
4469         ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
4470         XPV_DISALLOW_MIGRATE();
4471         ht = htable_create(hat, (uintptr_t)addr, 0, NULL);
4472 
4473         /*
4474          * if an address for pte_ma is passed in, return the MA of the pte
4475          * for this specific address.  This address is only valid as long
4476          * as the htable stays locked.
4477          */
4478         if (pte_ma != NULL) {
4479                 entry = htable_va2entry((uintptr_t)addr, ht);
4480                 base_ma = pa_to_ma(ptob(ht->ht_pfn));
4481                 *pte_ma = base_ma + (entry << mmu.pte_size_shift);
4482         }
4483         XPV_ALLOW_MIGRATE();
4484 }
4485 
4486 void
4487 hat_release_mapping(hat_t *hat, caddr_t addr)
4488 {
4489         htable_t *ht;
4490 
4491         ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
4492         XPV_DISALLOW_MIGRATE();
4493         ht = htable_lookup(hat, (uintptr_t)addr, 0);
4494         ASSERT(ht != NULL);
4495         ASSERT(ht->ht_busy >= 2);
4496         htable_release(ht);
4497         htable_release(ht);
4498         XPV_ALLOW_MIGRATE();
4499 }
4500 #endif  /* __xpv */