9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
  31  */
  32 
  33 /*
  34  * VM - Hardware Address Translation management for i386 and amd64
  35  *
  36  * Implementation of the interfaces described in <common/vm/hat.h>
  37  *
  38  * Nearly all the details of how the hardware is managed should not be
  39  * visible outside this layer except for misc. machine specific functions
  40  * that work in conjunction with this code.
  41  *
  42  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  43  */
  44 
  45 #include <sys/machparam.h>
  46 #include <sys/machsystm.h>
  47 #include <sys/mman.h>
  48 #include <sys/types.h>
  49 #include <sys/systm.h>
 
 
 248         uint_t                  r;
 249         hat_kernel_range_t      *rp;
 250         uintptr_t               va;
 251         uintptr_t               eva;
 252         uint_t                  start;
 253         uint_t                  cnt;
 254         htable_t                *src;
 255 
 256         /*
 257          * Once we start creating user process HATs we can enable
 258          * the htable_steal() code.
 259          */
 260         if (can_steal_post_boot == 0)
 261                 can_steal_post_boot = 1;
 262 
 263         ASSERT(AS_WRITE_HELD(as));
 264         hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
 265         hat->hat_as = as;
 266         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 267         ASSERT(hat->hat_flags == 0);
 268 
 269 #if defined(__xpv)
 270         /*
 271          * No VLP stuff on the hypervisor due to the 64-bit split top level
 272          * page tables.  On 32-bit it's not needed as the hypervisor takes
 273          * care of copying the top level PTEs to a below 4Gig page.
 274          */
 275         use_vlp = 0;
 276 #else   /* __xpv */
 277         /* 32 bit processes uses a VLP style hat when running with PAE */
 278 #if defined(__amd64)
 279         use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
 280 #elif defined(__i386)
 281         use_vlp = mmu.pae_hat;
 282 #endif
 283 #endif  /* __xpv */
 284         if (use_vlp) {
 285                 hat->hat_flags = HAT_VLP;
 286                 bzero(hat->hat_vlp_ptes, VLP_SIZE);
 287         }
 
 384         mutex_exit(&hat_list_lock);
 385 
 386         return (hat);
 387 }
 388 
 389 /*
 390  * The process has finished executing, but its address space (as) has not
      * been cleaned up yet.
      *
      * Marks the given hat as HAT_FREEING.  The flag is set while holding
      * hat_list_lock, and only once HAT_VICTIM is no longer set in hat_flags;
      * until then the caller sleeps on hat_list_cv.
      *
      * The caller must hold the address space lock as writer (asserted below).
 391  */
 392 /*ARGSUSED*/
 393 void
 394 hat_free_start(hat_t *hat)
 395 {
 396         ASSERT(AS_WRITE_HELD(hat->hat_as));
 397 
 398         /*
 399          * If the hat is currently a stealing victim, wait for the stealing
 400          * to finish.  Once we mark it as HAT_FREEING, htable_steal()
 401          * won't look at its pagetables anymore.
 402          */
 403         mutex_enter(&hat_list_lock);
             /* the flag is re-tested after every wakeup on hat_list_cv */
 404         while (hat->hat_flags & HAT_VICTIM)
 405                 cv_wait(&hat_list_cv, &hat_list_lock);
 406         hat->hat_flags |= HAT_FREEING;
 407         mutex_exit(&hat_list_lock);
 408 }
 409 
 410 /*
 411  * An address space is being destroyed, so we destroy the associated hat.
 412  */
 413 void
 414 hat_free_end(hat_t *hat)
 415 {
 416         kmem_cache_t *cache;
 417 
 418         ASSERT(hat->hat_flags & HAT_FREEING);
 419 
 420         /*
 421          * must not be running on the given hat
 422          */
 423         ASSERT(CPU->cpu_current_hat != hat);
 424 
 
2445                  */
2446                 entry = htable_va2entry(vaddr, ht);
2447                 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE);
2448                 ASSERT(ht->ht_level <= mmu.max_page_level);
2449                 vaddr += LEVEL_SIZE(ht->ht_level);
2450                 contig_va = vaddr;
2451                 ++r[r_cnt - 1].rng_cnt;
2452         }
2453         if (ht)
2454                 htable_release(ht);
2455 
2456         /*
2457          * handle last range for callbacks
2458          */
2459         if (r_cnt > 0)
2460                 handle_ranges(hat, cb, r_cnt, r);
2461         XPV_ALLOW_MIGRATE();
2462 }
2463 
2464 /*
2465  * Invalidate a virtual address translation on a slave CPU during
2466  * panic() dumps.
      *
      * Walks the range [va, va + size) and flushes the TLB entry for each
      * address visited, advancing by the size that hat_getpagesize() reports
      * for that address.  If hat_getpagesize() returns a negative value, the
      * whole TLB is flushed instead and the walk stops.
      *
      *	hat	- hat passed to hat_getpagesize() for the lookups
      *	va	- first virtual address to flush
      *	size	- length of the range in bytes
2467  */
2468 void
2469 hat_flush_range(hat_t *hat, caddr_t va, size_t size)
2470 {
2471         ssize_t sz;
2472         caddr_t endva = va + size;
2473 
2474         while (va < endva) {
2475                 sz = hat_getpagesize(hat, va);
                     /*
                      * Negative size: presumably nothing is mapped at va
                      * (TODO confirm against hat_getpagesize()).  We cannot
                      * tell how far to step, so flush everything and quit.
                      */
2476                 if (sz < 0) {
2477 #ifdef __xpv
2478                         xen_flush_tlb();
2479 #else
2480                         flush_all_tlb_entries();
2481 #endif
2482                         break;
2483                 }
                     /* flush just this address, then step over its mapping */
2484 #ifdef __xpv
2485                 xen_flush_va(va);
2486 #else
2487                 mmu_tlbflush_entry(va);
2488 #endif
                     /* NOTE(review): a size of 0 here would never advance va */
2489                 va += sz;
2490         }
2491 }
2492 
2493 /*
2494  * synchronize mapping with software data structures
2495  *
2496  * This interface is currently only used by the working set monitor
2497  * driver.
2498  */
2499 /*ARGSUSED*/
2500 void
2501 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2502 {
2503         uintptr_t       vaddr = (uintptr_t)addr;
2504         uintptr_t       eaddr = vaddr + len;
2505         htable_t        *ht = NULL;
2506         uint_t          entry;
2507         x86pte_t        pte;
2508         x86pte_t        save_pte;
2509         x86pte_t        new;
2510         page_t          *pp;
 
3304                  * a fault when a page is made dirty. At this point
3305                  * we need to remove write permission from all mappings
3306                  * to this page.
3307                  */
3308                 hati_page_clrwrt(pp);
3309         }
3310 }
3311 
3312 /*
3313  *      If flag is specified, returns 0 if attribute is disabled
3314  *      and non zero if enabled.  If flag specifies multiple attributes
3315  *      then returns 0 if ALL attributes are disabled.  This is an advisory
3316  *      call.
      *
      *      The result is whatever PP_GETRM(pp, flag) evaluates to; no lock
      *      is taken in this function.
3317  */
3318 uint_t
3319 hat_page_getattr(struct page *pp, uint_t flag)
3320 {
3321         return (PP_GETRM(pp, flag));
3322 }
3323 
3324 
3325 /*
3326  * common code used by hat_pageunload() and hment_steal()
      *
      * Removes the single mapping of page "pp" found at index "entry" of
      * htable "ht":
      *   - invalidates the PTE (panics if the old PTE's PFN is not
      *     pp->p_pagenum),
      *   - decrements the htable's valid count and the hat's page count,
      *   - syncs ref/mod bits to the page_t unless the PTE's software bits
      *     are PT_NOSYNC or above,
      *   - removes the hment from the page's mapping list.
      *
      * This function calls x86_hm_exit(pp) before returning, so the caller
      * presumably enters holding the mapping list lock taken with
      * x86_hm_enter() -- confirm against the callers.
      *
      * Returns the value returned by hment_remove().
3327  */
3328 hment_t *
3329 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
3330 {
3331         x86pte_t old_pte;
3332         pfn_t pfn = pp->p_pagenum;
3333         hment_t *hm;
3334 
3335         /*
3336          * We need to acquire a hold on the htable in order to
3337          * do the invalidate. We know the htable must exist, since
3338          * unmap's don't release the htable until after removing any
3339          * hment. Having x86_hm_enter() keeps that from proceeding.
3340          */
3341         htable_acquire(ht);
3342 
3343         /*
3344          * Invalidate the PTE and remove the hment.
              * The old PTE must have pointed at this page's PFN; anything
              * else is treated as fatal.
3345          */
3346         old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE);
3347         if (PTE2PFN(old_pte, ht->ht_level) != pfn) {
3348                 panic("x86pte_inval() failure found PTE = " FMT_PTE
3349                     " pfn being unmapped is %lx ht=0x%lx entry=0x%x",
3350                     old_pte, pfn, (uintptr_t)ht, entry);
3351         }
3352 
3353         /*
3354          * Clean up all the htable information for this mapping
3355          */
3356         ASSERT(ht->ht_valid_cnt > 0);
3357         HTABLE_DEC(ht->ht_valid_cnt);
3358         PGCNT_DEC(ht->ht_hat, ht->ht_level);
3359 
3360         /*
3361          * sync ref/mod bits to the page_t
              * (skipped when the PTE's software bits are PT_NOSYNC or higher)
3362          */
3363         if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC)
3364                 hati_sync_pte_to_page(pp, old_pte, ht->ht_level);
3365 
3366         /*
3367          * Remove the mapping list entry for this page.
3368          */
3369         hm = hment_remove(pp, ht, entry);
3370 
3371         /*
3372          * drop the mapping list lock so that we might free the
3373          * hment and htable.
3374          */
3375         x86_hm_exit(pp);
3376         htable_release(ht);
3377         return (hm);
3378 }
3379 
3380 extern int      vpm_enable;
3381 /*
3382  * Unload all translations to a page. If the page is a subpage of a large
3383  * page, the large page mappings are also removed.
3384  *
3385  * The forceflags are unused.
3386  */
3387 
3388 /*ARGSUSED*/
3389 static int
3390 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
3391 {
3392         page_t          *cur_pp = pp;
3393         hment_t         *hm;
3394         hment_t         *prev;
3395         htable_t        *ht;
3396         uint_t          entry;
 
 | 
 
 
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2019 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright 2018 Joyent, Inc.  All rights reserved.
  31  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
  32  */
  33 
  34 /*
  35  * VM - Hardware Address Translation management for i386 and amd64
  36  *
  37  * Implementation of the interfaces described in <common/vm/hat.h>
  38  *
  39  * Nearly all the details of how the hardware is managed should not be
  40  * visible outside this layer except for misc. machine specific functions
  41  * that work in conjunction with this code.
  42  *
  43  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  44  */
  45 
  46 #include <sys/machparam.h>
  47 #include <sys/machsystm.h>
  48 #include <sys/mman.h>
  49 #include <sys/types.h>
  50 #include <sys/systm.h>
 
 
 249         uint_t                  r;
 250         hat_kernel_range_t      *rp;
 251         uintptr_t               va;
 252         uintptr_t               eva;
 253         uint_t                  start;
 254         uint_t                  cnt;
 255         htable_t                *src;
 256 
 257         /*
 258          * Once we start creating user process HATs we can enable
 259          * the htable_steal() code.
 260          */
 261         if (can_steal_post_boot == 0)
 262                 can_steal_post_boot = 1;
 263 
 264         ASSERT(AS_WRITE_HELD(as));
 265         hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
 266         hat->hat_as = as;
 267         mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
 268         ASSERT(hat->hat_flags == 0);
 269         hat->hat_unmaps = 0;
 270 
 271 #if defined(__xpv)
 272         /*
 273          * No VLP stuff on the hypervisor due to the 64-bit split top level
 274          * page tables.  On 32-bit it's not needed as the hypervisor takes
 275          * care of copying the top level PTEs to a below 4Gig page.
 276          */
 277         use_vlp = 0;
 278 #else   /* __xpv */
 279         /* 32 bit processes uses a VLP style hat when running with PAE */
 280 #if defined(__amd64)
 281         use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
 282 #elif defined(__i386)
 283         use_vlp = mmu.pae_hat;
 284 #endif
 285 #endif  /* __xpv */
 286         if (use_vlp) {
 287                 hat->hat_flags = HAT_VLP;
 288                 bzero(hat->hat_vlp_ptes, VLP_SIZE);
 289         }
 
 386         mutex_exit(&hat_list_lock);
 387 
 388         return (hat);
 389 }
 390 
 391 /*
 392  * The process has finished executing, but its address space (as) has not
      * been cleaned up yet.
      *
      * Marks the given hat as HAT_FREEING.  The flag is set while holding
      * hat_list_lock, and only once HAT_VICTIM is clear in hat_flags and
      * hat_unmaps has dropped to zero; until then the caller sleeps on
      * hat_list_cv.
      *
      * The caller must hold the address space lock as writer (asserted below).
 393  */
 394 /*ARGSUSED*/
 395 void
 396 hat_free_start(hat_t *hat)
 397 {
 398         ASSERT(AS_WRITE_HELD(hat->hat_as));
 399 
 400         /*
 401          * If the hat is currently a stealing victim, wait for the stealing
 402          * to finish.  Once we mark it as HAT_FREEING, htable_steal()
 403          * won't look at its pagetables anymore.
              *
              * A non-zero hat_unmaps also holds us here; both conditions are
              * re-tested after every wakeup on hat_list_cv.
 404          */
 405         mutex_enter(&hat_list_lock);
 406         while ((hat->hat_flags & HAT_VICTIM) || (hat->hat_unmaps > 0))
 407                 cv_wait(&hat_list_cv, &hat_list_lock);
 408         hat->hat_flags |= HAT_FREEING;
 409         mutex_exit(&hat_list_lock);
 410 }
 411 
 412 /*
 413  * An address space is being destroyed, so we destroy the associated hat.
 414  */
 415 void
 416 hat_free_end(hat_t *hat)
 417 {
 418         kmem_cache_t *cache;
 419 
 420         ASSERT(hat->hat_flags & HAT_FREEING);
 421 
 422         /*
 423          * must not be running on the given hat
 424          */
 425         ASSERT(CPU->cpu_current_hat != hat);
 426 
 
2447                  */
2448                 entry = htable_va2entry(vaddr, ht);
2449                 hat_pte_unmap(ht, entry, flags, old_pte, NULL, B_FALSE);
2450                 ASSERT(ht->ht_level <= mmu.max_page_level);
2451                 vaddr += LEVEL_SIZE(ht->ht_level);
2452                 contig_va = vaddr;
2453                 ++r[r_cnt - 1].rng_cnt;
2454         }
2455         if (ht)
2456                 htable_release(ht);
2457 
2458         /*
2459          * handle last range for callbacks
2460          */
2461         if (r_cnt > 0)
2462                 handle_ranges(hat, cb, r_cnt, r);
2463         XPV_ALLOW_MIGRATE();
2464 }
2465 
2466 /*
2467  * Flush the TLB for the local CPU.
2468  * Invoked from a slave CPU during panic() dumps.
      *
      * Takes no arguments and returns nothing.  Builds with __xpv defined
      * call xen_flush_tlb(); all other builds call flush_all_tlb_entries().
2469  */
2470 void
2471 hat_flush(void)
2472 {
2473 #ifdef __xpv
2474                         xen_flush_tlb();
2475 #else
2476                         flush_all_tlb_entries();
2477 #endif
2478 }
2479 
2480 /*
2481  * synchronize mapping with software data structures
2482  *
2483  * This interface is currently only used by the working set monitor
2484  * driver.
2485  */
2486 /*ARGSUSED*/
2487 void
2488 hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
2489 {
2490         uintptr_t       vaddr = (uintptr_t)addr;
2491         uintptr_t       eaddr = vaddr + len;
2492         htable_t        *ht = NULL;
2493         uint_t          entry;
2494         x86pte_t        pte;
2495         x86pte_t        save_pte;
2496         x86pte_t        new;
2497         page_t          *pp;
 
3291                  * a fault when a page is made dirty. At this point
3292                  * we need to remove write permission from all mappings
3293                  * to this page.
3294                  */
3295                 hati_page_clrwrt(pp);
3296         }
3297 }
3298 
3299 /*
3300  *      If flag is specified, returns 0 if attribute is disabled
3301  *      and non zero if enabled.  If flag specifies multiple attributes
3302  *      then returns 0 if ALL attributes are disabled.  This is an advisory
3303  *      call.
      *
      *      The result is whatever PP_GETRM(pp, flag) evaluates to; no lock
      *      is taken in this function.
3304  */
3305 uint_t
3306 hat_page_getattr(struct page *pp, uint_t flag)
3307 {
3308         return (PP_GETRM(pp, flag));
3309 }
3310 
3311 /*
3312  * common code used by hat_pageunload() and hment_steal()
      *
      * Removes the single mapping of page "pp" found at index "entry" of
      * htable "ht", unless the owning hat is already marked HAT_FREEING:
      *   - invalidates the PTE (panics if the old PTE's PFN is not
      *     pp->p_pagenum),
      *   - decrements the htable's valid count and the hat's page count,
      *   - syncs ref/mod bits to the page_t unless the PTE's software bits
      *     are PT_NOSYNC or above,
      *   - removes the hment from the page's mapping list.
      *
      * hat_unmaps is raised for the duration of the work and lowered again,
      * with a broadcast on hat_list_cv, once it is done.
      *
      * Every return path calls x86_hm_exit(pp), so the caller presumably
      * enters holding the mapping list lock taken with x86_hm_enter() --
      * confirm against the callers.
      *
      * Returns the value returned by hment_remove(), or NULL when the hat was
      * found to be HAT_FREEING and nothing was unmapped.
3313  */
3314 hment_t *
3315 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
3316 {
3317         x86pte_t old_pte;
3318         pfn_t pfn = pp->p_pagenum;
3319         hment_t *hm;
3320         hat_t *hat = ht->ht_hat;
3321 
3322         /*
3323          * There is a race between this function and the freeing of a HAT
3324          * whose owning process is exiting; process exit code ignores htable
3325          * reference counts.
3326          * If the HAT is already freeing (HAT_FREEING) no-op this function.
3327          * Otherwise increment hat_unmaps to block the hat from being free'd
3328          * until this function completes.
3329          */
3330         mutex_enter(&hat_list_lock);
3331         if (hat->hat_flags & HAT_FREEING) {
                     /* nothing is unmapped; only the locks are dropped */
3332                 mutex_exit(&hat_list_lock);
3333                 x86_hm_exit(pp);
3334                 return (NULL);
3335         }
3336         ++(hat->hat_unmaps);
3337         mutex_exit(&hat_list_lock);
3338 
3339         /*
3340          * We need to acquire a hold on the htable in order to
3341          * do the invalidate. We know the htable must exist, since
3342          * unmap's don't release the htable until after removing any
3343          * hment. Having x86_hm_enter() keeps that from proceeding.
3344          */
3345         htable_acquire(ht);
3346 
3347         /*
3348          * Invalidate the PTE and remove the hment.
              * The old PTE must have pointed at this page's PFN; anything
              * else is treated as fatal.
3349          */
3350         old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE);
3351         if (PTE2PFN(old_pte, ht->ht_level) != pfn) {
3352                 panic("x86pte_inval() failure found PTE = " FMT_PTE
3353                     " pfn being unmapped is %lx ht=0x%lx entry=0x%x",
3354                     old_pte, pfn, (uintptr_t)ht, entry);
3355         }
3356 
3357         /*
3358          * Clean up all the htable information for this mapping
3359          */
3360         ASSERT(ht->ht_valid_cnt > 0);
3361         HTABLE_DEC(ht->ht_valid_cnt);
3362         PGCNT_DEC(ht->ht_hat, ht->ht_level);
3363 
3364         /*
3365          * sync ref/mod bits to the page_t
              * (skipped when the PTE's software bits are PT_NOSYNC or higher)
3366          */
3367         if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC)
3368                 hati_sync_pte_to_page(pp, old_pte, ht->ht_level);
3369 
3370         /*
3371          * Remove the mapping list entry for this page.
3372          */
3373         hm = hment_remove(pp, ht, entry);
3374 
3375         /*
3376          * drop the mapping list lock so that we might free the hment and htable
3377          */
3378         x86_hm_exit(pp);
3379         htable_release(ht);
3380 
             /*
              * Done with the hat: lower hat_unmaps and wake any thread
              * sleeping on hat_list_cv.
              */
3381         mutex_enter(&hat_list_lock);
3382         --(hat->hat_unmaps);
3383         cv_broadcast(&hat_list_cv);
3384         mutex_exit(&hat_list_lock);
3385         return (hm);
3386 }
3387 
3388 extern int      vpm_enable;
3389 /*
3390  * Unload all translations to a page. If the page is a subpage of a large
3391  * page, the large page mappings are also removed.
3392  *
3393  * The forceflags are unused.
3394  */
3395 
3396 /*ARGSUSED*/
3397 static int
3398 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
3399 {
3400         page_t          *cur_pp = pp;
3401         hment_t         *hm;
3402         hment_t         *prev;
3403         htable_t        *ht;
3404         uint_t          entry;
 
 |