Print this page
OS-3088 need a lighter-weight page invalidation mechanism for zone memcap
OS-881 To work around OS-580, add support to only invalidate mappings from a single process


  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.

  31  */
  32 
  33 /*
  34  * VM - Hardware Address Translation management for i386 and amd64
  35  *
  36  * Implementation of the interfaces described in <common/vm/hat.h>
  37  *
  38  * Nearly all the details of how the hardware is managed should not be
  39  * visible outside this layer except for misc. machine specific functions
  40  * that work in conjunction with this code.
  41  *
  42  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  43  */
  44 
  45 #include <sys/machparam.h>
  46 #include <sys/machsystm.h>
  47 #include <sys/mman.h>
  48 #include <sys/types.h>
  49 #include <sys/systm.h>
  50 #include <sys/cpuvar.h>


3306                  * to this page.
3307                  */
3308                 hati_page_clrwrt(pp);
3309         }
3310 }
3311 
3312 /*
3313  *      If flag is specified, returns 0 if attribute is disabled
3314  *      and non zero if enabled.  If flag specifies multiple attributes
3315  *      then returns 0 if ALL attributes are disabled.  This is an advisory
3316  *      call.
3317  */
3318 uint_t
3319 hat_page_getattr(struct page *pp, uint_t flag)
3320 {
3321         return (PP_GETRM(pp, flag));
3322 }
3323 
3324 
3325 /*
3326  * common code used by hat_pageunload() and hment_steal()
3327  */
/*
 * hati_page_unmap() -- invalidate the PTE at (ht, entry) for page pp and
 * detach the corresponding hment from pp's mapping list.  Returns the
 * removed hment (caller frees it), or possibly NULL per hment_remove().
 * Called with the page's mapping-list lock held (x86_hm_enter()); this
 * routine drops that lock before returning.
 *
 * NOTE(review): original lines 3347-3361 are elided from this excerpt;
 * the code below is reproduced verbatim around that gap.  The local
 * `pfn` is not used in the visible portion -- presumably referenced in
 * the elided lines; confirm against the full source.
 */
3328 hment_t *
3329 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
3330 {
3331         x86pte_t old_pte;
3332         pfn_t pfn = pp->p_pagenum;
3333         hment_t *hm;
3334 
3335         /*
3336          * We need to acquire a hold on the htable in order to
3337          * do the invalidate. We know the htable must exist, since
3338          * unmaps don't release the htable until after removing any
3339          * hment. Having x86_hm_enter() keeps that from proceeding.
3340          */
3341         htable_acquire(ht);
3342 
3343         /*
3344          * Invalidate the PTE and remove the hment.
3345          */
3346         old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE);


3362          */
3363         if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC)
3364                 hati_sync_pte_to_page(pp, old_pte, ht->ht_level);
3365 
3366         /*
3367          * Remove the mapping list entry for this page.
3368          */
3369         hm = hment_remove(pp, ht, entry);
3370 
3371         /*
3372          * drop the mapping list lock so that we might free the
3373          * hment and htable.
3374          */
3375         x86_hm_exit(pp);
3376         htable_release(ht);
3377         return (hm);
3378 }
3379 
3380 extern int      vpm_enable;
3381 /*
3382  * Unload all translations to a page. If the page is a subpage of a large
3383  * page, the large page mappings are also removed.
3384  *
3385  * The forceflags are unused.
3386  */
3387 
3388 /*ARGSUSED*/
/*
 * hati_pageunload() -- unload all translations to page pp.  Starts at
 * page-size code pg_szcd and, via the next_size loop, also removes any
 * larger-page mappings that contain pp.  forceflag is unused.  Always
 * returns 0.
 */
3389 static int
3390 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
3391 {
3392         page_t          *cur_pp = pp;
3393         hment_t         *hm;
3394         hment_t         *prev;
3395         htable_t        *ht;
3396         uint_t          entry;
3397         level_t         level;

3398 
3399         XPV_DISALLOW_MIGRATE();
3400 
3401         /*
3402          * prevent recursion due to kmem_free()
3403          */
3404         ++curthread->t_hatdepth;
3405         ASSERT(curthread->t_hatdepth < 16);
3406 
3407 #if defined(__amd64)
3408         /*
3409          * clear the vpm ref.
3410          */
3411         if (vpm_enable) {
3412                 pp->p_vpmref = 0;
3413         }
3414 #endif
3415         /*
3416          * The loop with next_size handles pages with multiple pagesize mappings
3417          */
3418 next_size:


3419         for (;;) {
3420 
3421                 /*
3422                  * Get a mapping list entry
3423                  */
3424                 x86_hm_enter(cur_pp);
3425                 for (prev = NULL; ; prev = hm) {
3426                         hm = hment_walk(cur_pp, &ht, &entry, prev);
3427                         if (hm == NULL) {
3428                                 x86_hm_exit(cur_pp);
3429 

3430                                 /*
3431                                  * If not part of a larger page, we're done.
3432                                  */
3433                                 if (cur_pp->p_szc <= pg_szcd) {
3434                                         ASSERT(curthread->t_hatdepth > 0);
3435                                         --curthread->t_hatdepth;
3436                                         XPV_ALLOW_MIGRATE();
3437                                         return (0);
3438                                 }
3439 
3440                                 /*
3441                                  * Else check the next larger page size.
3442                                  * hat_page_demote() may decrease p_szc
3443                                  * but that's ok we'll just take an extra
3444                                  * trip to discover there are no larger
3445                                  * mappings and return.
3446                                  */
3447                                 ++pg_szcd;
3448                                 cur_pp = PP_GROUPLEADER(cur_pp, pg_szcd);
3449                                 goto next_size;
3450                         }
3451 
3452                         /*
3453                          * If this mapping size matches, remove it.
3454                          */
3455                         level = ht->ht_level;
3456                         if (level == pg_szcd)

3457                                 break;









3458                 }


3459 
3460                 /*
3461                  * Remove the mapping list entry for this page.
3462                  * Note this does the x86_hm_exit() for us.
3463                  */
3464                 hm = hati_page_unmap(cur_pp, ht, entry);
3465                 if (hm != NULL)
3466                         hment_free(hm);




3467         }
3468 }
3469 


























3470 int
3471 hat_pageunload(struct page *pp, uint_t forceflag)
3472 {
3473         ASSERT(PAGE_EXCL(pp));
3474         return (hati_pageunload(pp, 0, forceflag));
3475 }
3476 
3477 /*
3478  * Unload all large mappings to pp and reduce by 1 p_szc field of every large
3479  * page level that included pp.
3480  *
3481  * pp must be locked EXCL. Even though no other constituent pages are locked
3482  * it's legal to unload large mappings to pp because all constituent pages of
3483  * large locked mappings have to be locked SHARED.  therefore if we have EXCL
3484  * lock on one of constituent pages none of the large mappings to pp are
3485  * locked.
3486  *
3487  * Change (always decrease) p_szc field starting from the last constituent
3488  * page and ending with root constituent page so that root's pszc always shows
3489  * the area where hat_page_demote() may be active.
3490  *
3491  * This mechanism is only used for file system pages where it's not always
3492  * possible to get EXCL locks on all constituent pages to demote the size code
3493  * (as is done for anonymous or kernel large pages).
3494  */




  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 /*
  25  * Copyright (c) 2010, Intel Corporation.
  26  * All rights reserved.
  27  */
  28 /*
  29  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  30  * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
  31  * Copyright 2014 Joyent, Inc.  All rights reserved.
  32  */
  33 
  34 /*
  35  * VM - Hardware Address Translation management for i386 and amd64
  36  *
  37  * Implementation of the interfaces described in <common/vm/hat.h>
  38  *
  39  * Nearly all the details of how the hardware is managed should not be
  40  * visible outside this layer except for misc. machine specific functions
  41  * that work in conjunction with this code.
  42  *
  43  * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
  44  */
  45 
  46 #include <sys/machparam.h>
  47 #include <sys/machsystm.h>
  48 #include <sys/mman.h>
  49 #include <sys/types.h>
  50 #include <sys/systm.h>
  51 #include <sys/cpuvar.h>


3307                  * to this page.
3308                  */
3309                 hati_page_clrwrt(pp);
3310         }
3311 }
3312 
3313 /*
3314  *      If flag is specified, returns 0 if attribute is disabled
3315  *      and non zero if enabled.  If flag specifies multiple attributes
3316  *      then returns 0 if ALL attributes are disabled.  This is an advisory
3317  *      call.
3318  */
3319 uint_t
3320 hat_page_getattr(struct page *pp, uint_t flag)
3321 {
3322         return (PP_GETRM(pp, flag));
3323 }
3324 
3325 
3326 /*
3327  * common code used by hat_page_inval() and hment_steal()
3328  */
/*
 * hati_page_unmap() -- invalidate the PTE at (ht, entry) for page pp and
 * detach the corresponding hment from pp's mapping list.  Returns the
 * removed hment (caller frees it), or possibly NULL per hment_remove().
 * Entered with the page's mapping-list lock held (x86_hm_enter()); the
 * lock is dropped before returning.
 *
 * NOTE(review): original lines 3348-3362 are elided from this excerpt;
 * code below is reproduced verbatim around that gap.  The local `pfn`
 * is unused in the visible portion -- presumably referenced in the
 * elided lines; confirm against the full source.
 */
3329 hment_t *
3330 hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
3331 {
3332         x86pte_t old_pte;
3333         pfn_t pfn = pp->p_pagenum;
3334         hment_t *hm;
3335 
3336         /*
3337          * We need to acquire a hold on the htable in order to
3338          * do the invalidate. We know the htable must exist, since
3339          * unmaps don't release the htable until after removing any
3340          * hment. Having x86_hm_enter() keeps that from proceeding.
3341          */
3342         htable_acquire(ht);
3343 
3344         /*
3345          * Invalidate the PTE and remove the hment.
3346          */
3347         old_pte = x86pte_inval(ht, entry, 0, NULL, B_TRUE);


3363          */
3364         if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC)
3365                 hati_sync_pte_to_page(pp, old_pte, ht->ht_level);
3366 
3367         /*
3368          * Remove the mapping list entry for this page.
3369          */
3370         hm = hment_remove(pp, ht, entry);
3371 
3372         /*
3373          * drop the mapping list lock so that we might free the
3374          * hment and htable.
3375          */
3376         x86_hm_exit(pp);
3377         htable_release(ht);
3378         return (hm);
3379 }
3380 
3381 extern int      vpm_enable;
3382 /*
3383  * Unload translations to a page. If the page is a subpage of a large
3384  * page, the large page mappings are also removed.
3385  * If curhat is not NULL, then we only unload the translation
3386  * for the given process, otherwise all translations are unloaded.
3387  */
/*
 * hat_page_inval() -- unload translations to page pp starting at
 * page-size code pg_szcd; the next_size loop also removes larger-page
 * mappings that contain pp.  If curhat is non-NULL, only the mapping
 * belonging to that hat is removed (the lighter-weight single-process
 * invalidation of OS-881/OS-3088); otherwise all mappings are unloaded.
 *
 * NOTE(review): unlike the pre-patch hati_pageunload(), this function
 * does not adjust curthread->t_hatdepth -- that bookkeeping appears to
 * live in the hati_pageunload() wrapper; verify callers account for it.
 */
3388 void
3389 hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat)


3390 {
3391         page_t          *cur_pp = pp;
3392         hment_t         *hm;
3393         hment_t         *prev;
3394         htable_t        *ht;
3395         uint_t          entry;
3396         level_t         level;
3397         ulong_t         cnt;
3398 
3399         XPV_DISALLOW_MIGRATE();
3400 






3401 #if defined(__amd64)
3402         /*
3403          * clear the vpm ref.
3404          */
3405         if (vpm_enable) {
3406                 pp->p_vpmref = 0;
3407         }
3408 #endif
3409         /*
3410          * The loop with next_size handles pages with multiple pagesize mappings
3411          */
3412 next_size:
3413         if (curhat != NULL)
3414                 cnt = hat_page_getshare(cur_pp);
3415         for (;;) {
3416 
3417                 /*
3418                  * Get a mapping list entry
3419                  */
3420                 x86_hm_enter(cur_pp);
3421                 for (prev = NULL; ; prev = hm) {
3422                         hm = hment_walk(cur_pp, &ht, &entry, prev);
3423                         if (hm == NULL) {
3424                                 x86_hm_exit(cur_pp);
3425 
3426 curproc_done:
3427                                 /*
3428                                  * If not part of a larger page, we're done.
3429                                  */
3430                                 if (cur_pp->p_szc <= pg_szcd) {


3431                                         XPV_ALLOW_MIGRATE();
3432                                         return;
3433                                 }
3434 
3435                                 /*
3436                                  * Else check the next larger page size.
3437                                  * hat_page_demote() may decrease p_szc
3438                                  * but that's ok we'll just take an extra
3439                                  * trip to discover there are no larger
3440                                  * mappings and return.
3441                                  */
3442                                 ++pg_szcd;
3443                                 cur_pp = PP_GROUPLEADER(cur_pp, pg_szcd);
3444                                 goto next_size;
3445                         }
3446 
3447                         /*
3448                          * If this mapping size matches, remove it.
3449                          */
3450                         level = ht->ht_level;
3451                         if (level == pg_szcd) {
3452                                 if (curhat == NULL || ht->ht_hat == curhat)
3453                                         break;
3454                                 /*
3455                                  * Unloading only the given process but it's
3456                                  * not the hat for the current process. Leave
3457                                  * the entry in place. Also do a safety check
3458                                  * (cnt, seeded from hat_page_getshare()) to
3459                                  * ensure we don't get in an infinite loop.
3460                                  */
3461                                 if (cnt-- == 0) {
3462                                         x86_hm_exit(cur_pp);
3463                                         goto curproc_done;
3464                                 }
3465                         }
3466                 }
3467 
3468                 /*
3469                  * Remove the mapping list entry for this page.
3470                  * Note this does the x86_hm_exit() for us.
3471                  */
3472                 hm = hati_page_unmap(cur_pp, ht, entry);
3473                 if (hm != NULL)
3474                         hment_free(hm);
3475 
3476                 /* Perform check above for being part of a larger page. */
3477                 if (curhat != NULL)
3478                         goto curproc_done;
3479         }
3480 }
3480 
3481 /*
3482  * Unload translations to a page. If unloadflag is HAT_CURPROC_PGUNLOAD, then
3483  * we only unload the translation for the current process, otherwise all
3484  * translations are unloaded.
3485  */
3486 static int
3487 hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag)
3488 {
3489         struct hat      *curhat = NULL;
3490 
3491         /*
3492          * prevent recursion due to kmem_free()
3493          */
3494         ++curthread->t_hatdepth;
3495         ASSERT(curthread->t_hatdepth < 16);
3496 
3497         if (unloadflag == HAT_CURPROC_PGUNLOAD)
3498                 curhat = curthread->t_procp->p_as->a_hat;
3499 
3500         hat_page_inval(pp, pg_szcd, curhat);
3501 
3502         ASSERT(curthread->t_hatdepth > 0);
3503         --curthread->t_hatdepth;
3504         return (0);
3505 }
3506 
3507 int
3508 hat_pageunload(struct page *pp, uint_t unloadflag)
3509 {
3510         ASSERT(PAGE_EXCL(pp));
3511         return (hati_pageunload(pp, 0, unloadflag));
3512 }
3513 
3514 /*
3515  * Unload all large mappings to pp and reduce by 1 p_szc field of every large
3516  * page level that included pp.
3517  *
3518  * pp must be locked EXCL. Even though no other constituent pages are locked
3519  * it's legal to unload large mappings to pp because all constituent pages of
3520  * large locked mappings have to be locked SHARED.  therefore if we have EXCL
3521  * lock on one of constituent pages none of the large mappings to pp are
3522  * locked.
3523  *
3524  * Change (always decrease) p_szc field starting from the last constituent
3525  * page and ending with root constituent page so that root's pszc always shows
3526  * the area where hat_page_demote() may be active.
3527  *
3528  * This mechanism is only used for file system pages where it's not always
3529  * possible to get EXCL locks on all constituent pages to demote the size code
3530  * (as is done for anonymous or kernel large pages).
3531  */