--- old/usr/src/uts/common/vm/vm_usage.c
+++ new/usr/src/uts/common/vm/vm_usage.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2016, Joyent, Inc.
29 29 */
30 30
31 31 /*
32 32 * vm_usage
33 33 *
34 34 * This file implements the getvmusage() private system call.
35 35 * getvmusage() counts the resident memory pages and the swap
36 36 * reserved by the specified process collective. A "process collective" is
37 37 * the set of processes owned by a particular zone, project, task, or user.
38 38 *
39 39 * rss and swap are counted so that for a given process collective, a page is
40 40 * only counted once. For example, this means that if multiple processes in
41 41 * the same project map the same page, then the project will only be charged
42 42 * once for that page. On the other hand, if two processes in different
43 43 * projects map the same page, then both projects will be charged
44 44 * for the page.
45 45 *
46 46 * The vm_getusage() calculation is implemented so that the first thread
47 47 * performs the rss/swap counting. Other callers will wait for that thread to
48 48 * finish, copying the results. This enables multiple rcapds and prstats to
49 49 * consume data from the same calculation. The results are also cached so that
50 50 * a caller interested in recent results can just copy them instead of starting
51 51 * a new calculation. The caller passes the maximum age (in seconds) of the
52 52 * data. If the cached data is young enough, the cache is copied, otherwise,
53 53 * a new calculation is executed and the cache is replaced with the new
54 54 * data.
55 55 *
56 56 * The rss calculation for each process collective is as follows:
57 57 *
58 58 * - Inspect flags, determine if counting rss for zones, projects, tasks,
59 59 * and/or users.
60 60 * - For each proc:
61 61 * - Figure out proc's collectives (zone, project, task, and/or user).
62 62 * - For each seg in proc's address space:
63 63 * - If seg is private:
64 64 * - Lookup anons in the amp.
65 65 * - For incore pages not previously visited for each of
66 66 * the proc's collectives, add incore pagesize to each
67 67 * collective.
68 68 * Anons with a refcnt of 1 can be assumed not to have
69 69 * been previously visited.
70 70 * - For address ranges without anons in the amp:
71 71 * - Lookup pages in underlying vnode.
72 72 * - For incore pages not previously visited for
73 73 * each of the proc's collectives, add incore
74 74 * pagesize to each collective.
75 75 * - If seg is shared:
76 76 * - Lookup pages in the shared amp or vnode.
77 77 * - For incore pages not previously visited for each of
78 78 * the proc's collectives, add incore pagesize to each
79 79 * collective.
80 80 *
81 81 * Swap is reserved by private segments, and shared anonymous segments.
82 82 * The only shared anon segments which do not reserve swap are ISM segments
83 83 * and schedctl segments, both of which can be identified by having
84 84 * amp->swresv == 0.
85 85 *
86 86 * The swap calculation for each collective is as follows:
87 87 *
88 88 * - Inspect flags, determine if counting swap for zones, projects, tasks,
89 89 * and/or users.
90 90 * - For each proc:
91 91 * - Figure out proc's collectives (zone, project, task, and/or user).
92 92 * - For each seg in proc's address space:
93 93 * - If seg is private:
94 94 * - Add svd->swresv pages to swap count for each of the
95 95 * proc's collectives.
96 96 * - If seg is anon, shared, and amp->swresv != 0
97 97 * - For address ranges in amp not previously visited for
98 98 * each of the proc's collectives, add size of address
99 99 * range to the swap count for each collective.
100 100 *
101 101 * These two calculations are done simultaneously, with most of the work
102 102 * being done in vmu_calculate_seg(). The results of the calculation are
103 103 * copied into "vmu_data.vmu_cache_results".
104 104 *
105 105 * To perform the calculation, various things are tracked and cached:
106 106 *
107 107 * - incore/not-incore page ranges for all vnodes.
108 108 * (vmu_data.vmu_all_vnodes_hash)
109 109 * This eliminates looking up the same page more than once.
110 110 *
111 111 * - incore/not-incore page ranges for all shared amps.
112 112 * (vmu_data.vmu_all_amps_hash)
113 113 * This eliminates looking up the same page more than once.
114 114 *
115 115 * - visited page ranges for each collective.
116 116 * - per vnode (entity->vme_vnode_hash)
117 117 * - per shared amp (entity->vme_amp_hash)
118 118 * For accurate counting of map-shared and COW-shared pages.
119 119 *
120 120 * - visited private anons (refcnt > 1) for each collective.
121 - * (entity->vme_anon)
121 + * (entity->vme_anon_hash)
122 122 * For accurate counting of COW-shared pages.
123 123 *
124 124 * The common accounting structure is the vmu_entity_t, which represents
125 125 * collectives:
126 126 *
127 127 * - A zone.
128 128 * - A project, task, or user within a zone.
129 129 * - The entire system (vmu_data.vmu_system).
130 130 * - Each collapsed (col) project and user. This means a given projid or
131 131 * uid, regardless of which zone the process is in. For instance,
132 132 * project 0 in the global zone and project 0 in a non global zone are
133 133 * the same collapsed project.
134 134 *
135 135 * Each entity structure tracks which pages have already been visited for
136 136 * that entity (via previously inspected processes) so that these pages are
137 137 * not double counted.
138 138 */
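
As an illustration of the calculation and caching contract described above, here is a
minimal userland sketch of how a consumer such as prstat or rcapd might call
getvmusage(2), the public face of this code. It is not part of the patch under review;
the 5-second maximum age, the initial capacity of 1024 results, and the bare-bones
error handling are arbitrary choices for the example, and VMUSAGE_ALL_ZONES generally
requires sufficient privilege.

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/vm_usage.h>

    int
    main(void)
    {
        size_t nres = 1024;             /* arbitrary initial capacity */
        vmusage_t *buf = calloc(nres, sizeof (vmusage_t));

        if (buf == NULL)
            return (1);

        /*
         * Ask for per-zone usage no more than 5 seconds old; if the
         * kernel's cached results are younger than that, they are
         * copied instead of triggering a new calculation.
         */
        if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0) {
            perror("getvmusage");
            free(buf);
            return (1);
        }

        for (size_t i = 0; i < nres; i++) {
            (void) printf("zone %d: rss %llu bytes, swap %llu bytes\n",
                (int)buf[i].vmu_zoneid,
                (unsigned long long)buf[i].vmu_rss_all,
                (unsigned long long)buf[i].vmu_swap_all);
        }
        free(buf);
        return (0);
    }
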
139 139
140 140 #include <sys/errno.h>
141 141 #include <sys/types.h>
142 142 #include <sys/zone.h>
143 143 #include <sys/proc.h>
144 144 #include <sys/project.h>
145 145 #include <sys/task.h>
146 146 #include <sys/thread.h>
147 147 #include <sys/time.h>
148 148 #include <sys/mman.h>
149 149 #include <sys/modhash.h>
150 150 #include <sys/modhash_impl.h>
151 151 #include <sys/shm.h>
152 152 #include <sys/swap.h>
153 153 #include <sys/synch.h>
154 154 #include <sys/systm.h>
155 155 #include <sys/var.h>
156 156 #include <sys/vm_usage.h>
157 157 #include <sys/zone.h>
158 158 #include <sys/sunddi.h>
159 -#include <sys/sysmacros.h>
160 159 #include <sys/avl.h>
161 160 #include <vm/anon.h>
162 161 #include <vm/as.h>
163 162 #include <vm/seg_vn.h>
164 163 #include <vm/seg_spt.h>
165 164
166 165 #define VMUSAGE_HASH_SIZE 512
167 166
168 167 #define VMUSAGE_TYPE_VNODE 1
169 168 #define VMUSAGE_TYPE_AMP 2
170 169 #define VMUSAGE_TYPE_ANON 3
171 170
172 171 #define VMUSAGE_BOUND_UNKNOWN 0
173 172 #define VMUSAGE_BOUND_INCORE 1
174 173 #define VMUSAGE_BOUND_NOT_INCORE 2
175 174
176 175 #define ISWITHIN(node, addr) ((node)->vmb_start <= addr && \
177 176 (node)->vmb_end >= addr ? 1 : 0)
178 177
179 178 /*
180 179 * bounds for vnodes and shared amps
181 180 * Each bound is either entirely incore, entirely not in core, or
182 181 * entirely unknown. Bounds are stored in an AVL tree sorted by the start
183 182 * member when in use; otherwise (free or temporary lists) they are strung
184 183 * together off of vmb_next.
185 184 */
186 185 typedef struct vmu_bound {
187 186 avl_node_t vmb_node;
188 187 struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
189 188 pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
190 189 pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
191 190 char vmb_type; /* One of VMUSAGE_BOUND_* */
192 191 } vmu_bound_t;
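
The bound representation is easiest to picture with a small standalone sketch, shown
below; it is illustrative only and not part of the patch under review. It uses the same
libavl interfaces the kernel code relies on, with a trimmed-down demo_bound_t standing
in for vmu_bound_t (the demo_ names are invented for the example). On illumos it builds
with cc demo.c -lavl.

    #include <stdio.h>
    #include <stddef.h>
    #include <sys/avl.h>

    /* Trimmed-down stand-in for vmu_bound_t; the demo_ names are invented. */
    typedef struct demo_bound {
        avl_node_t      db_node;
        unsigned long   db_start;   /* first page offset in the bound */
        unsigned long   db_end;     /* last page offset in the bound */
        int             db_incore;  /* 1 = incore, 0 = not incore */
    } demo_bound_t;

    /* Same ordering rule as bounds_cmp(): sort by starting page offset. */
    static int
    demo_cmp(const void *a, const void *b)
    {
        const demo_bound_t *l = a, *r = b;

        if (l->db_start == r->db_start)
            return (0);
        return (l->db_start < r->db_start ? -1 : 1);
    }

    int
    main(void)
    {
        avl_tree_t tree;
        demo_bound_t *b;
        void *cookie = NULL;
        /* Three bounds tiling page offsets 0-9 of some imaginary object. */
        demo_bound_t bounds[3] = {
            { .db_start = 0, .db_end = 3, .db_incore = 1 },
            { .db_start = 4, .db_end = 7, .db_incore = 0 },
            { .db_start = 8, .db_end = 9, .db_incore = 1 }
        };

        avl_create(&tree, demo_cmp, sizeof (demo_bound_t),
            offsetof(demo_bound_t, db_node));
        for (int i = 0; i < 3; i++)
            avl_add(&tree, &bounds[i]);

        /* An in-order walk visits the bounds by ascending start offset. */
        for (b = avl_first(&tree); b != NULL; b = AVL_NEXT(&tree, b)) {
            (void) printf("[%lu, %lu] %s\n", b->db_start, b->db_end,
                b->db_incore ? "incore" : "not incore");
        }

        /* Unlink the nodes before destroying the tree. */
        while (avl_destroy_nodes(&tree, &cookie) != NULL)
            ;
        avl_destroy(&tree);
        return (0);
    }
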
193 192
194 193 /*
195 194 * hash of visited objects (vnodes or shared amps)
196 195 * key is address of vnode or amp. Bounds lists known incore/non-incore
197 196 * bounds for vnode/amp.
198 197 */
199 198 typedef struct vmu_object {
200 199 struct vmu_object *vmo_next; /* free list */
201 200 caddr_t vmo_key;
202 201 short vmo_type;
203 202 avl_tree_t vmo_bounds;
204 203 } vmu_object_t;
205 204
206 205 /*
207 - * Node for tree of visited COW anons.
208 - */
209 -typedef struct vmu_anon {
210 - avl_node_t vma_node;
211 - uintptr_t vma_addr;
212 -} vmu_anon_t;
213 -
214 -/*
215 206 * Entity by which to count results.
216 207 *
217 208 * The entity structure keeps the current rss/swap counts for each entity
218 209 * (zone, project, etc), and hashes of vm structures that have already
219 210 * been visited for the entity.
220 211 *
221 212 * vme_next: links the list of all entities currently being counted by
222 213 * vmu_calculate().
223 214 *
224 215 * vme_next_calc: links the list of entities related to the current process
225 216 * being counted by vmu_calculate_proc().
226 217 *
227 218 * vmu_calculate_proc() walks all processes. For each process, it makes a
228 219 * list of the entities related to that process using vme_next_calc. This
229 220 * list changes each time vmu_calculate_proc() is called.
230 221 *
231 222 */
232 223 typedef struct vmu_entity {
233 224 struct vmu_entity *vme_next;
234 225 struct vmu_entity *vme_next_calc;
235 226 mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
236 227 mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
237 - avl_tree_t vme_anon; /* COW anons visited for entity */
228 + mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
238 229 vmusage_t vme_result; /* identifies entity and results */
239 230 } vmu_entity_t;
240 231
241 232 /*
242 233 * Hash of entities visited within a zone, and an entity for the zone
243 234 * itself.
244 235 */
245 236 typedef struct vmu_zone {
246 237 struct vmu_zone *vmz_next; /* free list */
247 238 id_t vmz_id;
248 239 vmu_entity_t *vmz_zone;
249 240 mod_hash_t *vmz_projects_hash;
250 241 mod_hash_t *vmz_tasks_hash;
251 242 mod_hash_t *vmz_rusers_hash;
252 243 mod_hash_t *vmz_eusers_hash;
253 244 } vmu_zone_t;
254 245
255 246 /*
256 247 * Cache of results from last calculation
257 248 */
258 249 typedef struct vmu_cache {
259 250 vmusage_t *vmc_results; /* Results from last call to */
260 251 /* vm_getusage(). */
261 252 uint64_t vmc_nresults; /* Count of cached results */
262 253 uint64_t vmc_refcnt; /* refcnt for free */
263 254 uint_t vmc_flags; /* Flags for vm_getusage() */
264 255 hrtime_t vmc_timestamp; /* when cache was created */
265 256 } vmu_cache_t;
266 257
267 258 /*
268 259 * top level rss info for the system
269 260 */
270 261 typedef struct vmu_data {
271 262 kmutex_t vmu_lock; /* Protects vmu_data */
272 263 kcondvar_t vmu_cv; /* Used to signal threads */
273 264 /* Waiting for */
274 265 /* Rss_calc_thread to finish */
275 266 vmu_entity_t *vmu_system; /* Entity for tracking */
276 267 /* rss/swap for all processes */
277 268 /* in all zones */
278 269 mod_hash_t *vmu_zones_hash; /* Zones visited */
279 270 mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
280 271 mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
281 272 mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
282 273 /* to implement VMUSAGE_COL_* */
283 274 /* flags, which aggregate by */
284 275 /* project or user regardless */
285 276 /* of zoneid. */
286 277 mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
287 278 /* to track incore/not-incore */
288 279 mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
289 280 /* amps to track incore/not- */
290 281 /* incore */
291 282 vmu_entity_t *vmu_entities; /* Linked list of entities */
292 283 size_t vmu_nentities; /* Count of entities in list */
293 284 vmu_cache_t *vmu_cache; /* Cached results */
294 285 kthread_t *vmu_calc_thread; /* NULL, or thread running */
295 286 /* vmu_calculate() */
296 287 uint_t vmu_calc_flags; /* Flags being used by */
297 288 /* currently running calc */
298 289 /* thread */
299 290 uint_t vmu_pending_flags; /* Flags of vm_getusage() */
300 291 /* threads waiting for */
301 292 /* calc thread to finish */
302 293 uint_t vmu_pending_waiters; /* Number of threads waiting */
303 294 /* for calc thread */
304 295 vmu_bound_t *vmu_free_bounds;
305 296 vmu_object_t *vmu_free_objects;
306 297 vmu_entity_t *vmu_free_entities;
307 298 vmu_zone_t *vmu_free_zones;
308 299 } vmu_data_t;
309 300
310 301 extern struct as kas;
311 302 extern proc_t *practive;
312 303 extern zone_t *global_zone;
313 304 extern struct seg_ops segvn_ops;
314 305 extern struct seg_ops segspt_shmops;
315 306
316 307 static vmu_data_t vmu_data;
317 308 static kmem_cache_t *vmu_bound_cache;
318 309 static kmem_cache_t *vmu_object_cache;
319 310
320 311 /*
321 312 * Comparison routine for AVL tree. We base our comparison on vmb_start.
322 313 */
323 314 static int
324 315 bounds_cmp(const void *bnd1, const void *bnd2)
325 316 {
326 317 const vmu_bound_t *bound1 = bnd1;
327 318 const vmu_bound_t *bound2 = bnd2;
328 319
329 320 if (bound1->vmb_start == bound2->vmb_start) {
330 321 return (0);
331 322 }
332 323 if (bound1->vmb_start < bound2->vmb_start) {
333 324 return (-1);
334 325 }
335 326
336 327 return (1);
337 328 }
338 329
339 330 /*
340 - * Comparison routine for our AVL tree of anon structures.
341 - */
342 -static int
343 -vmu_anon_cmp(const void *lhs, const void *rhs)
344 -{
345 - const vmu_anon_t *l = lhs, *r = rhs;
346 -
347 - if (l->vma_addr == r->vma_addr)
348 - return (0);
349 -
350 - if (l->vma_addr < r->vma_addr)
351 - return (-1);
352 -
353 - return (1);
354 -}
355 -
356 -/*
357 331 * Save a bound on the free list.
358 332 */
359 333 static void
360 334 vmu_free_bound(vmu_bound_t *bound)
361 335 {
362 336 bound->vmb_next = vmu_data.vmu_free_bounds;
363 337 bound->vmb_start = 0;
364 338 bound->vmb_end = 0;
365 339 bound->vmb_type = 0;
366 340 vmu_data.vmu_free_bounds = bound;
367 341 }
368 342
369 343 /*
370 344 * Free an object, and all visited bound info.
371 345 */
372 346 static void
373 347 vmu_free_object(mod_hash_val_t val)
374 348 {
375 349 vmu_object_t *obj = (vmu_object_t *)val;
376 350 avl_tree_t *tree = &(obj->vmo_bounds);
377 351 vmu_bound_t *bound;
378 352 void *cookie = NULL;
379 353
380 354 while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
381 355 vmu_free_bound(bound);
382 356 avl_destroy(tree);
383 357
384 358 obj->vmo_type = 0;
385 359 obj->vmo_next = vmu_data.vmu_free_objects;
386 360 vmu_data.vmu_free_objects = obj;
387 361 }
388 362
389 363 /*
390 364 * Free an entity, and hashes of visited objects for that entity.
391 365 */
392 366 static void
393 367 vmu_free_entity(mod_hash_val_t val)
394 368 {
395 369 vmu_entity_t *entity = (vmu_entity_t *)val;
396 - vmu_anon_t *anon;
397 - void *cookie = NULL;
398 370
399 371 if (entity->vme_vnode_hash != NULL)
400 372 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
401 373 if (entity->vme_amp_hash != NULL)
402 374 i_mod_hash_clear_nosync(entity->vme_amp_hash);
375 + if (entity->vme_anon_hash != NULL)
376 + i_mod_hash_clear_nosync(entity->vme_anon_hash);
403 377
404 - while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL)
405 - kmem_free(anon, sizeof (vmu_anon_t));
406 -
407 - avl_destroy(&entity->vme_anon);
408 -
409 378 entity->vme_next = vmu_data.vmu_free_entities;
410 379 vmu_data.vmu_free_entities = entity;
411 380 }
412 381
413 382 /*
414 383 * Free zone entity, and all hashes of entities inside that zone,
415 384 * which are projects, tasks, and users.
416 385 */
417 386 static void
418 387 vmu_free_zone(mod_hash_val_t val)
419 388 {
420 389 vmu_zone_t *zone = (vmu_zone_t *)val;
421 390
422 391 if (zone->vmz_zone != NULL) {
423 392 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
424 393 zone->vmz_zone = NULL;
425 394 }
426 395 if (zone->vmz_projects_hash != NULL)
427 396 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
428 397 if (zone->vmz_tasks_hash != NULL)
429 398 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
430 399 if (zone->vmz_rusers_hash != NULL)
431 400 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
432 401 if (zone->vmz_eusers_hash != NULL)
433 402 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
434 403 zone->vmz_next = vmu_data.vmu_free_zones;
435 404 vmu_data.vmu_free_zones = zone;
436 405 }
437 406
438 407 /*
439 408 * Initialize synchronization primitives and hashes for system-wide tracking
440 409 * of visited vnodes and shared amps. Initialize results cache.
441 410 */
442 411 void
443 412 vm_usage_init()
444 413 {
445 414 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
446 415 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
447 416
448 417 vmu_data.vmu_system = NULL;
449 418 vmu_data.vmu_zones_hash = NULL;
450 419 vmu_data.vmu_projects_col_hash = NULL;
451 420 vmu_data.vmu_rusers_col_hash = NULL;
452 421 vmu_data.vmu_eusers_col_hash = NULL;
453 422
454 423 vmu_data.vmu_free_bounds = NULL;
455 424 vmu_data.vmu_free_objects = NULL;
456 425 vmu_data.vmu_free_entities = NULL;
457 426 vmu_data.vmu_free_zones = NULL;
458 427
459 428 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
460 429 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
461 430 sizeof (vnode_t));
462 431 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
463 432 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
464 433 sizeof (struct anon_map));
465 434 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
466 435 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
467 436 vmu_free_entity);
468 437 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
469 438 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
470 439 vmu_free_entity);
471 440 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
472 441 "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
473 442 vmu_free_entity);
474 443 vmu_data.vmu_zones_hash = mod_hash_create_idhash(
475 444 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
476 445
477 446 vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
478 447 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
479 448 vmu_object_cache = kmem_cache_create("vmu_object_cache",
480 449 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
481 450
482 451 vmu_data.vmu_entities = NULL;
483 452 vmu_data.vmu_nentities = 0;
484 453
485 454 vmu_data.vmu_cache = NULL;
486 455 vmu_data.vmu_calc_thread = NULL;
487 456 vmu_data.vmu_calc_flags = 0;
488 457 vmu_data.vmu_pending_flags = 0;
489 458 vmu_data.vmu_pending_waiters = 0;
490 459 }
491 460
492 461 /*
493 462 * Allocate hashes for tracking vm objects visited for an entity.
494 463 * Update list of entities.
495 464 */
496 465 static vmu_entity_t *
497 466 vmu_alloc_entity(id_t id, int type, id_t zoneid)
498 467 {
499 468 vmu_entity_t *entity;
500 469
501 470 if (vmu_data.vmu_free_entities != NULL) {
502 471 entity = vmu_data.vmu_free_entities;
503 472 vmu_data.vmu_free_entities =
504 473 vmu_data.vmu_free_entities->vme_next;
505 474 bzero(&entity->vme_result, sizeof (vmusage_t));
506 475 } else {
507 476 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
508 477 }
509 478 entity->vme_result.vmu_id = id;
510 479 entity->vme_result.vmu_zoneid = zoneid;
511 480 entity->vme_result.vmu_type = type;
512 481
513 482 if (entity->vme_vnode_hash == NULL)
514 483 entity->vme_vnode_hash = mod_hash_create_ptrhash(
515 484 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
516 485 sizeof (vnode_t));
517 486
518 487 if (entity->vme_amp_hash == NULL)
519 488 entity->vme_amp_hash = mod_hash_create_ptrhash(
520 489 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
521 490 sizeof (struct anon_map));
522 491
523 - VERIFY(avl_first(&entity->vme_anon) == NULL);
492 + if (entity->vme_anon_hash == NULL)
493 + entity->vme_anon_hash = mod_hash_create_ptrhash(
494 + "vmusage anon hash", VMUSAGE_HASH_SIZE,
495 + mod_hash_null_valdtor, sizeof (struct anon));
524 496
525 - avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon),
526 - offsetof(struct vmu_anon, vma_node));
527 -
528 497 entity->vme_next = vmu_data.vmu_entities;
529 498 vmu_data.vmu_entities = entity;
530 499 vmu_data.vmu_nentities++;
531 500
532 501 return (entity);
533 502 }
534 503
535 504 /*
536 505 * Allocate a zone entity, and hashes for tracking visited vm objects
537 506 * for projects, tasks, and users within that zone.
538 507 */
539 508 static vmu_zone_t *
540 509 vmu_alloc_zone(id_t id)
541 510 {
542 511 vmu_zone_t *zone;
543 512
544 513 if (vmu_data.vmu_free_zones != NULL) {
545 514 zone = vmu_data.vmu_free_zones;
546 515 vmu_data.vmu_free_zones =
547 516 vmu_data.vmu_free_zones->vmz_next;
548 517 zone->vmz_next = NULL;
549 518 zone->vmz_zone = NULL;
550 519 } else {
551 520 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
552 521 }
553 522
554 523 zone->vmz_id = id;
555 524
556 525 if ((vmu_data.vmu_calc_flags &
557 526 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0)
558 527 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
559 528
560 529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
561 530 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
562 531 zone->vmz_projects_hash = mod_hash_create_idhash(
563 532 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
564 533
565 534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
566 535 != 0 && zone->vmz_tasks_hash == NULL)
567 536 zone->vmz_tasks_hash = mod_hash_create_idhash(
568 537 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
569 538
570 539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
571 540 != 0 && zone->vmz_rusers_hash == NULL)
572 541 zone->vmz_rusers_hash = mod_hash_create_idhash(
573 542 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
574 543
575 544 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
576 545 != 0 && zone->vmz_eusers_hash == NULL)
577 546 zone->vmz_eusers_hash = mod_hash_create_idhash(
578 547 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
579 548
580 549 return (zone);
581 550 }
582 551
583 552 /*
584 553 * Allocate a structure for tracking visited bounds for a vm object.
585 554 */
586 555 static vmu_object_t *
587 556 vmu_alloc_object(caddr_t key, int type)
588 557 {
589 558 vmu_object_t *object;
590 559
591 560 if (vmu_data.vmu_free_objects != NULL) {
592 561 object = vmu_data.vmu_free_objects;
593 562 vmu_data.vmu_free_objects =
594 563 vmu_data.vmu_free_objects->vmo_next;
595 564 } else {
596 565 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
597 566 }
598 567
599 568 object->vmo_next = NULL;
600 569 object->vmo_key = key;
601 570 object->vmo_type = type;
602 571 avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
603 572
604 573 return (object);
605 574 }
606 575
607 576 /*
608 577 * Allocate and return a bound structure.
609 578 */
610 579 static vmu_bound_t *
611 580 vmu_alloc_bound()
612 581 {
613 582 vmu_bound_t *bound;
614 583
615 584 if (vmu_data.vmu_free_bounds != NULL) {
616 585 bound = vmu_data.vmu_free_bounds;
617 586 vmu_data.vmu_free_bounds =
618 587 vmu_data.vmu_free_bounds->vmb_next;
619 588 } else {
620 589 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
621 590 }
622 591
623 592 bound->vmb_next = NULL;
624 593 bound->vmb_start = 0;
625 594 bound->vmb_end = 0;
626 595 bound->vmb_type = 0;
627 596 return (bound);
628 597 }
629 598
630 599 /*
631 600 * vmu_find_insert_* functions implement hash lookup or allocate and
632 601 * insert operations.
633 602 */
634 603 static vmu_object_t *
635 604 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
636 605 {
637 606 int ret;
638 607 vmu_object_t *object;
639 608
640 609 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
641 610 (mod_hash_val_t *)&object);
642 611 if (ret != 0) {
643 612 object = vmu_alloc_object(key, type);
644 613 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
645 614 (mod_hash_val_t)object, (mod_hash_hndl_t)0);
646 615 ASSERT(ret == 0);
647 616 }
648 617 return (object);
649 618 }
650 619
651 620 static int
652 -vmu_find_insert_anon(vmu_entity_t *entity, void *key)
621 +vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
653 622 {
654 - vmu_anon_t anon, *ap;
623 + int ret;
624 + caddr_t val;
655 625
656 - anon.vma_addr = (uintptr_t)key;
626 + ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
627 + (mod_hash_val_t *)&val);
657 628
658 - if (avl_find(&entity->vme_anon, &anon, NULL) != NULL)
629 + if (ret == 0)
659 630 return (0);
660 631
661 - ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP);
662 - ap->vma_addr = (uintptr_t)key;
632 + ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
633 + (mod_hash_val_t)key, (mod_hash_hndl_t)0);
663 634
664 - avl_add(&entity->vme_anon, ap);
635 + ASSERT(ret == 0);
665 636
666 637 return (1);
667 638 }
668 639
669 640 static vmu_entity_t *
670 641 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
671 642 {
672 643 int ret;
673 644 vmu_entity_t *entity;
674 645
675 646 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
676 647 (mod_hash_val_t *)&entity);
677 648 if (ret != 0) {
678 649 entity = vmu_alloc_entity(id, type, zoneid);
679 650 ret = i_mod_hash_insert_nosync(hash,
680 651 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
681 652 (mod_hash_hndl_t)0);
682 653 ASSERT(ret == 0);
683 654 }
684 655 return (entity);
685 656 }
686 657
687 658
688 659
689 660
690 661 /*
691 662 * Returns list of object bounds between start and end. New bounds inserted
692 663 * by this call are given type.
693 664 *
694 665 * Returns the number of pages covered if new bounds are created. Returns 0
695 666 * if region between start/end consists of all existing bounds.
696 667 */
697 668 static pgcnt_t
698 669 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
699 670 end, char type, vmu_bound_t **first, vmu_bound_t **last)
700 671 {
701 672 avl_tree_t *tree = &(ro->vmo_bounds);
702 673 avl_index_t where;
703 674 vmu_bound_t *walker, *tmp;
704 675 pgcnt_t ret = 0;
705 676
706 677 ASSERT(start <= end);
707 678
708 679 *first = *last = NULL;
709 680
710 681 tmp = vmu_alloc_bound();
711 682 tmp->vmb_start = start;
712 683 tmp->vmb_type = type;
713 684
714 685 /* Hopelessly optimistic case. */
715 686 if (walker = avl_find(tree, tmp, &where)) {
716 687 /* We got lucky. */
717 688 vmu_free_bound(tmp);
718 689 *first = walker;
719 690 }
720 691
721 692 if (walker == NULL) {
722 693 /* Is start in the previous node? */
723 694 walker = avl_nearest(tree, where, AVL_BEFORE);
724 695 if (walker != NULL) {
725 696 if (ISWITHIN(walker, start)) {
726 697 /* We found start. */
727 698 vmu_free_bound(tmp);
728 699 *first = walker;
729 700 }
730 701 }
731 702 }
732 703
733 704 /*
734 705 * At this point, if *first is still NULL, then we
735 706 * didn't get a direct hit and start isn't covered
736 707 * by the previous node. We know that the next node
737 708 * must have a greater start value than we require
738 709 * because avl_find tells us where the AVL routines would
739 710 * insert our new node. We have some gap between the
740 711 * start we want and the next node.
741 712 */
742 713 if (*first == NULL) {
743 714 walker = avl_nearest(tree, where, AVL_AFTER);
744 715 if (walker != NULL && walker->vmb_start <= end) {
745 716 /* Fill the gap. */
746 717 tmp->vmb_end = walker->vmb_start - 1;
747 718 *first = tmp;
748 719 } else {
749 720 /* We have a gap over [start, end]. */
750 721 tmp->vmb_end = end;
751 722 *first = *last = tmp;
752 723 }
753 724 ret += tmp->vmb_end - tmp->vmb_start + 1;
754 725 avl_insert(tree, tmp, where);
755 726 }
756 727
757 728 ASSERT(*first != NULL);
758 729
759 730 if (*last != NULL) {
760 731 /* We're done. */
761 732 return (ret);
762 733 }
763 734
764 735 /*
765 736 * If we are here we still need to set *last and
766 737 * that may involve filling in some gaps.
767 738 */
768 739 *last = *first;
769 740 for (;;) {
770 741 if (ISWITHIN(*last, end)) {
771 742 /* We're done. */
772 743 break;
773 744 }
774 745 walker = AVL_NEXT(tree, *last);
775 746 if (walker == NULL || walker->vmb_start > end) {
776 747 /* Bottom or mid tree with gap. */
777 748 tmp = vmu_alloc_bound();
778 749 tmp->vmb_start = (*last)->vmb_end + 1;
779 750 tmp->vmb_end = end;
780 751 tmp->vmb_type = type;
781 752 ret += tmp->vmb_end - tmp->vmb_start + 1;
782 753 avl_insert_here(tree, tmp, *last, AVL_AFTER);
783 754 *last = tmp;
784 755 break;
785 756 } else {
786 757 if ((*last)->vmb_end + 1 != walker->vmb_start) {
787 758 /* Non-contiguous. */
788 759 tmp = vmu_alloc_bound();
789 760 tmp->vmb_start = (*last)->vmb_end + 1;
790 761 tmp->vmb_end = walker->vmb_start - 1;
791 762 tmp->vmb_type = type;
792 763 ret += tmp->vmb_end - tmp->vmb_start + 1;
793 764 avl_insert_here(tree, tmp, *last, AVL_AFTER);
794 765 *last = tmp;
795 766 } else {
796 767 *last = walker;
797 768 }
798 769 }
799 770 }
800 771
801 772 return (ret);
802 773 }
803 774
804 775 /*
805 776 * vmu_update_bounds()
806 777 *
807 778 * tree: avl_tree in which first and last hang.
808 779 *
809 780 * first, last: list of continuous bounds, of which zero or more are of
810 781 * type VMUSAGE_BOUND_UNKNOWN.
811 782 *
812 783 * new_tree: avl_tree in which new_first and new_last hang.
813 784 *
814 785 * new_first, new_last: list of continuous bounds, of which none are of
815 786 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
816 787 * update the types of bounds in (first,last) with
817 788 * type VMUSAGE_BOUND_UNKNOWN.
818 789 *
819 790 * For the list of bounds (first,last), this function updates any bounds
820 791 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
821 792 * the list (new_first, new_last).
822 793 *
823 794 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
824 795 * (new_first, new_last), it will be split into multiple bounds.
825 796 *
826 797 * Return value:
827 798 * The number of pages in the list of bounds (first,last) that were of
828 799 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
829 800 * VMUSAGE_BOUND_INCORE.
830 801 *
831 802 */
832 803 static pgcnt_t
833 804 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
834 805 avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
835 806 {
836 807 vmu_bound_t *next, *new_next, *tmp;
837 808 pgcnt_t rss = 0;
838 809
839 810 next = *first;
840 811 new_next = new_first;
841 812
842 813 /*
843 814 * Verify first and last bound are covered by new bounds if they
844 815 * have unknown type.
845 816 */
846 817 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
847 818 (*first)->vmb_start >= new_first->vmb_start);
848 819 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
849 820 (*last)->vmb_end <= new_last->vmb_end);
850 821 for (;;) {
851 822 /* If bound already has type, proceed to next bound. */
852 823 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
853 824 if (next == *last)
854 825 break;
855 826 next = AVL_NEXT(tree, next);
856 827 continue;
857 828 }
858 829 while (new_next->vmb_end < next->vmb_start)
859 830 new_next = AVL_NEXT(new_tree, new_next);
860 831 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
861 832 next->vmb_type = new_next->vmb_type;
862 833 if (new_next->vmb_end < next->vmb_end) {
863 834 /* need to split bound */
864 835 tmp = vmu_alloc_bound();
865 836 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
866 837 tmp->vmb_start = new_next->vmb_end + 1;
867 838 tmp->vmb_end = next->vmb_end;
868 839 avl_insert_here(tree, tmp, next, AVL_AFTER);
869 840 next->vmb_end = new_next->vmb_end;
870 841 if (*last == next)
871 842 *last = tmp;
872 843 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
873 844 rss += next->vmb_end - next->vmb_start + 1;
874 845 next = tmp;
875 846 } else {
876 847 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
877 848 rss += next->vmb_end - next->vmb_start + 1;
878 849 if (next == *last)
879 850 break;
880 851 next = AVL_NEXT(tree, next);
881 852 }
882 853 }
883 854 return (rss);
884 855 }
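
A brief worked example of the update rule, with made-up page offsets (illustrative
only, not part of the patch): suppose (first, last) is the single bound [10, 19] of
type UNKNOWN, while (new_first, new_last) holds [10, 14] INCORE followed by [15, 19]
NOT_INCORE. The first pass sets the bound's type to INCORE, splits off a new UNKNOWN
bound [15, 19] because the known bound ends early, and counts 5 incore pages; the next
pass resolves the split-off bound to NOT_INCORE and adds nothing, so the function
returns 5.
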
885 856
886 857 /*
887 858 * Merges adjacent bounds with same type between first and last bound.
888 859 * After merge, last pointer may point to a different bound, as (incoming)
889 860 * last bound may have been merged away.
890 861 */
891 862 static void
892 863 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
893 864 {
894 865 vmu_bound_t *current;
895 866 vmu_bound_t *next;
896 867
897 868 ASSERT(tree != NULL);
898 869 ASSERT(*first != NULL);
899 870 ASSERT(*last != NULL);
900 871
901 872 current = *first;
902 873 while (current != *last) {
903 874 next = AVL_NEXT(tree, current);
904 875 if ((current->vmb_end + 1) == next->vmb_start &&
905 876 current->vmb_type == next->vmb_type) {
906 877 current->vmb_end = next->vmb_end;
907 878 avl_remove(tree, next);
908 879 vmu_free_bound(next);
909 880 if (next == *last) {
910 881 *last = current;
911 882 }
912 883 } else {
913 884 current = AVL_NEXT(tree, current);
914 885 }
915 886 }
916 887 }
917 888
918 889 /*
919 890 * Given an amp and a list of bounds, updates each bound's type with
920 891 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
921 892 *
922 893 * If a bound is partially incore, it will be split into two bounds.
923 894 * first and last may be modified, as bounds may be split into multiple
924 895 * bounds if they are partially incore/not-incore.
925 896 *
926 897 * Set incore to non-zero if bounds are already known to be incore.
927 898 *
928 899 */
929 900 static void
930 901 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
931 902 vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
932 903 {
933 904 vmu_bound_t *next;
934 905 vmu_bound_t *tmp;
935 906 pgcnt_t index;
936 907 short bound_type;
937 908 short page_type;
938 909 vnode_t *vn;
939 910 anoff_t off;
940 911 struct anon *ap;
941 912
942 913 next = *first;
943 914 /* Shared anon slots don't change once set. */
944 915 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
945 916 for (;;) {
946 917 if (incore == B_TRUE)
947 918 next->vmb_type = VMUSAGE_BOUND_INCORE;
948 919
949 920 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
950 921 if (next == *last)
951 922 break;
952 923 next = AVL_NEXT(tree, next);
953 924 continue;
954 925 }
955 926
956 927 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
957 928 bound_type = next->vmb_type;
958 929 index = next->vmb_start;
959 930 while (index <= next->vmb_end) {
960 931
961 932 /*
962 933 * These are used to determine how much to increment
963 934 * index when a large page is found.
964 935 */
965 936 page_t *page;
966 937 pgcnt_t pgcnt = 1;
967 938 uint_t pgshft;
968 939 pgcnt_t pgmsk;
969 940
970 941 ap = anon_get_ptr(amp->ahp, index);
971 942 if (ap != NULL)
972 943 swap_xlate(ap, &vn, &off);
973 944
974 945 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
975 946 (page = page_exists(vn, off)) != NULL) {
976 947 if (PP_ISFREE(page))
977 948 page_type = VMUSAGE_BOUND_NOT_INCORE;
978 949 else
979 950 page_type = VMUSAGE_BOUND_INCORE;
980 951 if (page->p_szc > 0) {
981 952 pgcnt = page_get_pagecnt(page->p_szc);
982 953 pgshft = page_get_shift(page->p_szc);
983 954 pgmsk = (0x1 << (pgshft - PAGESHIFT))
984 955 - 1;
985 956 }
986 957 } else {
987 958 page_type = VMUSAGE_BOUND_NOT_INCORE;
988 959 }
989 960
990 961 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
991 962 next->vmb_type = page_type;
992 963 bound_type = page_type;
993 964 } else if (next->vmb_type != page_type) {
994 965 /*
995 966 * If current bound type does not match page
996 967 * type, need to split off new bound.
997 968 */
998 969 tmp = vmu_alloc_bound();
999 970 tmp->vmb_type = page_type;
1000 971 tmp->vmb_start = index;
1001 972 tmp->vmb_end = next->vmb_end;
1002 973 avl_insert_here(tree, tmp, next, AVL_AFTER);
1003 974 next->vmb_end = index - 1;
1004 975 if (*last == next)
1005 976 *last = tmp;
1006 977 next = tmp;
1007 978 }
1008 979 if (pgcnt > 1) {
1009 980 /*
1010 981 * If inside large page, jump to next large
1011 982 * page
1012 983 */
1013 984 index = (index & ~pgmsk) + pgcnt;
1014 985 } else {
1015 986 index++;
1016 987 }
1017 988 }
1018 989 if (next == *last) {
1019 990 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1020 991 break;
1021 992 } else
1022 993 next = AVL_NEXT(tree, next);
1023 994 }
1024 995 ANON_LOCK_EXIT(&->a_rwlock);
1025 996 }
1026 997
1027 998 /*
1028 999 * Same as vmu_amp_update_incore_bounds(), except for tracking
1029 1000 * incore-/not-incore for vnodes.
1030 1001 */
1031 1002 static void
1032 1003 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
1033 1004 vmu_bound_t **first, vmu_bound_t **last)
1034 1005 {
1035 1006 vmu_bound_t *next;
1036 1007 vmu_bound_t *tmp;
1037 1008 pgcnt_t index;
1038 1009 short bound_type;
1039 1010 short page_type;
1040 1011
1041 1012 next = *first;
1042 1013 for (;;) {
1043 1014 if (vnode->v_pages == NULL)
1044 1015 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1045 1016
1046 1017 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1047 1018 if (next == *last)
1048 1019 break;
1049 1020 next = AVL_NEXT(tree, next);
1050 1021 continue;
1051 1022 }
1052 1023
1053 1024 ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN);
1054 1025 bound_type = next->vmb_type;
1055 1026 index = next->vmb_start;
1056 1027 while (index <= next->vmb_end) {
1057 1028
1058 1029 /*
1059 1030 * These are used to determine how much to increment
1060 1031 * index when a large page is found.
1061 1032 */
1062 1033 page_t *page;
1063 1034 pgcnt_t pgcnt = 1;
1064 1035 uint_t pgshft;
1065 1036 pgcnt_t pgmsk;
1066 1037
1067 1038 if (vnode->v_pages != NULL &&
1068 1039 (page = page_exists(vnode, ptob(index))) != NULL) {
1069 1040 if (PP_ISFREE(page))
1070 1041 page_type = VMUSAGE_BOUND_NOT_INCORE;
1071 1042 else
1072 1043 page_type = VMUSAGE_BOUND_INCORE;
1073 1044 if (page->p_szc > 0) {
1074 1045 pgcnt = page_get_pagecnt(page->p_szc);
1075 1046 pgshft = page_get_shift(page->p_szc);
1076 1047 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1077 1048 - 1;
1078 1049 }
1079 1050 } else {
1080 1051 page_type = VMUSAGE_BOUND_NOT_INCORE;
1081 1052 }
1082 1053
1083 1054 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1084 1055 next->vmb_type = page_type;
1085 1056 bound_type = page_type;
1086 1057 } else if (next->vmb_type != page_type) {
1087 1058 /*
1088 1059 * If current bound type does not match page
1089 1060 * type, need to split off new bound.
1090 1061 */
1091 1062 tmp = vmu_alloc_bound();
1092 1063 tmp->vmb_type = page_type;
1093 1064 tmp->vmb_start = index;
1094 1065 tmp->vmb_end = next->vmb_end;
1095 1066 avl_insert_here(tree, tmp, next, AVL_AFTER);
1096 1067 next->vmb_end = index - 1;
1097 1068 if (*last == next)
1098 1069 *last = tmp;
1099 1070 next = tmp;
1100 1071 }
1101 1072 if (pgcnt > 1) {
1102 1073 /*
1103 1074 * If inside large page, jump to next large
1104 1075 * page
1105 1076 */
1106 1077 index = (index & ~pgmsk) + pgcnt;
1107 1078 } else {
1108 1079 index++;
1109 1080 }
1110 1081 }
1111 1082 if (next == *last) {
1112 1083 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1113 1084 break;
1114 1085 } else
1115 1086 next = AVL_NEXT(tree, next);
1116 1087 }
1117 1088 }
1118 1089
1119 1090 /*
1120 1091 * Calculate the rss and swap consumed by a segment. vmu_entities is the
1121 1092 * list of entities to visit. For shared segments, the vnode or amp
1122 1093 * is looked up in each entity to see if it has been already counted. Private
1123 1094 * anon pages are checked per entity to ensure that COW pages are not
1124 1095 * double counted.
1125 1096 *
1126 1097 * For private mapped files, first the amp is checked for private pages.
1127 1098 * Bounds not backed by the amp are looked up in the vnode for each entity
1128 1099 * to avoid double counting of private COW vnode pages.
1129 1100 */
1130 1101 static void
1131 1102 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1132 1103 {
1133 1104 struct segvn_data *svd;
1134 1105 struct shm_data *shmd;
1135 1106 struct spt_data *sptd;
1136 1107 vmu_object_t *shared_object = NULL;
1137 1108 vmu_object_t *entity_object = NULL;
1138 1109 vmu_entity_t *entity;
1139 1110 vmusage_t *result;
1140 1111 vmu_bound_t *first = NULL;
1141 1112 vmu_bound_t *last = NULL;
1142 1113 vmu_bound_t *cur = NULL;
1143 1114 vmu_bound_t *e_first = NULL;
1144 1115 vmu_bound_t *e_last = NULL;
1145 1116 vmu_bound_t *tmp;
1146 1117 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1147 1118 struct anon_map *private_amp = NULL;
1148 1119 boolean_t incore = B_FALSE;
1149 1120 boolean_t shared = B_FALSE;
1150 1121 int file = 0;
1151 1122 pgcnt_t swresv = 0;
1152 1123 pgcnt_t panon = 0;
1153 1124
1154 1125 /* Can zero-length segments exist? Not sure, so paranoia. */
1155 1126 if (seg->s_size <= 0)
1156 1127 return;
1157 1128
1158 1129 /*
1159 1130 * Figure out if there is a shared object (such as a named vnode or
1160 1131 * a shared amp), then figure out if there is a private amp, which
1161 1132 * identifies private pages.
1162 1133 */
1163 1134 if (seg->s_ops == &segvn_ops) {
1164 1135 svd = (struct segvn_data *)seg->s_data;
1165 1136 if (svd->type == MAP_SHARED) {
1166 1137 shared = B_TRUE;
1167 1138 } else {
1168 1139 swresv = svd->swresv;
1169 1140
1170 1141 if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1171 1142 RW_READER) != 0) {
1172 1143 /*
1173 1144 * Text replication anon maps can be shared
1174 1145 * across all zones. Space used for text
1175 1146 * replication is typically capped as a small %
1176 1147 * of memory. To keep it simple for now we
1177 1148 * don't account for swap and memory space used
1178 1149 * for text replication.
1179 1150 */
1180 1151 if (svd->tr_state == SEGVN_TR_OFF &&
1181 1152 svd->amp != NULL) {
1182 1153 private_amp = svd->amp;
1183 1154 p_start = svd->anon_index;
1184 1155 p_end = svd->anon_index +
1185 1156 btop(seg->s_size) - 1;
1186 1157 }
1187 1158 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1188 1159 }
1189 1160 }
1190 1161 if (svd->vp != NULL) {
1191 1162 file = 1;
1192 1163 shared_object = vmu_find_insert_object(
1193 1164 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1194 1165 VMUSAGE_TYPE_VNODE);
1195 1166 s_start = btop(svd->offset);
1196 1167 s_end = btop(svd->offset + seg->s_size) - 1;
1197 1168 }
1198 1169 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1199 1170 ASSERT(shared_object == NULL);
1200 1171 shared_object = vmu_find_insert_object(
1201 1172 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1202 1173 VMUSAGE_TYPE_AMP);
1203 1174 s_start = svd->anon_index;
1204 1175 s_end = svd->anon_index + btop(seg->s_size) - 1;
1205 1176 /* schedctl mappings are always in core */
1206 1177 if (svd->amp->swresv == 0)
1207 1178 incore = B_TRUE;
1208 1179 }
1209 1180 } else if (seg->s_ops == &segspt_shmops) {
1210 1181 shared = B_TRUE;
1211 1182 shmd = (struct shm_data *)seg->s_data;
1212 1183 shared_object = vmu_find_insert_object(
1213 1184 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1214 1185 VMUSAGE_TYPE_AMP);
1215 1186 s_start = 0;
1216 1187 s_end = btop(seg->s_size) - 1;
1217 1188 sptd = shmd->shm_sptseg->s_data;
1218 1189
1219 1190 /* ism segments are always incore and do not reserve swap */
1220 1191 if (sptd->spt_flags & SHM_SHARE_MMU)
1221 1192 incore = B_TRUE;
1222 1193
1223 1194 } else {
1224 1195 return;
1225 1196 }
1226 1197
1227 1198 /*
1228 1199 * If there is a private amp, count anon pages that exist. If an
1229 1200 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1230 1201 * hash so that it is not double counted.
1231 1202 *
1232 1203 * If there is also a shared object, then figure out the bounds
1233 1204 * which are not mapped by the private amp.
1234 1205 */
1235 1206 if (private_amp != NULL) {
1236 1207
1237 1208 /* Enter as writer to prevent COW anons from being freed */
1238 1209 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1239 1210
1240 1211 p_index = p_start;
1241 1212 s_index = s_start;
1242 1213
1243 1214 while (p_index <= p_end) {
1244 1215
1245 1216 pgcnt_t p_index_next;
1246 1217 pgcnt_t p_bound_size;
1247 1218 int cnt;
1248 1219 anoff_t off;
1249 1220 struct vnode *vn;
1250 1221 struct anon *ap;
1251 1222 page_t *page; /* For handling of large */
1252 1223 pgcnt_t pgcnt = 1; /* pages */
1253 1224 pgcnt_t pgstart;
1254 1225 pgcnt_t pgend;
1255 1226 uint_t pgshft;
1256 1227 pgcnt_t pgmsk;
1257 1228
1258 1229 p_index_next = p_index;
1259 1230 ap = anon_get_next_ptr(private_amp->ahp,
1260 1231 &p_index_next);
1261 1232
1262 1233 /*
1263 1234 * If next anon is past end of mapping, simulate
1264 1235 * end of anon so loop terminates.
1265 1236 */
1266 1237 if (p_index_next > p_end) {
1267 1238 p_index_next = p_end + 1;
1268 1239 ap = NULL;
1269 1240 }
1270 1241 /*
1271 1242 * For COW segments, keep track of bounds not
1272 1243 * backed by private amp so they can be looked
1273 1244 * up in the backing vnode
1274 1245 */
1275 1246 if (p_index_next != p_index) {
1276 1247
1277 1248 /*
1278 1249 * Compute index difference between anon and
1279 1250 * previous anon.
1280 1251 */
1281 1252 p_bound_size = p_index_next - p_index - 1;
1282 1253
1283 1254 if (shared_object != NULL) {
1284 1255 cur = vmu_alloc_bound();
1285 1256 cur->vmb_start = s_index;
1286 1257 cur->vmb_end = s_index + p_bound_size;
1287 1258 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1288 1259 if (first == NULL) {
1289 1260 first = cur;
1290 1261 last = cur;
1291 1262 } else {
1292 1263 last->vmb_next = cur;
1293 1264 last = cur;
1294 1265 }
1295 1266 }
1296 1267 p_index = p_index + p_bound_size + 1;
1297 1268 s_index = s_index + p_bound_size + 1;
1298 1269 }
1299 1270
1300 1271 /* Detect end of anons in amp */
1301 1272 if (ap == NULL)
1302 1273 break;
1303 1274
1304 1275 cnt = ap->an_refcnt;
1305 1276 swap_xlate(ap, &vn, &off);
1306 1277
1307 1278 if (vn == NULL || vn->v_pages == NULL ||
1308 1279 (page = page_exists(vn, off)) == NULL) {
1309 1280 p_index++;
1310 1281 s_index++;
1311 1282 continue;
1312 1283 }
1313 1284
1314 1285 /*
1315 1286 * If large page is found, compute portion of large
1316 1287 * page in mapping, and increment indices to the next
1317 1288 * large page.
1318 1289 */
1319 1290 if (page->p_szc > 0) {
1320 1291
1321 1292 pgcnt = page_get_pagecnt(page->p_szc);
1322 1293 pgshft = page_get_shift(page->p_szc);
1323 1294 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1324 1295
1325 1296 /* First page in large page */
1326 1297 pgstart = p_index & ~pgmsk;
1327 1298 /* Last page in large page */
1328 1299 pgend = pgstart + pgcnt - 1;
1329 1300 /*
1330 1301 * Artificially end page if page extends past
1331 1302 * end of mapping.
1332 1303 */
1333 1304 if (pgend > p_end)
1334 1305 pgend = p_end;
1335 1306
1336 1307 /*
1337 1308 * Compute number of pages from large page
1338 1309 * which are mapped.
1339 1310 */
1340 1311 pgcnt = pgend - p_index + 1;
1341 1312
1342 1313 /*
1343 1314 * Point indices at page after large page,
1344 1315 * or at page after end of mapping.
1345 1316 */
1346 1317 p_index += pgcnt;
1347 1318 s_index += pgcnt;
1348 1319 } else {
1349 1320 p_index++;
1350 1321 s_index++;
1351 1322 }
1352 1323
1353 1324 /*
1354 1325 * Pages on the free list aren't counted for the rss.
1355 1326 */
1356 1327 if (PP_ISFREE(page))
1357 1328 continue;
1358 1329
1359 1330 /*
1360 1331 * Assume anon structs with a refcnt
1361 1332 * of 1 are not COW shared, so there
1362 1333 * is no reason to track them per entity.
1363 1334 */
1364 1335 if (cnt == 1) {
1365 1336 panon += pgcnt;
1366 1337 continue;
1367 1338 }
1368 1339 for (entity = vmu_entities; entity != NULL;
1369 1340 entity = entity->vme_next_calc) {
1370 1341
1371 1342 result = &entity->vme_result;
1372 1343 /*
1373 1344 * Track COW anons per entity so
1374 1345 * they are not double counted.
1375 1346 */
1376 - if (vmu_find_insert_anon(entity, ap) == 0)
1347 + if (vmu_find_insert_anon(entity->vme_anon_hash,
1348 + (caddr_t)ap) == 0)
1377 1349 continue;
1378 1350
1379 1351 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1380 1352 result->vmu_rss_private +=
1381 1353 (pgcnt << PAGESHIFT);
1382 1354 }
1383 1355 }
1384 1356 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1385 1357 }
1386 1358
1387 1359 /* Add up resident anon and swap reserved for private mappings */
1388 1360 if (swresv > 0 || panon > 0) {
1389 1361 for (entity = vmu_entities; entity != NULL;
1390 1362 entity = entity->vme_next_calc) {
1391 1363 result = &entity->vme_result;
1392 1364 result->vmu_swap_all += swresv;
1393 1365 result->vmu_swap_private += swresv;
1394 1366 result->vmu_rss_all += (panon << PAGESHIFT);
1395 1367 result->vmu_rss_private += (panon << PAGESHIFT);
1396 1368 }
1397 1369 }
1398 1370
1399 1371 /* Compute resident pages backing shared amp or named vnode */
1400 1372 if (shared_object != NULL) {
1401 1373 avl_tree_t *tree = &(shared_object->vmo_bounds);
1402 1374
1403 1375 if (first == NULL) {
1404 1376 /*
1405 1377 * No private amp, or private amp has no anon
1406 1378 * structs. This means entire segment is backed by
1407 1379 * the shared object.
1408 1380 */
1409 1381 first = vmu_alloc_bound();
1410 1382 first->vmb_start = s_start;
1411 1383 first->vmb_end = s_end;
1412 1384 first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1413 1385 }
1414 1386 /*
1415 1387 * Iterate bounds not backed by private amp, and compute
1416 1388 * resident pages.
1417 1389 */
1418 1390 cur = first;
1419 1391 while (cur != NULL) {
1420 1392
1421 1393 if (vmu_insert_lookup_object_bounds(shared_object,
1422 1394 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1423 1395 &first, &last) > 0) {
1424 1396 /* new bounds, find incore/not-incore */
1425 1397 if (shared_object->vmo_type ==
1426 1398 VMUSAGE_TYPE_VNODE) {
1427 1399 vmu_vnode_update_incore_bounds(
1428 1400 tree,
1429 1401 (vnode_t *)
1430 1402 shared_object->vmo_key, &first,
1431 1403 &last);
1432 1404 } else {
1433 1405 vmu_amp_update_incore_bounds(
1434 1406 tree,
1435 1407 (struct anon_map *)
1436 1408 shared_object->vmo_key, &first,
1437 1409 &last, incore);
1438 1410 }
1439 1411 vmu_merge_bounds(tree, &first, &last);
1440 1412 }
1441 1413 for (entity = vmu_entities; entity != NULL;
1442 1414 entity = entity->vme_next_calc) {
1443 1415 avl_tree_t *e_tree;
1444 1416
1445 1417 result = &entity->vme_result;
1446 1418
1447 1419 entity_object = vmu_find_insert_object(
1448 1420 shared_object->vmo_type ==
1449 1421 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1450 1422 entity->vme_amp_hash,
1451 1423 shared_object->vmo_key,
1452 1424 shared_object->vmo_type);
1453 1425
1454 1426 virt = vmu_insert_lookup_object_bounds(
1455 1427 entity_object, cur->vmb_start, cur->vmb_end,
1456 1428 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1457 1429
1458 1430 if (virt == 0)
1459 1431 continue;
1460 1432 /*
1461 1433 * Range visited for this entity
1462 1434 */
1463 1435 e_tree = &(entity_object->vmo_bounds);
1464 1436 rss = vmu_update_bounds(e_tree, &e_first,
1465 1437 &e_last, tree, first, last);
1466 1438 result->vmu_rss_all += (rss << PAGESHIFT);
1467 1439 if (shared == B_TRUE && file == B_FALSE) {
1468 1440 /* shared anon mapping */
1469 1441 result->vmu_swap_all +=
1470 1442 (virt << PAGESHIFT);
1471 1443 result->vmu_swap_shared +=
1472 1444 (virt << PAGESHIFT);
1473 1445 result->vmu_rss_shared +=
1474 1446 (rss << PAGESHIFT);
1475 1447 } else if (shared == B_TRUE && file == B_TRUE) {
1476 1448 /* shared file mapping */
1477 1449 result->vmu_rss_shared +=
1478 1450 (rss << PAGESHIFT);
1479 1451 } else if (shared == B_FALSE &&
1480 1452 file == B_TRUE) {
1481 1453 /* private file mapping */
1482 1454 result->vmu_rss_private +=
1483 1455 (rss << PAGESHIFT);
1484 1456 }
1485 1457 vmu_merge_bounds(e_tree, &e_first, &e_last);
1486 1458 }
1487 1459 tmp = cur;
1488 1460 cur = cur->vmb_next;
1489 1461 vmu_free_bound(tmp);
1490 1462 }
1491 1463 }
1492 1464 }
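
To make the shared-object accounting above concrete, consider a made-up example
(illustrative only, not part of the patch): two processes in the same project both map
the same resident 8 KB of a named vnode. For the first process the project entity has
no bounds for that vnode yet, so vmu_insert_lookup_object_bounds() reports new pages,
the incore bounds are resolved by vmu_update_bounds(), and 8 KB is added to the
project's vmu_rss_all and vmu_rss_shared. For the second process the same range is
already covered by the entity's bounds, so virt is 0 and the entity is skipped; the
project is charged only once. A process in a different project uses its own entity
object and is charged separately, exactly as the header comment describes.
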
1493 1465
1494 1466 /*
1495 1467 * Based on the current calculation flags, find the entities that are
1496 1468 * relevant to the process. Then calculate each segment
1497 1469 * in the process's address space for each relevant entity.
1498 1470 */
1499 1471 static void
1500 1472 vmu_calculate_proc(proc_t *p)
1501 1473 {
1502 1474 vmu_entity_t *entities = NULL;
1503 1475 vmu_zone_t *zone;
1504 1476 vmu_entity_t *tmp;
1505 1477 struct as *as;
1506 1478 struct seg *seg;
1507 1479 int ret;
1508 1480
1509 1481 /* Figure out which entities are being computed */
1510 1482 if ((vmu_data.vmu_system) != NULL) {
1511 1483 tmp = vmu_data.vmu_system;
1512 1484 tmp->vme_next_calc = entities;
1513 1485 entities = tmp;
1514 1486 }
1515 1487 if (vmu_data.vmu_calc_flags &
1516 1488 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE |
1517 1489 VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1518 1490 VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1519 1491 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1520 1492 VMUSAGE_ALL_EUSERS)) {
1521 1493 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1522 1494 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1523 1495 (mod_hash_val_t *)&zone);
1524 1496 if (ret != 0) {
1525 1497 zone = vmu_alloc_zone(p->p_zone->zone_id);
1526 1498 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1527 1499 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1528 1500 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1529 1501 ASSERT(ret == 0);
1530 1502 }
1531 1503 if (zone->vmz_zone != NULL) {
1532 1504 tmp = zone->vmz_zone;
1533 1505 tmp->vme_next_calc = entities;
1534 1506 entities = tmp;
1535 1507 }
1536 1508 if (vmu_data.vmu_calc_flags &
1537 1509 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1538 1510 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1539 1511 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1540 1512 zone->vmz_id);
1541 1513 tmp->vme_next_calc = entities;
1542 1514 entities = tmp;
1543 1515 }
1544 1516 if (vmu_data.vmu_calc_flags &
1545 1517 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1546 1518 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1547 1519 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1548 1520 tmp->vme_next_calc = entities;
1549 1521 entities = tmp;
1550 1522 }
1551 1523 if (vmu_data.vmu_calc_flags &
1552 1524 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1553 1525 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1554 1526 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1555 1527 tmp->vme_next_calc = entities;
1556 1528 entities = tmp;
1557 1529 }
1558 1530 if (vmu_data.vmu_calc_flags &
1559 1531 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1560 1532 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1561 1533 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1562 1534 tmp->vme_next_calc = entities;
1563 1535 entities = tmp;
1564 1536 }
1565 1537 }
1566 1538 /* Entities which collapse projects and users for all zones */
1567 1539 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1568 1540 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1569 1541 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1570 1542 tmp->vme_next_calc = entities;
1571 1543 entities = tmp;
1572 1544 }
1573 1545 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1574 1546 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1575 1547 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1576 1548 tmp->vme_next_calc = entities;
1577 1549 entities = tmp;
1578 1550 }
1579 1551 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1580 1552 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1581 1553 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1582 1554 tmp->vme_next_calc = entities;
1583 1555 entities = tmp;
1584 1556 }
1585 1557
1586 1558 ASSERT(entities != NULL);
1587 1559 /* process all segs in process's address space */
1588 1560 as = p->p_as;
1589 1561 AS_LOCK_ENTER(as, RW_READER);
1590 1562 for (seg = AS_SEGFIRST(as); seg != NULL;
1591 1563 seg = AS_SEGNEXT(as, seg)) {
1592 1564 vmu_calculate_seg(entities, seg);
1593 1565 }
1594 1566 AS_LOCK_EXIT(as);
1595 1567 }
1596 1568
1597 1569 /*
1598 1570 * Free data created by previous call to vmu_calculate().
1599 1571 */
1600 1572 static void
1601 1573 vmu_clear_calc()
1602 1574 {
1603 1575 if (vmu_data.vmu_system != NULL)
1604 1576 vmu_free_entity(vmu_data.vmu_system);
1605 1577 vmu_data.vmu_system = NULL;
1606 1578 if (vmu_data.vmu_zones_hash != NULL)
1607 1579 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1608 1580 if (vmu_data.vmu_projects_col_hash != NULL)
1609 1581 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1610 1582 if (vmu_data.vmu_rusers_col_hash != NULL)
1611 1583 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1612 1584 if (vmu_data.vmu_eusers_col_hash != NULL)
1613 1585 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1614 1586
1615 1587 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1616 1588 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1617 1589 }
1618 1590
1619 1591 /*
1620 1592 * Free unused data structures. These can result if the system workload
1621 1593 * decreases between calculations.
1622 1594 */
1623 1595 static void
1624 1596 vmu_free_extra()
1625 1597 {
1626 1598 vmu_bound_t *tb;
1627 1599 vmu_object_t *to;
1628 1600 vmu_entity_t *te;
1629 1601 vmu_zone_t *tz;
1630 1602
1631 1603 while (vmu_data.vmu_free_bounds != NULL) {
1632 1604 tb = vmu_data.vmu_free_bounds;
1633 1605 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1634 1606 kmem_cache_free(vmu_bound_cache, tb);
1635 1607 }
1636 1608 while (vmu_data.vmu_free_objects != NULL) {
1637 1609 to = vmu_data.vmu_free_objects;
1638 1610 vmu_data.vmu_free_objects =
1639 1611 vmu_data.vmu_free_objects->vmo_next;
1640 1612 kmem_cache_free(vmu_object_cache, to);
1641 1613 }
1642 1614 while (vmu_data.vmu_free_entities != NULL) {
1643 1615 te = vmu_data.vmu_free_entities;
1644 1616 vmu_data.vmu_free_entities =
1645 1617 vmu_data.vmu_free_entities->vme_next;
1646 1618 if (te->vme_vnode_hash != NULL)
1647 1619 mod_hash_destroy_hash(te->vme_vnode_hash);
1648 1620 if (te->vme_amp_hash != NULL)
1649 1621 mod_hash_destroy_hash(te->vme_amp_hash);
1650 - VERIFY(avl_first(&te->vme_anon) == NULL);
1622 + if (te->vme_anon_hash != NULL)
1623 + mod_hash_destroy_hash(te->vme_anon_hash);
1651 1624 kmem_free(te, sizeof (vmu_entity_t));
1652 1625 }
1653 1626 while (vmu_data.vmu_free_zones != NULL) {
1654 1627 tz = vmu_data.vmu_free_zones;
1655 1628 vmu_data.vmu_free_zones =
1656 1629 vmu_data.vmu_free_zones->vmz_next;
1657 1630 if (tz->vmz_projects_hash != NULL)
1658 1631 mod_hash_destroy_hash(tz->vmz_projects_hash);
1659 1632 if (tz->vmz_tasks_hash != NULL)
1660 1633 mod_hash_destroy_hash(tz->vmz_tasks_hash);
1661 1634 if (tz->vmz_rusers_hash != NULL)
1662 1635 mod_hash_destroy_hash(tz->vmz_rusers_hash);
1663 1636 if (tz->vmz_eusers_hash != NULL)
1664 1637 mod_hash_destroy_hash(tz->vmz_eusers_hash);
1665 1638 kmem_free(tz, sizeof (vmu_zone_t));
1666 1639 }
1667 1640 }
1668 1641
1669 1642 extern kcondvar_t *pr_pid_cv;
1670 1643
1671 1644 /*
1672 1645 * Determine which entity types are relevant and allocate the hashes to
1673 1646 * track them. Then walk the process table and count rss and swap
1674 1647  * for each process's address space.  Address-space objects such as
1675 1648  * vnodes, amps and anons are tracked per entity, so that they are
1676 1649  * not double-counted in the results.
1677 1650 *
1678 1651 */
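/*
 * Editor's illustrative sketch (not part of this change) of the
 * per-entity deduplication described above.  The helpers seen_lookup(),
 * seen_insert() and obj_incore_bytes() are hypothetical stand-ins, not
 * the vmu_* machinery in this file; the point is only that each entity
 * charges a given object (vnode or amp) range at most once, so a page
 * shared by two processes in the same project is counted once for that
 * project:
 *
 *	for (e = entities; e != NULL; e = e->vme_next_calc) {
 *		if (seen_lookup(e, object, off) != NULL)
 *			continue;
 *		seen_insert(e, object, off);
 *		e->vme_result.vmu_rss_all += obj_incore_bytes(object, off);
 *	}
 *
 * An entity that has already seen this range of the object skips it;
 * otherwise the range is recorded and its resident bytes are added to
 * that entity's total.
 */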
1679 1652 static void
1680 1653 vmu_calculate()
1681 1654 {
1682 1655 int i = 0;
1683 1656 int ret;
1684 1657 proc_t *p;
1685 1658
1686 1659 vmu_clear_calc();
1687 1660
1688 1661 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1689 1662 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1690 1663 ALL_ZONES);
1691 1664
1692 1665 /*
1693 1666 * Walk process table and calculate rss of each proc.
1694 1667 *
1695 1668 * Pidlock and p_lock cannot be held while doing the rss calculation.
1696 1669 * This is because:
1697 1670 * 1. The calculation allocates using KM_SLEEP.
1698 1671 * 2. The calculation grabs a_lock, which cannot be grabbed
1699 1672 * after p_lock.
1700 1673 *
1701 1674 * Since pidlock must be dropped, we cannot simply just walk the
1702 1675 * practive list. Instead, we walk the process table, and sprlock
1703 1676 * each process to ensure that it does not exit during the
1704 1677 * calculation.
1705 1678 */
1706 1679
1707 1680 mutex_enter(&pidlock);
1708 1681 for (i = 0; i < v.v_proc; i++) {
1709 1682 again:
1710 1683 p = pid_entry(i);
1711 1684 if (p == NULL)
1712 1685 continue;
1713 1686
1714 1687 mutex_enter(&p->p_lock);
1715 1688 mutex_exit(&pidlock);
1716 1689
1717 1690 if (panicstr) {
1718 1691 mutex_exit(&p->p_lock);
1719 1692 return;
1720 1693 }
1721 1694
1722 1695 /* Try to set P_PR_LOCK */
1723 1696 ret = sprtrylock_proc(p);
1724 1697 if (ret == -1) {
1725 1698 /* Process in invalid state */
1726 1699 mutex_exit(&p->p_lock);
1727 1700 mutex_enter(&pidlock);
1728 1701 continue;
1729 1702 } else if (ret == 1) {
1730 1703 /*
1731 1704 * P_PR_LOCK is already set. Wait and try again.
1732 1705 * This also drops p_lock.
1733 1706 */
1734 1707 sprwaitlock_proc(p);
1735 1708 mutex_enter(&pidlock);
1736 1709 goto again;
1737 1710 }
1738 1711 mutex_exit(&p->p_lock);
1739 1712
1740 1713 vmu_calculate_proc(p);
1741 1714
1742 1715 mutex_enter(&p->p_lock);
1743 1716 sprunlock(p);
1744 1717 mutex_enter(&pidlock);
1745 1718 }
1746 1719 mutex_exit(&pidlock);
1747 1720
1748 1721 vmu_free_extra();
1749 1722 }
1750 1723
1751 1724 /*
1752 1725 * allocate a new cache for N results satisfying flags
1753 1726 */
1754 1727 vmu_cache_t *
1755 1728 vmu_cache_alloc(size_t nres, uint_t flags)
1756 1729 {
1757 1730 vmu_cache_t *cache;
1758 1731
1759 1732 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1760 1733 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1761 1734 cache->vmc_nresults = nres;
1762 1735 cache->vmc_flags = flags;
1763 1736 cache->vmc_refcnt = 1;
1764 1737 return (cache);
1765 1738 }
1766 1739
1767 1740 /*
1768 1741 * Make sure cached results are not freed
1769 1742 */
1770 1743 static void
1771 1744 vmu_cache_hold(vmu_cache_t *cache)
1772 1745 {
1773 1746 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1774 1747 cache->vmc_refcnt++;
1775 1748 }
1776 1749
1777 1750 /*
1778 1751 * free cache data
1779 1752 */
1780 1753 static void
1781 1754 vmu_cache_rele(vmu_cache_t *cache)
1782 1755 {
1783 1756 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1784 1757 ASSERT(cache->vmc_refcnt > 0);
1785 1758 cache->vmc_refcnt--;
1786 1759 if (cache->vmc_refcnt == 0) {
1787 1760 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1788 1761 cache->vmc_nresults);
1789 1762 kmem_free(cache, sizeof (vmu_cache_t));
1790 1763 }
1791 1764 }
1792 1765
1793 1766 /*
1794 1767 * When new data is calculated, update the phys_mem rctl usage value in the
1795 1768 * zones.
1796 1769 */
1797 1770 static void
1798 1771 vmu_update_zone_rctls(vmu_cache_t *cache)
1799 1772 {
1800 1773 vmusage_t *rp;
1801 1774 size_t i = 0;
1802 1775 zone_t *zp;
1803 1776
1804 1777 for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) {
1805 1778 if (rp->vmu_type == VMUSAGE_ZONE &&
1806 1779 rp->vmu_zoneid != ALL_ZONES) {
1807 1780 if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) {
1808 - zp->zone_phys_mem = rp->vmu_rss_all;
1809 - zone_rele(zp);
1781 + zp->zone_phys_mem = rp->vmu_rss_all;
1782 + zone_rele(zp);
1810 1783 }
1811 1784 }
1812 1785 }
1813 1786 }
1814 1787
1815 1788 /*
1816 1789  * Copy out the cached results to a caller.  Inspect the caller's flags
1817 1790 * and zone to determine which cached results should be copied.
1818 1791 */
1819 1792 static int
1820 1793 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1821 1794 uint_t flags, id_t req_zone_id, int cpflg)
1822 1795 {
1823 1796 vmusage_t *result, *out_result;
1824 1797 vmusage_t dummy;
1825 1798 size_t i, count = 0;
1826 1799 size_t bufsize;
1827 1800 int ret = 0;
1828 1801 uint_t types = 0;
1829 1802
1830 1803 if (nres != NULL) {
1831 1804 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1832 1805 return (set_errno(EFAULT));
1833 1806 } else {
1834 1807 bufsize = 0;
1835 1808 }
1836 1809
1837 1810 /* figure out what results the caller is interested in. */
1838 1811 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1839 1812 types |= VMUSAGE_SYSTEM;
1840 1813 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE))
1841 1814 types |= VMUSAGE_ZONE;
1842 1815 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1843 1816 VMUSAGE_COL_PROJECTS))
1844 1817 types |= VMUSAGE_PROJECTS;
1845 1818 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1846 1819 types |= VMUSAGE_TASKS;
1847 1820 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1848 1821 types |= VMUSAGE_RUSERS;
1849 1822 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1850 1823 types |= VMUSAGE_EUSERS;
1851 1824
1852 1825 /* count results for current zone */
1853 1826 out_result = buf;
1854 1827 for (result = cache->vmc_results, i = 0;
1855 1828 i < cache->vmc_nresults; result++, i++) {
1856 1829
1857 1830 /* Do not return "other-zone" results to non-global zones */
1858 1831 if (curproc->p_zone != global_zone &&
1859 1832 curproc->p_zone->zone_id != result->vmu_zoneid)
1860 1833 continue;
1861 1834
1862 1835 /*
1863 1836 * If non-global zone requests VMUSAGE_SYSTEM, fake
1864 1837 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1865 1838 */
1866 1839 if (curproc->p_zone != global_zone &&
1867 1840 (flags & VMUSAGE_SYSTEM) != 0 &&
1868 1841 result->vmu_type == VMUSAGE_ZONE) {
1869 1842 count++;
1870 1843 if (out_result != NULL) {
1871 1844 if (bufsize < count) {
1872 1845 ret = set_errno(EOVERFLOW);
1873 1846 } else {
1874 1847 dummy = *result;
1875 1848 dummy.vmu_zoneid = ALL_ZONES;
1876 1849 dummy.vmu_id = 0;
1877 1850 dummy.vmu_type = VMUSAGE_SYSTEM;
1878 1851 if (ddi_copyout(&dummy, out_result,
1879 1852 sizeof (vmusage_t), cpflg))
1880 1853 return (set_errno(EFAULT));
1881 1854 out_result++;
1882 1855 }
1883 1856 }
1884 1857 }
1885 1858
1886 1859 /* Skip results that do not match requested type */
1887 1860 if ((result->vmu_type & types) == 0)
1888 1861 continue;
1889 1862
1890 1863 /* Skip collated results if not requested */
1891 1864 if (result->vmu_zoneid == ALL_ZONES) {
1892 1865 if (result->vmu_type == VMUSAGE_PROJECTS &&
1893 1866 (flags & VMUSAGE_COL_PROJECTS) == 0)
1894 1867 continue;
1895 1868 if (result->vmu_type == VMUSAGE_EUSERS &&
1896 1869 (flags & VMUSAGE_COL_EUSERS) == 0)
1897 1870 continue;
1898 1871 if (result->vmu_type == VMUSAGE_RUSERS &&
1899 1872 (flags & VMUSAGE_COL_RUSERS) == 0)
1900 1873 continue;
1901 1874 }
1902 1875
1903 1876 if (result->vmu_type == VMUSAGE_ZONE &&
1904 1877 flags & VMUSAGE_A_ZONE) {
1905 1878 /* Skip non-requested zone results */
1906 1879 if (result->vmu_zoneid != req_zone_id)
1907 1880 continue;
1908 1881 } else {
1909 1882 /* Skip "other zone" results if not requested */
1910 1883 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1911 1884 if (result->vmu_type == VMUSAGE_ZONE &&
1912 1885 (flags & VMUSAGE_ALL_ZONES) == 0)
1913 1886 continue;
1914 1887 if (result->vmu_type == VMUSAGE_PROJECTS &&
1915 1888 (flags & (VMUSAGE_ALL_PROJECTS |
1916 1889 VMUSAGE_COL_PROJECTS)) == 0)
1917 1890 continue;
1918 1891 if (result->vmu_type == VMUSAGE_TASKS &&
1919 1892 (flags & VMUSAGE_ALL_TASKS) == 0)
1920 1893 continue;
1921 1894 if (result->vmu_type == VMUSAGE_RUSERS &&
1922 1895 (flags & (VMUSAGE_ALL_RUSERS |
1923 1896 VMUSAGE_COL_RUSERS)) == 0)
1924 1897 continue;
1925 1898 if (result->vmu_type == VMUSAGE_EUSERS &&
1926 1899 (flags & (VMUSAGE_ALL_EUSERS |
1927 1900 VMUSAGE_COL_EUSERS)) == 0)
1928 1901 continue;
1929 1902 }
1930 1903 }
1931 1904 count++;
1932 1905 if (out_result != NULL) {
1933 1906 if (bufsize < count) {
1934 1907 ret = set_errno(EOVERFLOW);
1935 1908 } else {
1936 1909 if (ddi_copyout(result, out_result,
1937 1910 sizeof (vmusage_t), cpflg))
1938 1911 return (set_errno(EFAULT));
1939 1912 out_result++;
1940 1913 }
1941 1914 }
1942 1915 }
1943 1916 if (nres != NULL)
1944 1917 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1945 1918 return (set_errno(EFAULT));
1946 1919
1947 1920 return (ret);
1948 1921 }
1949 1922
1950 1923 /*
1951 1924 * vm_getusage()
1952 1925 *
1953 1926 * Counts rss and swap by zone, project, task, and/or user. The flags argument
1954 1927  * determines the type of result structures returned.  Flags requesting
1955 1928 * results from more than one zone are "flattened" to the local zone if the
1956 1929 * caller is not the global zone.
1957 1930 *
1958 1931 * args:
1959 1932 * flags: bitmap consisting of one or more of VMUSAGE_*.
1960 1933 * age: maximum allowable age (time since counting was done) in
1961 1934 * seconds of the results. Results from previous callers are
1962 1935  * cached in the kernel.
1963 1936  * buf: pointer to buffer array of vmusage_t.  If NULL, then only nres
1964 1937  * is set on success.
1965 1938 * nres: Set to number of vmusage_t structures pointed to by buf
1966 1939 * before calling vm_getusage().
1967 1940  * On return 0 (success) or EOVERFLOW, it is set to the number of
1968 1941  * result structures returned or attempted to return.
1969 1942 *
1970 1943 * returns 0 on success, -1 on failure:
1971 1944 * EINTR (interrupted)
1972 1945  * EOVERFLOW (nres too small for results, nres set to needed value for success)
1973 1946 * EINVAL (flags invalid)
1974 1947 * EFAULT (bad address for buf or nres)
1975 1948 */
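/*
 * Editor's illustrative sketch (not part of this change): a minimal
 * userland consumer of this interface through the getvmusage(2)
 * wrapper declared in <sys/vm_usage.h>.  The two-call pattern (a NULL
 * buf sizes the buffer, a second call fetches the results), the
 * VMUSAGE_ZONE flag and the 10-second age are example choices only.
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <err.h>
 *
 *	size_t nres = 0, i;
 *	vmusage_t *buf;
 *
 *	if (getvmusage(VMUSAGE_ZONE, 10, NULL, &nres) != 0)
 *		err(1, "getvmusage (sizing)");
 *	if ((buf = calloc(nres, sizeof (vmusage_t))) == NULL)
 *		err(1, "calloc");
 *	if (getvmusage(VMUSAGE_ZONE, 10, buf, &nres) != 0)
 *		err(1, "getvmusage");
 *	for (i = 0; i < nres; i++)
 *		(void) printf("zone %d: rss %llu bytes\n",
 *		    (int)buf[i].vmu_zoneid,
 *		    (unsigned long long)buf[i].vmu_rss_all);
 *
 * The second call can fail with EOVERFLOW if usage grew between the
 * two calls; a robust consumer would retry with the updated nres.
 */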
1976 1949 int
1977 1950 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1978 1951 {
1979 1952 vmu_entity_t *entity;
1980 1953 vmusage_t *result;
1981 1954 int ret = 0;
1982 1955 int cacherecent = 0;
1983 1956 hrtime_t now;
1984 1957 uint_t flags_orig;
1985 1958 id_t req_zone_id;
1986 1959
1987 1960 /*
1988 1961  * Non-global zones cannot request system-wide or collated
1989 1962  * results, the system result, or the usage of another zone, so munge
1990 1963 * the flags accordingly.
1991 1964 */
1992 1965 flags_orig = flags;
1993 1966 if (curproc->p_zone != global_zone) {
1994 1967 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1995 1968 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1996 1969 flags |= VMUSAGE_PROJECTS;
1997 1970 }
1998 1971 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1999 1972 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
2000 1973 flags |= VMUSAGE_RUSERS;
2001 1974 }
2002 1975 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
2003 1976 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
2004 1977 flags |= VMUSAGE_EUSERS;
2005 1978 }
2006 1979 if (flags & VMUSAGE_SYSTEM) {
2007 1980 flags &= ~VMUSAGE_SYSTEM;
2008 1981 flags |= VMUSAGE_ZONE;
2009 1982 }
2010 1983 if (flags & VMUSAGE_A_ZONE) {
2011 1984 flags &= ~VMUSAGE_A_ZONE;
2012 1985 flags |= VMUSAGE_ZONE;
2013 1986 }
2014 1987 }
2015 1988
2016 1989 /* Check for unknown flags */
2017 1990 if ((flags & (~VMUSAGE_MASK)) != 0)
2018 1991 return (set_errno(EINVAL));
2019 1992
2020 1993 /* Check for no flags */
2021 1994 if ((flags & VMUSAGE_MASK) == 0)
2022 1995 return (set_errno(EINVAL));
2023 1996
2024 1997 /* If requesting results for a specific zone, get the zone ID */
2025 1998 if (flags & VMUSAGE_A_ZONE) {
2026 1999 size_t bufsize;
2027 2000 vmusage_t zreq;
2028 2001
2029 2002 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
2030 2003 return (set_errno(EFAULT));
2031 2004 /* Requested zone ID is passed in buf, so 0 len not allowed */
2032 2005 if (bufsize == 0)
2033 2006 return (set_errno(EINVAL));
2034 2007 if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg))
2035 2008 return (set_errno(EFAULT));
2036 2009 req_zone_id = zreq.vmu_id;
2037 2010 }
2038 2011
2039 2012 mutex_enter(&vmu_data.vmu_lock);
2040 2013 now = gethrtime();
2041 2014
2042 2015 start:
2043 2016 if (vmu_data.vmu_cache != NULL) {
2044 2017
2045 2018 vmu_cache_t *cache;
2046 2019
2047 2020 if ((vmu_data.vmu_cache->vmc_timestamp +
2048 2021 ((hrtime_t)age * NANOSEC)) > now)
2049 2022 cacherecent = 1;
2050 2023
2051 2024 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
2052 2025 cacherecent == 1) {
2053 2026 cache = vmu_data.vmu_cache;
2054 2027 vmu_cache_hold(cache);
2055 2028 mutex_exit(&vmu_data.vmu_lock);
2056 2029
2057 2030 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2058 2031 req_zone_id, cpflg);
2059 2032 mutex_enter(&vmu_data.vmu_lock);
2060 2033 vmu_cache_rele(cache);
2061 2034 if (vmu_data.vmu_pending_waiters > 0)
2062 2035 cv_broadcast(&vmu_data.vmu_cv);
2063 2036 mutex_exit(&vmu_data.vmu_lock);
2064 2037 return (ret);
2065 2038 }
2066 2039 /*
2067 2040 * If the cache is recent, it is likely that there are other
2068 2041 * consumers of vm_getusage running, so add their flags to the
2069 2042 * desired flags for the calculation.
2070 2043 */
2071 2044 if (cacherecent == 1)
2072 2045 flags = vmu_data.vmu_cache->vmc_flags | flags;
2073 2046 }
2074 2047 if (vmu_data.vmu_calc_thread == NULL) {
2075 2048
2076 2049 vmu_cache_t *cache;
2077 2050
2078 2051 vmu_data.vmu_calc_thread = curthread;
2079 2052 vmu_data.vmu_calc_flags = flags;
2080 2053 vmu_data.vmu_entities = NULL;
2081 2054 vmu_data.vmu_nentities = 0;
2082 2055 if (vmu_data.vmu_pending_waiters > 0)
2083 2056 vmu_data.vmu_calc_flags |=
2084 2057 vmu_data.vmu_pending_flags;
2085 2058
2086 2059 vmu_data.vmu_pending_flags = 0;
2087 2060 mutex_exit(&vmu_data.vmu_lock);
2088 2061 vmu_calculate();
2089 2062 mutex_enter(&vmu_data.vmu_lock);
2090 2063 /* copy results to cache */
2091 2064 if (vmu_data.vmu_cache != NULL)
2092 2065 vmu_cache_rele(vmu_data.vmu_cache);
2093 2066 cache = vmu_data.vmu_cache =
2094 2067 vmu_cache_alloc(vmu_data.vmu_nentities,
2095 2068 vmu_data.vmu_calc_flags);
2096 2069
2097 2070 result = cache->vmc_results;
2098 2071 for (entity = vmu_data.vmu_entities; entity != NULL;
2099 2072 entity = entity->vme_next) {
2100 2073 *result = entity->vme_result;
2101 2074 result++;
2102 2075 }
2103 2076 cache->vmc_timestamp = gethrtime();
2104 2077 vmu_cache_hold(cache);
2105 2078
2106 2079 vmu_data.vmu_calc_flags = 0;
2107 2080 vmu_data.vmu_calc_thread = NULL;
2108 2081
2109 2082 if (vmu_data.vmu_pending_waiters > 0)
2110 2083 cv_broadcast(&vmu_data.vmu_cv);
2111 2084
2112 2085 mutex_exit(&vmu_data.vmu_lock);
2113 2086
2114 2087 /* update zone's phys. mem. rctl usage */
2115 2088 vmu_update_zone_rctls(cache);
2116 2089 /* copy cache */
2117 2090 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
2118 2091 req_zone_id, cpflg);
2119 2092 mutex_enter(&vmu_data.vmu_lock);
2120 2093 vmu_cache_rele(cache);
2121 2094 mutex_exit(&vmu_data.vmu_lock);
2122 2095
2123 2096 return (ret);
2124 2097 }
2125 2098 vmu_data.vmu_pending_flags |= flags;
2126 2099 vmu_data.vmu_pending_waiters++;
2127 2100 while (vmu_data.vmu_calc_thread != NULL) {
2128 2101 if (cv_wait_sig(&vmu_data.vmu_cv,
2129 2102 &vmu_data.vmu_lock) == 0) {
2130 2103 vmu_data.vmu_pending_waiters--;
2131 2104 mutex_exit(&vmu_data.vmu_lock);
2132 2105 return (set_errno(EINTR));
2133 2106 }
2134 2107 }
2135 2108 vmu_data.vmu_pending_waiters--;
2136 2109 goto start;
2137 2110 }
2138 2111
2139 2112 #if defined(__x86)
2140 2113 /*
2141 2114 * Attempt to invalidate all of the pages in the mapping for the given process.
2142 2115 */
2143 2116 static void
2144 2117 map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size)
2145 2118 {
2146 2119 page_t *pp;
2147 2120 size_t psize;
2148 2121 u_offset_t off;
2149 2122 caddr_t eaddr;
2150 2123 struct vnode *vp;
2151 2124 struct segvn_data *svd;
2152 2125 struct hat *victim_hat;
2153 2126
2154 2127 ASSERT((addr + size) <= (seg->s_base + seg->s_size));
2155 2128
2156 2129 victim_hat = p->p_as->a_hat;
2157 2130 svd = (struct segvn_data *)seg->s_data;
2158 2131 vp = svd->vp;
2159 2132 psize = page_get_pagesize(seg->s_szc);
2160 2133
2161 2134 off = svd->offset + (uintptr_t)(addr - seg->s_base);
2162 2135
2163 2136 for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) {
2164 2137 pp = page_lookup_nowait(vp, off, SE_SHARED);
2165 2138
2166 2139 if (pp != NULL) {
2167 2140 /* following logic based on pvn_getdirty() */
2168 2141
2169 2142 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2170 2143 page_unlock(pp);
2171 2144 continue;
2172 2145 }
2173 2146
2174 2147 page_io_lock(pp);
2175 2148 hat_page_inval(pp, 0, victim_hat);
2176 2149 page_io_unlock(pp);
2177 2150
2178 2151 /*
2179 2152 * For B_INVALCURONLY-style handling we let
2180 2153 * page_release call VN_DISPOSE if no one else is using
2181 2154 * the page.
2182 2155 *
2183 2156 * A hat_ismod() check would be useless because:
2184 2157  * (1) we are not holding the SE_EXCL lock
2185 2158 * (2) we've not unloaded _all_ translations
2186 2159 *
2187 2160 * Let page_release() do the heavy-lifting.
2188 2161 */
2189 2162 (void) page_release(pp, 1);
2190 2163 }
2191 2164 }
2192 2165 }
2193 2166
2194 2167 /*
2195 2168 * vm_map_inval()
2196 2169 *
2197 2170 * Invalidate as many pages as possible within the given mapping for the given
2198 2171 * process. addr is expected to be the base address of the mapping and size is
2199 2172 * the length of the mapping. In some cases a mapping will encompass an
2200 2173 * entire segment, but at least for anon or stack mappings, these will be
2201 2174 * regions within a single large segment. Thus, the invalidation is oriented
2202 2175 * around a single mapping and not an entire segment.
2203 2176 *
2204 2177 * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so
2205 2178 * this code is only applicable to x86.
2206 2179 */
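/*
 * Editor's illustrative sketch (not part of this change): the calling
 * convention implied by the checks below.  "target_pid", "mapbase" and
 * "maplen" are placeholder names; mapbase must be the page-aligned base
 * of a vnode-backed mapping in the target process, and the caller must
 * be root in the global zone.
 *
 *	if (vm_map_inval(target_pid, mapbase, maplen) != 0) {
 *		set_errno() has already been called with EPERM, EINVAL,
 *		ESRCH, ENOMEM, or an error from the segment sync
 *	}
 */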
2207 2180 int
2208 2181 vm_map_inval(pid_t pid, caddr_t addr, size_t size)
2209 2182 {
2210 2183 int ret;
2211 2184 int error = 0;
2212 2185 proc_t *p; /* target proc */
2213 2186 struct as *as; /* target proc's address space */
2214 2187 struct seg *seg; /* working segment */
2215 2188
2216 2189 if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0)
2217 2190 return (set_errno(EPERM));
2218 2191
2219 2192 /* If not a valid mapping address, return an error */
2220 2193 if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr)
2221 2194 return (set_errno(EINVAL));
2222 2195
2223 2196 again:
2224 2197 mutex_enter(&pidlock);
2225 2198 p = prfind(pid);
2226 2199 if (p == NULL) {
2227 2200 mutex_exit(&pidlock);
2228 2201 return (set_errno(ESRCH));
2229 2202 }
2230 2203
2231 2204 mutex_enter(&p->p_lock);
2232 2205 mutex_exit(&pidlock);
2233 2206
2234 2207 if (panicstr != NULL) {
2235 2208 mutex_exit(&p->p_lock);
2236 2209 return (0);
2237 2210 }
2238 2211
2239 2212 as = p->p_as;
2240 2213
2241 2214 /*
2242 2215 * Try to set P_PR_LOCK - prevents process "changing shape"
2243 2216 * - blocks fork
2244 2217 * - blocks sigkill
2245 2218 * - cannot be a system proc
2246 2219 * - must be fully created proc
2247 2220 */
2248 2221 ret = sprtrylock_proc(p);
2249 2222 if (ret == -1) {
2250 2223 /* Process in invalid state */
2251 2224 mutex_exit(&p->p_lock);
2252 2225 return (set_errno(ESRCH));
2253 2226 }
2254 2227
2255 2228 if (ret == 1) {
2256 2229 /*
2257 2230 * P_PR_LOCK is already set. Wait and try again. This also
2258 2231 * drops p_lock so p may no longer be valid since the proc may
2259 2232 * have exited.
2260 2233 */
2261 2234 sprwaitlock_proc(p);
2262 2235 goto again;
2263 2236 }
2264 2237
2265 2238 /* P_PR_LOCK is now set */
2266 2239 mutex_exit(&p->p_lock);
2267 2240
2268 2241 AS_LOCK_ENTER(as, RW_READER);
2269 2242 if ((seg = as_segat(as, addr)) == NULL) {
2270 2243 AS_LOCK_EXIT(as);
2271 2244 mutex_enter(&p->p_lock);
2272 2245 sprunlock(p);
2273 2246 return (set_errno(ENOMEM));
2274 2247 }
2275 2248
2276 2249 /*
2277 2250 * The invalidation behavior only makes sense for vnode-backed segments.
2278 2251 */
2279 2252 if (seg->s_ops != &segvn_ops) {
2280 2253 AS_LOCK_EXIT(as);
2281 2254 mutex_enter(&p->p_lock);
2282 2255 sprunlock(p);
2283 2256 return (0);
2284 2257 }
2285 2258
2286 2259 /*
2287 2260  * If the mapping is out of bounds of the segment, return an error.
2288 2261 */
2289 2262 if ((addr + size) > (seg->s_base + seg->s_size)) {
2290 2263 AS_LOCK_EXIT(as);
2291 2264 mutex_enter(&p->p_lock);
2292 2265 sprunlock(p);
2293 2266 return (set_errno(EINVAL));
2294 2267 }
2295 2268
2296 2269 /*
2297 2270 * Don't use MS_INVALCURPROC flag here since that would eventually
2298 2271 * initiate hat invalidation based on curthread. Since we're doing this
2299 2272 * on behalf of a different process, that would erroneously invalidate
2300 2273 * our own process mappings.
2301 2274 */
2302 2275 error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC);
2303 2276 if (error == 0) {
2304 2277 /*
2305 2278 * Since we didn't invalidate during the sync above, we now
2306 2279 * try to invalidate all of the pages in the mapping.
2307 2280 */
2308 2281 map_inval(p, seg, addr, size);
2309 2282 }
2310 2283 AS_LOCK_EXIT(as);
2311 2284
2312 2285 mutex_enter(&p->p_lock);
2313 2286 sprunlock(p);
2314 2287
2315 2288 if (error)
2316 2289 (void) set_errno(error);
2317 2290 return (error);
2318 2291 }
2319 2292 #endif