OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
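Summary of the change in this file: as_fault() now throttles F_INVAL faults for processes in a zone that is over its physical memory cap (zone_pg_flt_delay != 0), accounting each throttle in zone_pf_throttle and zone_pf_throttle_usec so the user-land memory capper can catch up; <sys/ddi.h> is added, presumably for the drv_usecwait()/drv_usectohz() delay routines used on that path.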
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 29 /* All Rights Reserved */
30 30
31 31 /*
32 32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 33 * The Regents of the University of California
34 34 * All Rights Reserved
35 35 *
36 36 * University Acknowledgment- Portions of this document are derived from
37 37 * software developed by the University of California, Berkeley, and its
38 38 * contributors.
39 39 */
40 40
41 41 /*
42 42 * VM - address spaces.
43 43 */
44 44
45 45 #include <sys/types.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/errno.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/mman.h>
51 51 #include <sys/sysmacros.h>
52 52 #include <sys/cpuvar.h>
53 53 #include <sys/sysinfo.h>
54 54 #include <sys/kmem.h>
55 55 #include <sys/vnode.h>
56 56 #include <sys/vmsystm.h>
57 57 #include <sys/cmn_err.h>
58 58 #include <sys/debug.h>
59 59 #include <sys/tnf_probe.h>
60 60 #include <sys/vtrace.h>
61 +#include <sys/ddi.h>
61 62
62 63 #include <vm/hat.h>
63 64 #include <vm/as.h>
64 65 #include <vm/seg.h>
65 66 #include <vm/seg_vn.h>
66 67 #include <vm/seg_dev.h>
67 68 #include <vm/seg_kmem.h>
68 69 #include <vm/seg_map.h>
69 70 #include <vm/seg_spt.h>
70 71 #include <vm/page.h>
71 72
72 73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
73 74
74 75 static struct kmem_cache *as_cache;
75 76
76 77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 78 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 79 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
79 80
80 81
81 82 /*
82 83 * Verifying the segment lists is very time-consuming; it may not be
83 84 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
84 85 */
85 86 #ifdef DEBUG
86 87 #define VERIFY_SEGLIST
87 88 int do_as_verify = 0;
88 89 #endif
89 90
90 91 /*
91 92 * Allocate a new callback data structure entry and fill in the events of
92 93 * interest, the address range of interest, and the callback argument.
93 94 * Link the entry on the as->a_callbacks list. A callback entry for the
94 95 * entire address space may be specified with vaddr = 0 and size = -1.
95 96 *
96 97 * CALLERS RESPONSIBILITY: If not calling from within the process context for
97 98 * the specified as, the caller must guarantee persistence of the specified as
98 99 * for the duration of this function (eg. pages being locked within the as
99 100 * will guarantee persistence).
100 101 */
101 102 int
102 103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 104 caddr_t vaddr, size_t size, int sleepflag)
104 105 {
105 106 struct as_callback *current_head, *cb;
106 107 caddr_t saddr;
107 108 size_t rsize;
108 109
109 110 /* callback function and an event are mandatory */
110 111 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 112 return (EINVAL);
112 113
113 114 /* Adding a callback after as_free has been called is not allowed */
114 115 if (as == &kas)
115 116 return (ENOMEM);
116 117
117 118 /*
118 119 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 120 * is the entire address space so no rounding is done in that case.
120 121 */
121 122 if (size != -1) {
122 123 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 124 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 125 (size_t)saddr;
125 126 /* check for wraparound */
126 127 if (saddr + rsize < saddr)
127 128 return (ENOMEM);
128 129 } else {
129 130 if (vaddr != 0)
130 131 return (EINVAL);
131 132 saddr = vaddr;
132 133 rsize = size;
133 134 }
134 135
135 136 /* Allocate and initialize a callback entry */
136 137 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 138 if (cb == NULL)
138 139 return (EAGAIN);
139 140
140 141 cb->ascb_func = cb_func;
141 142 cb->ascb_arg = arg;
142 143 cb->ascb_events = events;
143 144 cb->ascb_saddr = saddr;
144 145 cb->ascb_len = rsize;
145 146
146 147 /* Add the entry to the list */
147 148 mutex_enter(&as->a_contents);
148 149 current_head = as->a_callbacks;
149 150 as->a_callbacks = cb;
150 151 cb->ascb_next = current_head;
151 152
152 153 /*
153 154 * The call to this function may lose in a race with
154 155 * a pertinent event - eg. a thread does long term memory locking
155 156 * but before the callback is added another thread executes as_unmap.
156 157 * A broadcast here resolves that.
157 158 */
158 159 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 160 AS_CLRUNMAPWAIT(as);
160 161 cv_broadcast(&as->a_cv);
161 162 }
162 163
163 164 mutex_exit(&as->a_contents);
164 165 return (0);
165 166 }
166 167
167 168 /*
168 169 * Search the callback list for an entry which pertains to arg.
169 170 *
170 171 * This is called from within the client upon completion of the callback.
171 172 * RETURN VALUES:
172 173 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 174 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 175 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 176 * entry will be made in as_do_callbacks)
176 177 *
177 178 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 179 * set, it indicates that as_do_callbacks is processing this entry. The
179 180 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 181 * to unblock as_do_callbacks, in case it is blocked.
181 182 *
182 183 * CALLERS RESPONSIBILITY: If not calling from within the process context for
183 184 * the specified as, the caller must guarantee persistence of the specified as
184 185 * for the duration of this function (eg. pages being locked within the as
185 186 * will guarantee persistence).
186 187 */
187 188 uint_t
188 189 as_delete_callback(struct as *as, void *arg)
189 190 {
190 191 struct as_callback **prevcb = &as->a_callbacks;
191 192 struct as_callback *cb;
192 193 uint_t rc = AS_CALLBACK_NOTFOUND;
193 194
194 195 mutex_enter(&as->a_contents);
195 196 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 197 if (cb->ascb_arg != arg)
197 198 continue;
198 199
199 200 /*
200 201 * If the events indicate AS_CALLBACK_CALLED, just clear
201 202 * AS_ALL_EVENT in the events field and wakeup the thread
202 203 * that may be waiting in as_do_callbacks. as_do_callbacks
203 204 * will take care of removing this entry from the list. In
204 205 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 206 * (AS_CALLBACK_CALLED not set), just remove it from the
206 207 * list, return the memory and return AS_CALLBACK_DELETED.
207 208 */
208 209 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 210 /* leave AS_CALLBACK_CALLED */
210 211 cb->ascb_events &= ~AS_ALL_EVENT;
211 212 rc = AS_CALLBACK_DELETE_DEFERRED;
212 213 cv_broadcast(&as->a_cv);
213 214 } else {
214 215 *prevcb = cb->ascb_next;
215 216 kmem_free(cb, sizeof (struct as_callback));
216 217 rc = AS_CALLBACK_DELETED;
217 218 }
218 219 break;
219 220 }
220 221 mutex_exit(&as->a_contents);
221 222 return (rc);
222 223 }
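For context, a minimal sketch of how a client (for example a driver holding long-term page locks) would use the callback interface above; my_unmap_cb, my_register, and my_state are hypothetical names, not part of this change:

#include <sys/kmem.h>
#include <vm/as.h>

/*
 * Hypothetical callback: invoked by as_do_callbacks() when an unmap
 * intersects the registered range.  Per the comment in
 * as_execute_callback(), the callback must delete its own entry once it
 * is safe for the unmapping thread to continue.
 */
static void
my_unmap_cb(struct as *as, void *arg, uint_t events)
{
	/* ... release this driver's long-term page locks for the range ... */
	(void) as_delete_callback(as, arg);
}

/*
 * Hypothetical registration path: KM_SLEEP is passed as sleepflag since
 * as_add_callback() allocates its entry with kmem_zalloc().
 */
static int
my_register(struct as *as, void *my_state, caddr_t vaddr, size_t size)
{
	return (as_add_callback(as, my_unmap_cb, my_state,
	    AS_UNMAP_EVENT, vaddr, size, KM_SLEEP));
}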
223 224
224 225 /*
225 226 * Searches the as callback list for a matching entry.
226 227 * Returns a pointer to the first matching callback, or NULL if
227 228 * nothing is found.
228 229 * This function never sleeps so it is ok to call it with more
229 230 * locks held but the (required) a_contents mutex.
230 231 *
231 232 * See also comment on as_do_callbacks below.
232 233 */
233 234 static struct as_callback *
234 235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 236 size_t event_len)
236 237 {
237 238 struct as_callback *cb;
238 239
239 240 ASSERT(MUTEX_HELD(&as->a_contents));
240 241 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241 242 /*
242 243 * If the callback has not already been called, then
243 244 * check if events or address range pertains. An event_len
244 245 * of zero means do an unconditional callback.
245 246 */
246 247 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 248 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 249 (event_addr + event_len < cb->ascb_saddr) ||
249 250 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 251 continue;
251 252 }
252 253 break;
253 254 }
254 255 return (cb);
255 256 }
256 257
257 258 /*
258 259 * Executes a given callback and removes it from the callback list for
259 260 * this address space.
260 261 * This function may sleep so the caller must drop all locks except
261 262 * a_contents before calling this func.
262 263 *
263 264 * See also comments on as_do_callbacks below.
264 265 */
265 266 static void
266 267 as_execute_callback(struct as *as, struct as_callback *cb,
267 268 uint_t events)
268 269 {
269 270 struct as_callback **prevcb;
270 271 void *cb_arg;
271 272
272 273 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 274 cb->ascb_events |= AS_CALLBACK_CALLED;
274 275 mutex_exit(&as->a_contents);
275 276 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 277 mutex_enter(&as->a_contents);
277 278 /*
278 279 * the callback function is required to delete the callback
279 280 * when the callback function determines it is OK for
280 281 * this thread to continue. as_delete_callback will clear
281 282 * the AS_ALL_EVENT in the events field when it is deleted.
282 283 * If the callback function called as_delete_callback,
283 284 * events will already be cleared and there will be no blocking.
284 285 */
285 286 while ((cb->ascb_events & events) != 0) {
286 287 cv_wait(&as->a_cv, &as->a_contents);
287 288 }
288 289 /*
289 290 * This entry needs to be taken off the list. Normally, the
290 291 * callback func itself does that, but unfortunately the list
291 292 * may have changed while the callback was running because the
292 293 * a_contents mutex was dropped and someone else other than the
293 294 * callback func itself could have called as_delete_callback,
294 295 * so we have to search to find this entry again. The entry
295 296 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
296 297 */
297 298 cb_arg = cb->ascb_arg;
298 299 prevcb = &as->a_callbacks;
299 300 for (cb = as->a_callbacks; cb != NULL;
300 301 prevcb = &cb->ascb_next, cb = *prevcb) {
301 302 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 303 (cb_arg != cb->ascb_arg)) {
303 304 continue;
304 305 }
305 306 *prevcb = cb->ascb_next;
306 307 kmem_free(cb, sizeof (struct as_callback));
307 308 break;
308 309 }
309 310 }
310 311
311 312 /*
312 313 * Check the callback list for a matching event and intersection of
313 314 * address range. If there is a match invoke the callback. Skip an entry if:
314 315 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 316 * - not event of interest
316 317 * - not address range of interest
317 318 *
318 319 * An event_len of zero indicates a request for an unconditional callback
319 320 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
320 321 * a_contents lock must be dropped before a callback, so only one callback
321 322 * can be done before returning. Return -1 (true) if a callback was
322 323 * executed and removed from the list, else return 0 (false).
323 324 *
324 325 * The logically separate parts, i.e. finding a matching callback and
325 326 * executing a given callback have been separated into two functions
326 327 * so that they can be called with different sets of locks held beyond
327 328 * the always-required a_contents. as_find_callback does not sleep so
328 329 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 330 * rwlock) are held. as_execute_callback on the other hand may sleep
330 331 * so all locks beyond a_contents must be dropped by the caller if one
331 332 * does not want to end comatose.
332 333 */
333 334 static int
334 335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 336 size_t event_len)
336 337 {
337 338 struct as_callback *cb;
338 339
339 340 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 341 as_execute_callback(as, cb, events);
341 342 return (-1);
342 343 }
343 344 return (0);
344 345 }
345 346
346 347 /*
347 348 * Search for the segment containing addr. If a segment containing addr
348 349 * exists, that segment is returned. If no such segment exists, and
349 350 * the list spans addresses greater than addr, then the first segment
350 351 * whose base is greater than addr is returned; otherwise, NULL is
351 352 * returned unless tail is true, in which case the last element of the
352 353 * list is returned.
353 354 *
354 355 * a_seglast is used to cache the last found segment for repeated
355 356 * searches to the same addr (which happens frequently).
356 357 */
357 358 struct seg *
358 359 as_findseg(struct as *as, caddr_t addr, int tail)
359 360 {
360 361 struct seg *seg = as->a_seglast;
361 362 avl_index_t where;
362 363
363 364 ASSERT(AS_LOCK_HELD(as));
364 365
365 366 if (seg != NULL &&
366 367 seg->s_base <= addr &&
367 368 addr < seg->s_base + seg->s_size)
368 369 return (seg);
369 370
370 371 seg = avl_find(&as->a_segtree, &addr, &where);
371 372 if (seg != NULL)
372 373 return (as->a_seglast = seg);
373 374
374 375 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 376 if (seg == NULL && tail)
376 377 seg = avl_last(&as->a_segtree);
377 378 return (as->a_seglast = seg);
378 379 }
379 380
380 381 #ifdef VERIFY_SEGLIST
381 382 /*
382 383 * verify that the linked list is coherent
383 384 */
384 385 static void
385 386 as_verify(struct as *as)
386 387 {
387 388 struct seg *seg, *seglast, *p, *n;
388 389 uint_t nsegs = 0;
389 390
390 391 if (do_as_verify == 0)
391 392 return;
392 393
393 394 seglast = as->a_seglast;
394 395
395 396 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 397 ASSERT(seg->s_as == as);
397 398 p = AS_SEGPREV(as, seg);
398 399 n = AS_SEGNEXT(as, seg);
399 400 ASSERT(p == NULL || p->s_as == as);
400 401 ASSERT(p == NULL || p->s_base < seg->s_base);
401 402 ASSERT(n == NULL || n->s_base > seg->s_base);
402 403 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 404 if (seg == seglast)
404 405 seglast = NULL;
405 406 nsegs++;
406 407 }
407 408 ASSERT(seglast == NULL);
408 409 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 410 }
410 411 #endif /* VERIFY_SEGLIST */
411 412
412 413 /*
413 414 * Add a new segment to the address space. The avl_find()
414 415 * may be expensive so we attempt to use last segment accessed
415 416 * in as_gap() as an insertion point.
416 417 */
417 418 int
418 419 as_addseg(struct as *as, struct seg *newseg)
419 420 {
420 421 struct seg *seg;
421 422 caddr_t addr;
422 423 caddr_t eaddr;
423 424 avl_index_t where;
424 425
425 426 ASSERT(AS_WRITE_HELD(as));
426 427
427 428 as->a_updatedir = 1; /* inform /proc */
428 429 gethrestime(&as->a_updatetime);
429 430
430 431 if (as->a_lastgaphl != NULL) {
431 432 struct seg *hseg = NULL;
432 433 struct seg *lseg = NULL;
433 434
434 435 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 436 hseg = as->a_lastgaphl;
436 437 lseg = AVL_PREV(&as->a_segtree, hseg);
437 438 } else {
438 439 lseg = as->a_lastgaphl;
439 440 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 441 }
441 442
442 443 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 444 hseg->s_base > newseg->s_base) {
444 445 avl_insert_here(&as->a_segtree, newseg, lseg,
445 446 AVL_AFTER);
446 447 as->a_lastgaphl = NULL;
447 448 as->a_seglast = newseg;
448 449 return (0);
449 450 }
450 451 as->a_lastgaphl = NULL;
451 452 }
452 453
453 454 addr = newseg->s_base;
454 455 eaddr = addr + newseg->s_size;
455 456 again:
456 457
457 458 seg = avl_find(&as->a_segtree, &addr, &where);
458 459
459 460 if (seg == NULL)
460 461 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461 462
462 463 if (seg == NULL)
463 464 seg = avl_last(&as->a_segtree);
464 465
465 466 if (seg != NULL) {
466 467 caddr_t base = seg->s_base;
467 468
468 469 /*
469 470 * If top of seg is below the requested address, then
470 471 * the insertion point is at the end of the linked list,
471 472 * and seg points to the tail of the list. Otherwise,
472 473 * the insertion point is immediately before seg.
473 474 */
474 475 if (base + seg->s_size > addr) {
475 476 if (addr >= base || eaddr > base) {
476 477 #ifdef __sparc
477 478 extern struct seg_ops segnf_ops;
478 479
479 480 /*
480 481 * no-fault segs must disappear if overlaid.
481 482 * XXX need new segment type so
482 483 * we don't have to check s_ops
483 484 */
484 485 if (seg->s_ops == &segnf_ops) {
485 486 seg_unmap(seg);
486 487 goto again;
487 488 }
488 489 #endif
489 490 return (-1); /* overlapping segment */
490 491 }
491 492 }
492 493 }
493 494 as->a_seglast = newseg;
494 495 avl_insert(&as->a_segtree, newseg, where);
495 496
496 497 #ifdef VERIFY_SEGLIST
497 498 as_verify(as);
498 499 #endif
499 500 return (0);
500 501 }
501 502
502 503 struct seg *
503 504 as_removeseg(struct as *as, struct seg *seg)
504 505 {
505 506 avl_tree_t *t;
506 507
507 508 ASSERT(AS_WRITE_HELD(as));
508 509
509 510 as->a_updatedir = 1; /* inform /proc */
510 511 gethrestime(&as->a_updatetime);
511 512
512 513 if (seg == NULL)
513 514 return (NULL);
514 515
515 516 t = &as->a_segtree;
516 517 if (as->a_seglast == seg)
517 518 as->a_seglast = NULL;
518 519 as->a_lastgaphl = NULL;
519 520
520 521 /*
521 522 * if this segment is at an address higher than
522 523 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 524 */
524 525 if (as->a_lastgap &&
525 526 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 527 as->a_lastgap = AVL_NEXT(t, seg);
527 528
528 529 /*
529 530 * remove the segment from the seg tree
530 531 */
531 532 avl_remove(t, seg);
532 533
533 534 #ifdef VERIFY_SEGLIST
534 535 as_verify(as);
535 536 #endif
536 537 return (seg);
537 538 }
538 539
539 540 /*
540 541 * Find a segment containing addr.
541 542 */
542 543 struct seg *
543 544 as_segat(struct as *as, caddr_t addr)
544 545 {
545 546 struct seg *seg = as->a_seglast;
546 547
547 548 ASSERT(AS_LOCK_HELD(as));
548 549
549 550 if (seg != NULL && seg->s_base <= addr &&
550 551 addr < seg->s_base + seg->s_size)
551 552 return (seg);
552 553
553 554 seg = avl_find(&as->a_segtree, &addr, NULL);
554 555 return (seg);
555 556 }
556 557
557 558 /*
558 559 * Serialize all searches for holes in an address space to
559 560 * prevent two or more threads from allocating the same virtual
560 561 * address range. The address space must not be "read/write"
561 562 * locked by the caller since we may block.
562 563 */
563 564 void
564 565 as_rangelock(struct as *as)
565 566 {
566 567 mutex_enter(&as->a_contents);
567 568 while (AS_ISCLAIMGAP(as))
568 569 cv_wait(&as->a_cv, &as->a_contents);
569 570 AS_SETCLAIMGAP(as);
570 571 mutex_exit(&as->a_contents);
571 572 }
572 573
573 574 /*
574 575 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
575 576 */
576 577 void
577 578 as_rangeunlock(struct as *as)
578 579 {
579 580 mutex_enter(&as->a_contents);
580 581 AS_CLRCLAIMGAP(as);
581 582 cv_signal(&as->a_cv);
582 583 mutex_exit(&as->a_contents);
583 584 }
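The caller pattern these two routines support looks roughly like the fragment below (an assumption based on the comment above, not code from this change; pick_unmapped_range stands in for whatever address selection the caller does, e.g. via as_gap()):

	/*
	 * Hold the range "claim" across the hole search and the map so
	 * another thread cannot pick the same virtual range; drop it once
	 * the mapping (or the failure) is final.
	 */
	as_rangelock(as);
	addr = pick_unmapped_range(as, len);	/* hypothetical helper */
	error = as_map(as, addr, len, segvn_create, &crargs);
	as_rangeunlock(as);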
584 585
585 586 /*
586 587 * compar segments (or just an address) by segment address range
587 588 */
588 589 static int
589 590 as_segcompar(const void *x, const void *y)
590 591 {
591 592 struct seg *a = (struct seg *)x;
592 593 struct seg *b = (struct seg *)y;
593 594
594 595 if (a->s_base < b->s_base)
595 596 return (-1);
596 597 if (a->s_base >= b->s_base + b->s_size)
597 598 return (1);
598 599 return (0);
599 600 }
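Only the key's s_base is read here, and 0 is returned whenever that base falls inside an existing segment, which is why as_segat() below can pass a bare address as the avl_find() key. A sketch of that idiom (assuming, as in vm/seg.h, that s_base is the first member of struct seg, so a pointer to a caddr_t aliases a minimal key segment):

	caddr_t addr = lookup_addr;	/* hypothetical address to look up */
	struct seg *seg;

	/* &addr acts as a struct seg whose s_base == addr; s_size is never read */
	seg = avl_find(&as->a_segtree, &addr, NULL);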
600 601
601 602
602 603 void
603 604 as_avlinit(struct as *as)
604 605 {
605 606 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606 607 offsetof(struct seg, s_tree));
607 608 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608 609 offsetof(struct watched_page, wp_link));
609 610 }
610 611
611 612 /*ARGSUSED*/
612 613 static int
613 614 as_constructor(void *buf, void *cdrarg, int kmflags)
614 615 {
615 616 struct as *as = buf;
616 617
617 618 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618 619 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619 620 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620 621 as_avlinit(as);
621 622 return (0);
622 623 }
623 624
624 625 /*ARGSUSED1*/
625 626 static void
626 627 as_destructor(void *buf, void *cdrarg)
627 628 {
628 629 struct as *as = buf;
629 630
630 631 avl_destroy(&as->a_segtree);
631 632 mutex_destroy(&as->a_contents);
632 633 cv_destroy(&as->a_cv);
633 634 rw_destroy(&as->a_lock);
634 635 }
635 636
636 637 void
637 638 as_init(void)
638 639 {
639 640 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640 641 as_constructor, as_destructor, NULL, NULL, NULL, 0);
641 642 }
642 643
643 644 /*
644 645 * Allocate and initialize an address space data structure.
645 646 * We call hat_alloc to allow any machine dependent
646 647 * information in the hat structure to be initialized.
647 648 */
648 649 struct as *
649 650 as_alloc(void)
650 651 {
651 652 struct as *as;
652 653
653 654 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654 655
655 656 as->a_flags = 0;
656 657 as->a_vbits = 0;
657 658 as->a_hrm = NULL;
658 659 as->a_seglast = NULL;
659 660 as->a_size = 0;
660 661 as->a_resvsize = 0;
661 662 as->a_updatedir = 0;
662 663 gethrestime(&as->a_updatetime);
663 664 as->a_objectdir = NULL;
664 665 as->a_sizedir = 0;
665 666 as->a_userlimit = (caddr_t)USERLIMIT;
666 667 as->a_lastgap = NULL;
667 668 as->a_lastgaphl = NULL;
668 669 as->a_callbacks = NULL;
669 670 as->a_proc = NULL;
670 671
671 672 AS_LOCK_ENTER(as, RW_WRITER);
672 673 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
673 674 AS_LOCK_EXIT(as);
674 675
675 676 return (as);
676 677 }
677 678
678 679 /*
679 680 * Free an address space data structure.
680 681 * Need to free the hat first and then
681 682 * all the segments on this as and finally
682 683 * the space for the as struct itself.
683 684 */
684 685 void
685 686 as_free(struct as *as)
686 687 {
687 688 struct hat *hat = as->a_hat;
688 689 struct seg *seg, *next;
689 690 boolean_t free_started = B_FALSE;
690 691
691 692 top:
692 693 /*
693 694 * Invoke ALL callbacks. as_do_callbacks will do one callback
694 695 * per call, and not return (-1) until the callback has completed.
695 696 * When as_do_callbacks returns zero, all callbacks have completed.
696 697 */
697 698 mutex_enter(&as->a_contents);
698 699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
699 700 ;
700 701
701 702 mutex_exit(&as->a_contents);
702 703 AS_LOCK_ENTER(as, RW_WRITER);
703 704
704 705 if (!free_started) {
705 706 free_started = B_TRUE;
706 707 hat_free_start(hat);
707 708 }
708 709 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
709 710 int err;
710 711
711 712 next = AS_SEGNEXT(as, seg);
712 713 retry:
713 714 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
714 715 if (err == EAGAIN) {
715 716 mutex_enter(&as->a_contents);
716 717 if (as->a_callbacks) {
717 718 AS_LOCK_EXIT(as);
718 719 } else if (!AS_ISNOUNMAPWAIT(as)) {
719 720 /*
720 721 * Memory is currently locked. Wait for a
721 722 * cv_signal that it has been unlocked, then
722 723 * try the operation again.
723 724 */
724 725 if (AS_ISUNMAPWAIT(as) == 0)
725 726 cv_broadcast(&as->a_cv);
726 727 AS_SETUNMAPWAIT(as);
727 728 AS_LOCK_EXIT(as);
728 729 while (AS_ISUNMAPWAIT(as))
729 730 cv_wait(&as->a_cv, &as->a_contents);
730 731 } else {
731 732 /*
732 733 * We may have raced with
733 734 * segvn_reclaim()/segspt_reclaim(). In this
734 735 * case clean nounmapwait flag and retry since
735 736 * softlockcnt in this segment may be already
736 737 * 0. We don't drop as writer lock so our
737 738 * number of retries without sleeping should
738 739 * be very small. See segvn_reclaim() for
739 740 * more comments.
740 741 */
741 742 AS_CLRNOUNMAPWAIT(as);
742 743 mutex_exit(&as->a_contents);
743 744 goto retry;
744 745 }
745 746 mutex_exit(&as->a_contents);
746 747 goto top;
747 748 } else {
748 749 /*
749 750 * We do not expect any other error return at this
750 751 * time. This is similar to an ASSERT in seg_unmap()
751 752 */
752 753 ASSERT(err == 0);
753 754 }
754 755 }
755 756 hat_free_end(hat);
756 757 AS_LOCK_EXIT(as);
757 758
758 759 /* /proc stuff */
759 760 ASSERT(avl_numnodes(&as->a_wpage) == 0);
760 761 if (as->a_objectdir) {
761 762 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
762 763 as->a_objectdir = NULL;
763 764 as->a_sizedir = 0;
764 765 }
765 766
766 767 /*
767 768 * Free the struct as back to kmem. Assert it has no segments.
768 769 */
769 770 ASSERT(avl_numnodes(&as->a_segtree) == 0);
770 771 kmem_cache_free(as_cache, as);
771 772 }
772 773
773 774 int
774 775 as_dup(struct as *as, struct proc *forkedproc)
775 776 {
776 777 struct as *newas;
777 778 struct seg *seg, *newseg;
778 779 size_t purgesize = 0;
779 780 int error;
780 781
781 782 AS_LOCK_ENTER(as, RW_WRITER);
782 783 as_clearwatch(as);
783 784 newas = as_alloc();
784 785 newas->a_userlimit = as->a_userlimit;
785 786 newas->a_proc = forkedproc;
786 787
787 788 AS_LOCK_ENTER(newas, RW_WRITER);
788 789
789 790 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
790 791
791 792 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
792 793
793 794 if (seg->s_flags & S_PURGE) {
794 795 purgesize += seg->s_size;
795 796 continue;
796 797 }
797 798
798 799 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
799 800 if (newseg == NULL) {
800 801 AS_LOCK_EXIT(newas);
801 802 as_setwatch(as);
802 803 AS_LOCK_EXIT(as);
803 804 as_free(newas);
804 805 return (-1);
805 806 }
806 807 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
807 808 /*
808 809 * We call seg_free() on the new seg
809 810 * because the segment is not set up
810 811 * completely; i.e. it has no ops.
811 812 */
812 813 as_setwatch(as);
813 814 AS_LOCK_EXIT(as);
814 815 seg_free(newseg);
815 816 AS_LOCK_EXIT(newas);
816 817 as_free(newas);
817 818 return (error);
818 819 }
819 820 newas->a_size += seg->s_size;
820 821 }
821 822 newas->a_resvsize = as->a_resvsize - purgesize;
822 823
823 824 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
824 825
825 826 AS_LOCK_EXIT(newas);
826 827
827 828 as_setwatch(as);
828 829 AS_LOCK_EXIT(as);
829 830 if (error != 0) {
830 831 as_free(newas);
831 832 return (error);
832 833 }
833 834 forkedproc->p_as = newas;
834 835 return (0);
835 836 }
836 837
837 838 /*
838 839 * Handle a ``fault'' at addr for size bytes.
839 840 */
840 841 faultcode_t
841 842 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
842 843 enum fault_type type, enum seg_rw rw)
843 844 {
844 845 struct seg *seg;
845 846 caddr_t raddr; /* rounded down addr */
846 847 size_t rsize; /* rounded up size */
847 848 size_t ssize;
848 849 faultcode_t res = 0;
849 850 caddr_t addrsav;
850 851 struct seg *segsav;
851 852 int as_lock_held;
852 853 klwp_t *lwp = ttolwp(curthread);
854 + zone_t *zonep = curzone;
853 855
854 -
855 -
856 856 retry:
857 857 /*
858 858 * Indicate that the lwp is not to be stopped while waiting for a
859 859 * pagefault. This is to avoid deadlock while debugging a process
860 860 * via /proc over NFS (in particular).
861 861 */
862 862 if (lwp != NULL)
863 863 lwp->lwp_nostop++;
864 864
865 865 /*
866 866 * same length must be used when we softlock and softunlock. We
867 867 * don't support softunlocking lengths less than the original length
868 868 * when there is largepage support. See seg_dev.c for more
869 869 * comments.
870 870 */
871 871 switch (type) {
872 872
873 873 case F_SOFTLOCK:
874 874 CPU_STATS_ADD_K(vm, softlock, 1);
875 875 break;
876 876
877 877 case F_SOFTUNLOCK:
878 878 break;
879 879
880 880 case F_PROT:
881 881 CPU_STATS_ADD_K(vm, prot_fault, 1);
882 882 break;
883 883
884 884 case F_INVAL:
885 885 CPU_STATS_ENTER_K();
886 886 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
887 887 if (as == &kas)
888 888 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
889 889 CPU_STATS_EXIT_K();
890 + if (zonep->zone_pg_flt_delay != 0) {
891 + /*
892 + * The zone in which this process is running
 893 + * is currently over its physical memory cap.
894 + * Throttle page faults to help the user-land
895 + * memory capper catch up. Note that
896 + * drv_usectohz() rounds up.
897 + */
898 + atomic_add_64(&zonep->zone_pf_throttle, 1);
899 + atomic_add_64(&zonep->zone_pf_throttle_usec,
900 + zonep->zone_pg_flt_delay);
901 + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1))
902 + drv_usecwait(zonep->zone_pg_flt_delay);
903 + else
904 + delay(drv_usectohz(zonep->zone_pg_flt_delay));
905 + }
890 906 break;
891 907 }
892 908
893 909 /* Kernel probe */
894 910 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
895 911 tnf_opaque, address, addr,
896 912 tnf_fault_type, fault_type, type,
897 913 tnf_seg_access, access, rw);
898 914
899 915 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
900 916 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
901 917 (size_t)raddr;
902 918
903 919 /*
904 920 * XXX -- Don't grab the as lock for segkmap. We should grab it for
905 921 * correctness, but then we could be stuck holding this lock for
906 922 * a LONG time if the fault needs to be resolved on a slow
907 923 * filesystem, and then no-one will be able to exec new commands,
908 924 * as exec'ing requires the write lock on the as.
909 925 */
910 926 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
911 927 raddr + size < segkmap->s_base + segkmap->s_size) {
912 928 seg = segkmap;
913 929 as_lock_held = 0;
914 930 } else {
915 931 AS_LOCK_ENTER(as, RW_READER);
916 932
917 933 seg = as_segat(as, raddr);
918 934 if (seg == NULL) {
919 935 AS_LOCK_EXIT(as);
920 936 if (lwp != NULL)
921 937 lwp->lwp_nostop--;
922 938 return (FC_NOMAP);
923 939 }
924 940
925 941 as_lock_held = 1;
926 942 }
927 943
928 944 addrsav = raddr;
929 945 segsav = seg;
930 946
931 947 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
932 948 if (raddr >= seg->s_base + seg->s_size) {
933 949 seg = AS_SEGNEXT(as, seg);
934 950 if (seg == NULL || raddr != seg->s_base) {
935 951 res = FC_NOMAP;
936 952 break;
937 953 }
938 954 }
939 955 if (raddr + rsize > seg->s_base + seg->s_size)
940 956 ssize = seg->s_base + seg->s_size - raddr;
941 957 else
942 958 ssize = rsize;
943 959
944 960 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
945 961 if (res != 0)
946 962 break;
947 963 }
948 964
949 965 /*
950 966 * If we were SOFTLOCKing and encountered a failure,
951 967 * we must SOFTUNLOCK the range we already did. (Maybe we
952 968 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
953 969 * right here...)
954 970 */
955 971 if (res != 0 && type == F_SOFTLOCK) {
956 972 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
957 973 if (addrsav >= seg->s_base + seg->s_size)
958 974 seg = AS_SEGNEXT(as, seg);
959 975 ASSERT(seg != NULL);
960 976 /*
961 977 * Now call the fault routine again to perform the
962 978 * unlock using S_OTHER instead of the rw variable
963 979 * since we never got a chance to touch the pages.
964 980 */
965 981 if (raddr > seg->s_base + seg->s_size)
966 982 ssize = seg->s_base + seg->s_size - addrsav;
967 983 else
968 984 ssize = raddr - addrsav;
969 985 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
970 986 F_SOFTUNLOCK, S_OTHER);
971 987 }
972 988 }
973 989 if (as_lock_held)
974 990 AS_LOCK_EXIT(as);
975 991 if (lwp != NULL)
976 992 lwp->lwp_nostop--;
977 993
978 994 /*
979 995 * If the lower levels returned EDEADLK for a fault,
980 996 * It means that we should retry the fault. Let's wait
981 997 * a bit also to let the deadlock causing condition clear.
982 998 * This is part of a gross hack to work around a design flaw
983 999 * in the ufs/sds logging code and should go away when the
984 1000 * logging code is re-designed to fix the problem. See bug
985 1001 * 4125102 for details of the problem.
986 1002 */
987 1003 if (FC_ERRNO(res) == EDEADLK) {
988 1004 delay(deadlk_wait);
989 1005 res = 0;
990 1006 goto retry;
991 1007 }
992 1008 return (res);
993 1009 }
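The throttle added above chooses between two delay mechanisms: sub-tick delays are busy-waited with drv_usecwait(), while anything of a tick or more sleeps via delay(drv_usectohz(...)), which rounds up to whole ticks. A small user-space model of that choice, assuming the default hz = 100 (one tick = 10000 usec); illustration only, not kernel code:

#include <stdio.h>

#define	HZ		100			/* assumed clock rate */
#define	TICK_USEC	(1000000 / HZ)		/* TICK_TO_USEC(1) at 100Hz */

int
main(void)
{
	unsigned long delays[] = { 500, 9999, 10000, 12500, 100000 };
	unsigned int i;

	for (i = 0; i < sizeof (delays) / sizeof (delays[0]); i++) {
		unsigned long us = delays[i];

		if (us < TICK_USEC) {
			/* sub-tick: as_fault() busy-waits (drv_usecwait) */
			printf("%lu usec -> busy-wait\n", us);
		} else {
			/* >= one tick: delay(drv_usectohz(us)), rounded up */
			unsigned long ticks = (us + TICK_USEC - 1) / TICK_USEC;
			printf("%lu usec -> sleep %lu tick(s)\n", us, ticks);
		}
	}
	return (0);
}

Busy-waiting for sub-tick delays avoids losing the whole-tick granularity of delay() for small throttles, at the cost of briefly spinning in the fault path.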
994 1010
995 1011
996 1012
997 1013 /*
998 1014 * Asynchronous ``fault'' at addr for size bytes.
999 1015 */
1000 1016 faultcode_t
1001 1017 as_faulta(struct as *as, caddr_t addr, size_t size)
1002 1018 {
1003 1019 struct seg *seg;
1004 1020 caddr_t raddr; /* rounded down addr */
1005 1021 size_t rsize; /* rounded up size */
1006 1022 faultcode_t res = 0;
1007 1023 klwp_t *lwp = ttolwp(curthread);
1008 1024
1009 1025 retry:
1010 1026 /*
1011 1027 * Indicate that the lwp is not to be stopped while waiting
1012 1028 * for a pagefault. This is to avoid deadlock while debugging
1013 1029 * a process via /proc over NFS (in particular).
1014 1030 */
1015 1031 if (lwp != NULL)
1016 1032 lwp->lwp_nostop++;
1017 1033
1018 1034 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1019 1035 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1020 1036 (size_t)raddr;
1021 1037
1022 1038 AS_LOCK_ENTER(as, RW_READER);
1023 1039 seg = as_segat(as, raddr);
1024 1040 if (seg == NULL) {
1025 1041 AS_LOCK_EXIT(as);
1026 1042 if (lwp != NULL)
1027 1043 lwp->lwp_nostop--;
1028 1044 return (FC_NOMAP);
1029 1045 }
1030 1046
1031 1047 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1032 1048 if (raddr >= seg->s_base + seg->s_size) {
1033 1049 seg = AS_SEGNEXT(as, seg);
1034 1050 if (seg == NULL || raddr != seg->s_base) {
1035 1051 res = FC_NOMAP;
1036 1052 break;
1037 1053 }
1038 1054 }
1039 1055 res = SEGOP_FAULTA(seg, raddr);
1040 1056 if (res != 0)
1041 1057 break;
1042 1058 }
1043 1059 AS_LOCK_EXIT(as);
1044 1060 if (lwp != NULL)
1045 1061 lwp->lwp_nostop--;
1046 1062 /*
1047 1063 * If the lower levels returned EDEADLK for a fault,
1048 1064 * It means that we should retry the fault. Let's wait
1049 1065 * a bit also to let the deadlock causing condition clear.
1050 1066 * This is part of a gross hack to work around a design flaw
1051 1067 * in the ufs/sds logging code and should go away when the
1052 1068 * logging code is re-designed to fix the problem. See bug
1053 1069 * 4125102 for details of the problem.
1054 1070 */
1055 1071 if (FC_ERRNO(res) == EDEADLK) {
1056 1072 delay(deadlk_wait);
1057 1073 res = 0;
1058 1074 goto retry;
1059 1075 }
1060 1076 return (res);
1061 1077 }
1062 1078
1063 1079 /*
1064 1080 * Set the virtual mapping for the interval from [addr : addr + size)
1065 1081 * in address space `as' to have the specified protection.
1066 1082 * It is ok for the range to cross over several segments,
1067 1083 * as long as they are contiguous.
1068 1084 */
1069 1085 int
1070 1086 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1071 1087 {
1072 1088 struct seg *seg;
1073 1089 struct as_callback *cb;
1074 1090 size_t ssize;
1075 1091 caddr_t raddr; /* rounded down addr */
1076 1092 size_t rsize; /* rounded up size */
1077 1093 int error = 0, writer = 0;
1078 1094 caddr_t saveraddr;
1079 1095 size_t saversize;
1080 1096
1081 1097 setprot_top:
1082 1098 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1083 1099 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1084 1100 (size_t)raddr;
1085 1101
1086 1102 if (raddr + rsize < raddr) /* check for wraparound */
1087 1103 return (ENOMEM);
1088 1104
1089 1105 saveraddr = raddr;
1090 1106 saversize = rsize;
1091 1107
1092 1108 /*
1093 1109 * Normally we only lock the as as a reader. But
1094 1110 * if due to setprot the segment driver needs to split
1095 1111 * a segment it will return IE_RETRY. Therefore we re-acquire
1096 1112 * the as lock as a writer so the segment driver can change
1097 1113 * the seg list. Also the segment driver will return IE_RETRY
1098 1114 * after it has changed the segment list so we therefore keep
1099 1115 * locking as a writer. Since these opeartions should be rare
1100 1116 * want to only lock as a writer when necessary.
1101 1117 */
1102 1118 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1103 1119 AS_LOCK_ENTER(as, RW_WRITER);
1104 1120 } else {
1105 1121 AS_LOCK_ENTER(as, RW_READER);
1106 1122 }
1107 1123
1108 1124 as_clearwatchprot(as, raddr, rsize);
1109 1125 seg = as_segat(as, raddr);
1110 1126 if (seg == NULL) {
1111 1127 as_setwatch(as);
1112 1128 AS_LOCK_EXIT(as);
1113 1129 return (ENOMEM);
1114 1130 }
1115 1131
1116 1132 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1117 1133 if (raddr >= seg->s_base + seg->s_size) {
1118 1134 seg = AS_SEGNEXT(as, seg);
1119 1135 if (seg == NULL || raddr != seg->s_base) {
1120 1136 error = ENOMEM;
1121 1137 break;
1122 1138 }
1123 1139 }
1124 1140 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1125 1141 ssize = seg->s_base + seg->s_size - raddr;
1126 1142 else
1127 1143 ssize = rsize;
1128 1144 retry:
1129 1145 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1130 1146
1131 1147 if (error == IE_NOMEM) {
1132 1148 error = EAGAIN;
1133 1149 break;
1134 1150 }
1135 1151
1136 1152 if (error == IE_RETRY) {
1137 1153 AS_LOCK_EXIT(as);
1138 1154 writer = 1;
1139 1155 goto setprot_top;
1140 1156 }
1141 1157
1142 1158 if (error == EAGAIN) {
1143 1159 /*
1144 1160 * Make sure we have a_lock as writer.
1145 1161 */
1146 1162 if (writer == 0) {
1147 1163 AS_LOCK_EXIT(as);
1148 1164 writer = 1;
1149 1165 goto setprot_top;
1150 1166 }
1151 1167
1152 1168 /*
1153 1169 * Memory is currently locked. It must be unlocked
1154 1170 * before this operation can succeed through a retry.
1155 1171 * The possible reasons for locked memory and
1156 1172 * corresponding strategies for unlocking are:
1157 1173 * (1) Normal I/O
1158 1174 * wait for a signal that the I/O operation
1159 1175 * has completed and the memory is unlocked.
1160 1176 * (2) Asynchronous I/O
1161 1177 * The aio subsystem does not unlock pages when
1162 1178 * the I/O is completed. Those pages are unlocked
1163 1179 * when the application calls aiowait/aioerror.
1164 1180 * So, to prevent blocking forever, cv_broadcast()
1165 1181 * is done to wake up aio_cleanup_thread.
1166 1182 * Subsequently, segvn_reclaim will be called, and
1167 1183 * that will do AS_CLRUNMAPWAIT() and wake us up.
1168 1184 * (3) Long term page locking:
1169 1185 * Drivers intending to have pages locked for a
1170 1186 * period considerably longer than for normal I/O
1171 1187 * (essentially forever) may have registered for a
1172 1188 * callback so they may unlock these pages on
1173 1189 * request. This is needed to allow this operation
1174 1190 * to succeed. Each entry on the callback list is
1175 1191 * examined. If the event or address range pertains
1176 1192 * the callback is invoked (unless it already is in
1177 1193 * progress). The a_contents lock must be dropped
1178 1194 * before the callback, so only one callback can
1179 1195 * be done at a time. Go to the top and do more
1180 1196 * until zero is returned. If zero is returned,
1181 1197 * either there were no callbacks for this event
1182 1198 * or they were already in progress.
1183 1199 */
1184 1200 mutex_enter(&as->a_contents);
1185 1201 if (as->a_callbacks &&
1186 1202 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1187 1203 seg->s_base, seg->s_size))) {
1188 1204 AS_LOCK_EXIT(as);
1189 1205 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1190 1206 } else if (!AS_ISNOUNMAPWAIT(as)) {
1191 1207 if (AS_ISUNMAPWAIT(as) == 0)
1192 1208 cv_broadcast(&as->a_cv);
1193 1209 AS_SETUNMAPWAIT(as);
1194 1210 AS_LOCK_EXIT(as);
1195 1211 while (AS_ISUNMAPWAIT(as))
1196 1212 cv_wait(&as->a_cv, &as->a_contents);
1197 1213 } else {
1198 1214 /*
1199 1215 * We may have raced with
1200 1216 * segvn_reclaim()/segspt_reclaim(). In this
1201 1217 * case clean nounmapwait flag and retry since
1202 1218 * softlockcnt in this segment may be already
1203 1219 * 0. We don't drop as writer lock so our
1204 1220 * number of retries without sleeping should
1205 1221 * be very small. See segvn_reclaim() for
1206 1222 * more comments.
1207 1223 */
1208 1224 AS_CLRNOUNMAPWAIT(as);
1209 1225 mutex_exit(&as->a_contents);
1210 1226 goto retry;
1211 1227 }
1212 1228 mutex_exit(&as->a_contents);
1213 1229 goto setprot_top;
1214 1230 } else if (error != 0)
1215 1231 break;
1216 1232 }
1217 1233 if (error != 0) {
1218 1234 as_setwatch(as);
1219 1235 } else {
1220 1236 as_setwatchprot(as, saveraddr, saversize, prot);
1221 1237 }
1222 1238 AS_LOCK_EXIT(as);
1223 1239 return (error);
1224 1240 }
1225 1241
1226 1242 /*
1227 1243 * Check to make sure that the interval [addr, addr + size)
1228 1244 * in address space `as' has at least the specified protection.
1229 1245 * It is ok for the range to cross over several segments, as long
1230 1246 * as they are contiguous.
1231 1247 */
1232 1248 int
1233 1249 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1234 1250 {
1235 1251 struct seg *seg;
1236 1252 size_t ssize;
1237 1253 caddr_t raddr; /* rounded down addr */
1238 1254 size_t rsize; /* rounded up size */
1239 1255 int error = 0;
1240 1256
1241 1257 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1242 1258 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1243 1259 (size_t)raddr;
1244 1260
1245 1261 if (raddr + rsize < raddr) /* check for wraparound */
1246 1262 return (ENOMEM);
1247 1263
1248 1264 /*
1249 1265 * This is ugly as sin...
1250 1266 * Normally, we only acquire the address space readers lock.
1251 1267 * However, if the address space has watchpoints present,
1252 1268 * we must acquire the writer lock on the address space for
1253 1269 * the benefit of as_clearwatchprot() and as_setwatchprot().
1254 1270 */
1255 1271 if (avl_numnodes(&as->a_wpage) != 0)
1256 1272 AS_LOCK_ENTER(as, RW_WRITER);
1257 1273 else
1258 1274 AS_LOCK_ENTER(as, RW_READER);
1259 1275 as_clearwatchprot(as, raddr, rsize);
1260 1276 seg = as_segat(as, raddr);
1261 1277 if (seg == NULL) {
1262 1278 as_setwatch(as);
1263 1279 AS_LOCK_EXIT(as);
1264 1280 return (ENOMEM);
1265 1281 }
1266 1282
1267 1283 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1268 1284 if (raddr >= seg->s_base + seg->s_size) {
1269 1285 seg = AS_SEGNEXT(as, seg);
1270 1286 if (seg == NULL || raddr != seg->s_base) {
1271 1287 error = ENOMEM;
1272 1288 break;
1273 1289 }
1274 1290 }
1275 1291 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1276 1292 ssize = seg->s_base + seg->s_size - raddr;
1277 1293 else
1278 1294 ssize = rsize;
1279 1295
1280 1296 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1281 1297 if (error != 0)
1282 1298 break;
1283 1299 }
1284 1300 as_setwatch(as);
1285 1301 AS_LOCK_EXIT(as);
1286 1302 return (error);
1287 1303 }
1288 1304
1289 1305 int
1290 1306 as_unmap(struct as *as, caddr_t addr, size_t size)
1291 1307 {
1292 1308 struct seg *seg, *seg_next;
1293 1309 struct as_callback *cb;
1294 1310 caddr_t raddr, eaddr;
1295 1311 size_t ssize, rsize = 0;
1296 1312 int err;
1297 1313
1298 1314 top:
1299 1315 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1300 1316 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1301 1317 (uintptr_t)PAGEMASK);
1302 1318
1303 1319 AS_LOCK_ENTER(as, RW_WRITER);
1304 1320
1305 1321 as->a_updatedir = 1; /* inform /proc */
1306 1322 gethrestime(&as->a_updatetime);
1307 1323
1308 1324 /*
1309 1325 * Use as_findseg to find the first segment in the range, then
1310 1326 * step through the segments in order, following s_next.
1311 1327 */
1312 1328 as_clearwatchprot(as, raddr, eaddr - raddr);
1313 1329
1314 1330 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1315 1331 if (eaddr <= seg->s_base)
1316 1332 break; /* eaddr was in a gap; all done */
1317 1333
1318 1334 /* this is implied by the test above */
1319 1335 ASSERT(raddr < eaddr);
1320 1336
1321 1337 if (raddr < seg->s_base)
1322 1338 raddr = seg->s_base; /* raddr was in a gap */
1323 1339
1324 1340 if (eaddr > (seg->s_base + seg->s_size))
1325 1341 ssize = seg->s_base + seg->s_size - raddr;
1326 1342 else
1327 1343 ssize = eaddr - raddr;
1328 1344
1329 1345 /*
1330 1346 * Save next segment pointer since seg can be
1331 1347 * destroyed during the segment unmap operation.
1332 1348 */
1333 1349 seg_next = AS_SEGNEXT(as, seg);
1334 1350
1335 1351 /*
1336 1352 * We didn't count /dev/null mappings, so ignore them here.
1337 1353 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1338 1354 * we have to do this check here while we have seg.)
1339 1355 */
1340 1356 rsize = 0;
1341 1357 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1342 1358 !SEG_IS_PARTIAL_RESV(seg))
1343 1359 rsize = ssize;
1344 1360
1345 1361 retry:
1346 1362 err = SEGOP_UNMAP(seg, raddr, ssize);
1347 1363 if (err == EAGAIN) {
1348 1364 /*
1349 1365 * Memory is currently locked. It must be unlocked
1350 1366 * before this operation can succeed through a retry.
1351 1367 * The possible reasons for locked memory and
1352 1368 * corresponding strategies for unlocking are:
1353 1369 * (1) Normal I/O
1354 1370 * wait for a signal that the I/O operation
1355 1371 * has completed and the memory is unlocked.
1356 1372 * (2) Asynchronous I/O
1357 1373 * The aio subsystem does not unlock pages when
1358 1374 * the I/O is completed. Those pages are unlocked
1359 1375 * when the application calls aiowait/aioerror.
1360 1376 * So, to prevent blocking forever, cv_broadcast()
1361 1377 * is done to wake up aio_cleanup_thread.
1362 1378 * Subsequently, segvn_reclaim will be called, and
1363 1379 * that will do AS_CLRUNMAPWAIT() and wake us up.
1364 1380 * (3) Long term page locking:
1365 1381 * Drivers intending to have pages locked for a
1366 1382 * period considerably longer than for normal I/O
1367 1383 * (essentially forever) may have registered for a
1368 1384 * callback so they may unlock these pages on
1369 1385 * request. This is needed to allow this operation
1370 1386 * to succeed. Each entry on the callback list is
1371 1387 * examined. If the event or address range pertains
1372 1388 * the callback is invoked (unless it already is in
1373 1389 * progress). The a_contents lock must be dropped
1374 1390 * before the callback, so only one callback can
1375 1391 * be done at a time. Go to the top and do more
1376 1392 * until zero is returned. If zero is returned,
1377 1393 * either there were no callbacks for this event
1378 1394 * or they were already in progress.
1379 1395 */
1380 1396 mutex_enter(&as->a_contents);
1381 1397 if (as->a_callbacks &&
1382 1398 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1383 1399 seg->s_base, seg->s_size))) {
1384 1400 AS_LOCK_EXIT(as);
1385 1401 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1386 1402 } else if (!AS_ISNOUNMAPWAIT(as)) {
1387 1403 if (AS_ISUNMAPWAIT(as) == 0)
1388 1404 cv_broadcast(&as->a_cv);
1389 1405 AS_SETUNMAPWAIT(as);
1390 1406 AS_LOCK_EXIT(as);
1391 1407 while (AS_ISUNMAPWAIT(as))
1392 1408 cv_wait(&as->a_cv, &as->a_contents);
1393 1409 } else {
1394 1410 /*
1395 1411 * We may have raced with
1396 1412 * segvn_reclaim()/segspt_reclaim(). In this
1397 1413 * case clean nounmapwait flag and retry since
1398 1414 * softlockcnt in this segment may be already
1399 1415 * 0. We don't drop as writer lock so our
1400 1416 * number of retries without sleeping should
1401 1417 * be very small. See segvn_reclaim() for
1402 1418 * more comments.
1403 1419 */
1404 1420 AS_CLRNOUNMAPWAIT(as);
1405 1421 mutex_exit(&as->a_contents);
1406 1422 goto retry;
1407 1423 }
1408 1424 mutex_exit(&as->a_contents);
1409 1425 goto top;
1410 1426 } else if (err == IE_RETRY) {
1411 1427 AS_LOCK_EXIT(as);
1412 1428 goto top;
1413 1429 } else if (err) {
1414 1430 as_setwatch(as);
1415 1431 AS_LOCK_EXIT(as);
1416 1432 return (-1);
1417 1433 }
1418 1434
1419 1435 as->a_size -= ssize;
1420 1436 if (rsize)
1421 1437 as->a_resvsize -= rsize;
1422 1438 raddr += ssize;
1423 1439 }
1424 1440 AS_LOCK_EXIT(as);
1425 1441 return (0);
1426 1442 }
1427 1443
1428 1444 static int
1429 1445 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1430 1446 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1431 1447 {
1432 1448 uint_t szc;
1433 1449 uint_t nszc;
1434 1450 int error;
1435 1451 caddr_t a;
1436 1452 caddr_t eaddr;
1437 1453 size_t segsize;
1438 1454 struct seg *seg;
1439 1455 size_t pgsz;
1440 1456 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1441 1457 uint_t save_szcvec;
1442 1458
1443 1459 ASSERT(AS_WRITE_HELD(as));
1444 1460 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1445 1461 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1446 1462 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1447 1463 if (!do_off) {
1448 1464 vn_a->offset = 0;
1449 1465 }
1450 1466
1451 1467 if (szcvec <= 1) {
1452 1468 seg = seg_alloc(as, addr, size);
1453 1469 if (seg == NULL) {
1454 1470 return (ENOMEM);
1455 1471 }
1456 1472 vn_a->szc = 0;
1457 1473 error = (*crfp)(seg, vn_a);
1458 1474 if (error != 0) {
1459 1475 seg_free(seg);
1460 1476 } else {
1461 1477 as->a_size += size;
1462 1478 as->a_resvsize += size;
1463 1479 }
1464 1480 return (error);
1465 1481 }
1466 1482
1467 1483 eaddr = addr + size;
1468 1484 save_szcvec = szcvec;
1469 1485 szcvec >>= 1;
1470 1486 szc = 0;
1471 1487 nszc = 0;
1472 1488 while (szcvec) {
1473 1489 if ((szcvec & 0x1) == 0) {
1474 1490 nszc++;
1475 1491 szcvec >>= 1;
1476 1492 continue;
1477 1493 }
1478 1494 nszc++;
1479 1495 pgsz = page_get_pagesize(nszc);
1480 1496 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1481 1497 if (a != addr) {
1482 1498 ASSERT(a < eaddr);
1483 1499 segsize = a - addr;
1484 1500 seg = seg_alloc(as, addr, segsize);
1485 1501 if (seg == NULL) {
1486 1502 return (ENOMEM);
1487 1503 }
1488 1504 vn_a->szc = szc;
1489 1505 error = (*crfp)(seg, vn_a);
1490 1506 if (error != 0) {
1491 1507 seg_free(seg);
1492 1508 return (error);
1493 1509 }
1494 1510 as->a_size += segsize;
1495 1511 as->a_resvsize += segsize;
1496 1512 *segcreated = 1;
1497 1513 if (do_off) {
1498 1514 vn_a->offset += segsize;
1499 1515 }
1500 1516 addr = a;
1501 1517 }
1502 1518 szc = nszc;
1503 1519 szcvec >>= 1;
1504 1520 }
1505 1521
1506 1522 ASSERT(addr < eaddr);
1507 1523 szcvec = save_szcvec | 1; /* add 8K pages */
1508 1524 while (szcvec) {
1509 1525 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1510 1526 ASSERT(a >= addr);
1511 1527 if (a != addr) {
1512 1528 segsize = a - addr;
1513 1529 seg = seg_alloc(as, addr, segsize);
1514 1530 if (seg == NULL) {
1515 1531 return (ENOMEM);
1516 1532 }
1517 1533 vn_a->szc = szc;
1518 1534 error = (*crfp)(seg, vn_a);
1519 1535 if (error != 0) {
1520 1536 seg_free(seg);
1521 1537 return (error);
1522 1538 }
1523 1539 as->a_size += segsize;
1524 1540 as->a_resvsize += segsize;
1525 1541 *segcreated = 1;
1526 1542 if (do_off) {
1527 1543 vn_a->offset += segsize;
1528 1544 }
1529 1545 addr = a;
1530 1546 }
1531 1547 szcvec &= ~(1 << szc);
1532 1548 if (szcvec) {
1533 1549 szc = highbit(szcvec) - 1;
1534 1550 pgsz = page_get_pagesize(szc);
1535 1551 }
1536 1552 }
1537 1553 ASSERT(addr == eaddr);
1538 1554
1539 1555 return (0);
1540 1556 }
1541 1557
1542 1558 static int
1543 1559 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1544 1560 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1545 1561 {
1546 1562 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1547 1563 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1548 1564 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1549 1565 type, 0);
1550 1566 int error;
1551 1567 struct seg *seg;
1552 1568 struct vattr va;
1553 1569 u_offset_t eoff;
1554 1570 size_t save_size = 0;
1555 1571 extern size_t textrepl_size_thresh;
1556 1572
1557 1573 ASSERT(AS_WRITE_HELD(as));
1558 1574 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1559 1575 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1560 1576 ASSERT(vn_a->vp != NULL);
1561 1577 ASSERT(vn_a->amp == NULL);
1562 1578
1563 1579 again:
1564 1580 if (szcvec <= 1) {
1565 1581 seg = seg_alloc(as, addr, size);
1566 1582 if (seg == NULL) {
1567 1583 return (ENOMEM);
1568 1584 }
1569 1585 vn_a->szc = 0;
1570 1586 error = (*crfp)(seg, vn_a);
1571 1587 if (error != 0) {
1572 1588 seg_free(seg);
1573 1589 } else {
1574 1590 as->a_size += size;
1575 1591 as->a_resvsize += size;
1576 1592 }
1577 1593 return (error);
1578 1594 }
1579 1595
1580 1596 va.va_mask = AT_SIZE;
1581 1597 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1582 1598 szcvec = 0;
1583 1599 goto again;
1584 1600 }
1585 1601 eoff = vn_a->offset & PAGEMASK;
1586 1602 if (eoff >= va.va_size) {
1587 1603 szcvec = 0;
1588 1604 goto again;
1589 1605 }
1590 1606 eoff += size;
1591 1607 if (btopr(va.va_size) < btopr(eoff)) {
1592 1608 save_size = size;
1593 1609 size = va.va_size - (vn_a->offset & PAGEMASK);
1594 1610 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1595 1611 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1596 1612 type, 0);
1597 1613 if (szcvec <= 1) {
1598 1614 size = save_size;
1599 1615 goto again;
1600 1616 }
1601 1617 }
1602 1618
1603 1619 if (size > textrepl_size_thresh) {
1604 1620 vn_a->flags |= _MAP_TEXTREPL;
1605 1621 }
1606 1622 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1607 1623 segcreated);
1608 1624 if (error != 0) {
1609 1625 return (error);
1610 1626 }
1611 1627 if (save_size) {
1612 1628 addr += size;
1613 1629 size = save_size - size;
1614 1630 szcvec = 0;
1615 1631 goto again;
1616 1632 }
1617 1633 return (0);
1618 1634 }
1619 1635
1620 1636 /*
1621 1637 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1622 1638 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1623 1639 */
1624 1640 static int
1625 1641 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1626 1642 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1627 1643 {
1628 1644 uint_t szcvec;
1629 1645 uchar_t type;
1630 1646
1631 1647 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1632 1648 if (vn_a->type == MAP_SHARED) {
1633 1649 type = MAPPGSZC_SHM;
1634 1650 } else if (vn_a->type == MAP_PRIVATE) {
1635 1651 if (vn_a->szc == AS_MAP_HEAP) {
1636 1652 type = MAPPGSZC_HEAP;
1637 1653 } else if (vn_a->szc == AS_MAP_STACK) {
1638 1654 type = MAPPGSZC_STACK;
1639 1655 } else {
1640 1656 type = MAPPGSZC_PRIVM;
1641 1657 }
1642 1658 }
1643 1659 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1644 1660 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1645 1661 (vn_a->flags & MAP_TEXT), type, 0);
1646 1662 ASSERT(AS_WRITE_HELD(as));
1647 1663 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 1664 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 1665 ASSERT(vn_a->vp == NULL);
1650 1666
1651 1667 return (as_map_segvn_segs(as, addr, size, szcvec,
1652 1668 crfp, vn_a, segcreated));
1653 1669 }
1654 1670
1655 1671 int
1656 1672 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1657 1673 {
1658 1674 AS_LOCK_ENTER(as, RW_WRITER);
1659 1675 return (as_map_locked(as, addr, size, crfp, argsp));
1660 1676 }
1661 1677
1662 1678 int
1663 1679 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1664 1680 void *argsp)
1665 1681 {
1666 1682 struct seg *seg = NULL;
1667 1683 caddr_t raddr; /* rounded down addr */
1668 1684 size_t rsize; /* rounded up size */
1669 1685 int error;
1670 1686 int unmap = 0;
1671 1687 /*
1672 1688 * The use of a_proc is preferred to handle the case where curproc is
1673 1689 * a door_call server and is allocating memory in the client's (a_proc)
1674 1690 * address space.
1675 1691 * When creating a shared memory segment a_proc will be NULL so we
1676 1692	 * When creating a shared memory segment, a_proc will be NULL, so we
1677 1693	 * fall back to curproc in that case.
1678 1694 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1679 1695 struct segvn_crargs crargs;
1680 1696
1681 1697 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1682 1698 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1683 1699 (size_t)raddr;
1684 1700
1685 1701 /*
1686 1702 * check for wrap around
1687 1703 */
1688 1704 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1689 1705 AS_LOCK_EXIT(as);
1690 1706 return (ENOMEM);
1691 1707 }
1692 1708
1693 1709 as->a_updatedir = 1; /* inform /proc */
1694 1710 gethrestime(&as->a_updatetime);
1695 1711
1696 1712 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1697 1713 AS_LOCK_EXIT(as);
1698 1714
1699 1715 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1700 1716 RCA_UNSAFE_ALL);
1701 1717
1702 1718 return (ENOMEM);
1703 1719 }
1704 1720
1705 1721 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1706 1722 crargs = *(struct segvn_crargs *)argsp;
1707 1723 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1708 1724 if (error != 0) {
1709 1725 AS_LOCK_EXIT(as);
1710 1726 if (unmap) {
1711 1727 (void) as_unmap(as, addr, size);
1712 1728 }
1713 1729 return (error);
1714 1730 }
1715 1731 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1716 1732 crargs = *(struct segvn_crargs *)argsp;
1717 1733 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1718 1734 if (error != 0) {
1719 1735 AS_LOCK_EXIT(as);
1720 1736 if (unmap) {
1721 1737 (void) as_unmap(as, addr, size);
1722 1738 }
1723 1739 return (error);
1724 1740 }
1725 1741 } else {
1726 1742 seg = seg_alloc(as, addr, size);
1727 1743 if (seg == NULL) {
1728 1744 AS_LOCK_EXIT(as);
1729 1745 return (ENOMEM);
1730 1746 }
1731 1747
1732 1748 error = (*crfp)(seg, argsp);
1733 1749 if (error != 0) {
1734 1750 seg_free(seg);
1735 1751 AS_LOCK_EXIT(as);
1736 1752 return (error);
1737 1753 }
1738 1754 /*
1739 1755 * Add size now so as_unmap will work if as_ctl fails.
1740 1756 */
1741 1757 as->a_size += rsize;
1742 1758 as->a_resvsize += rsize;
1743 1759 }
1744 1760
1745 1761 as_setwatch(as);
1746 1762
1747 1763 /*
1748 1764 * If the address space is locked,
1749 1765 * establish memory locks for the new segment.
1750 1766 */
1751 1767 mutex_enter(&as->a_contents);
1752 1768 if (AS_ISPGLCK(as)) {
1753 1769 mutex_exit(&as->a_contents);
1754 1770 AS_LOCK_EXIT(as);
1755 1771 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1756 1772 if (error != 0)
1757 1773 (void) as_unmap(as, addr, size);
1758 1774 } else {
1759 1775 mutex_exit(&as->a_contents);
1760 1776 AS_LOCK_EXIT(as);
1761 1777 }
1762 1778 return (error);
1763 1779 }
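/*
 * Illustrative sketch only, not part of this change: a minimal caller of
 * as_map().  The segment-create routine my_segcreate() and its argument
 * block my_crargs are hypothetical stand-ins.  The point is the locking
 * contract shown above: as_map() enters the AS writer lock itself and
 * as_map_locked() drops it on every return path, so the caller only has
 * to look at the errno.
 */
#if 0	/* example only, not compiled */
static int
example_as_map(struct as *as, caddr_t addr, size_t len)
{
	int err;

	err = as_map(as, addr, len, my_segcreate, &my_crargs);
	if (err != 0) {
		/* e.g. ENOMEM: no room in the hole, or RLIMIT_VMEM hit */
		return (err);
	}
	return (0);
}
#endif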
1764 1780
1765 1781
1766 1782 /*
1767 1783 * Delete all segments in the address space marked with S_PURGE.
1768 1784 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1769 1785 * These segments are deleted as a first step before calls to as_gap(), so
1770 1786 * that they don't affect mmap() or shmat().
1771 1787 */
1772 1788 void
1773 1789 as_purge(struct as *as)
1774 1790 {
1775 1791 struct seg *seg;
1776 1792 struct seg *next_seg;
1777 1793
1778 1794 /*
1779 1795	 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1780 1796	 * there is no need to grab the a_contents mutex for this check.
1781 1797 */
1782 1798 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1783 1799 return;
1784 1800
1785 1801 AS_LOCK_ENTER(as, RW_WRITER);
1786 1802 next_seg = NULL;
1787 1803 seg = AS_SEGFIRST(as);
1788 1804 while (seg != NULL) {
1789 1805 next_seg = AS_SEGNEXT(as, seg);
1790 1806 if (seg->s_flags & S_PURGE)
1791 1807 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1792 1808 seg = next_seg;
1793 1809 }
1794 1810 AS_LOCK_EXIT(as);
1795 1811
1796 1812 mutex_enter(&as->a_contents);
1797 1813 as->a_flags &= ~AS_NEEDSPURGE;
1798 1814 mutex_exit(&as->a_contents);
1799 1815 }
1800 1816
1801 1817 /*
1802 1818 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1803 1819 * range of addresses at least "minlen" long, where the base of the range is
1804 1820 * at "off" phase from an "align" boundary and there is space for a
1805 1821	 * "redzone"-sized redzone on either side of the range.  Thus,
1806 1822 * if align was 4M and off was 16k, the user wants a hole which will start
1807 1823 * 16k into a 4M page.
1808 1824 *
1809 1825 * If flags specifies AH_HI, the hole will have the highest possible address
1810 1826 * in the range. We use the as->a_lastgap field to figure out where to
1811 1827 * start looking for a gap.
1812 1828 *
1813 1829 * Otherwise, the gap will have the lowest possible address.
1814 1830 *
1815 1831 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1816 1832 *
1817 1833 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1818 1834 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1819 1835 *
1820 1836 * NOTE: This routine is not correct when base+len overflows caddr_t.
1821 1837 */
1822 1838 int
1823 1839 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1824 1840 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1825 1841 {
1826 1842 caddr_t lobound = *basep;
1827 1843 caddr_t hibound = lobound + *lenp;
1828 1844 struct seg *lseg, *hseg;
1829 1845 caddr_t lo, hi;
1830 1846 int forward;
1831 1847 caddr_t save_base;
1832 1848 size_t save_len;
1833 1849 size_t save_minlen;
1834 1850 size_t save_redzone;
1835 1851 int fast_path = 1;
1836 1852
1837 1853 save_base = *basep;
1838 1854 save_len = *lenp;
1839 1855 save_minlen = minlen;
1840 1856 save_redzone = redzone;
1841 1857
1842 1858 /*
1843 1859 * For the first pass/fast_path, just add align and redzone into
1844 1860 * minlen since if we get an allocation, we can guarantee that it
1845 1861 * will fit the alignment and redzone requested.
1846 1862 * This increases the chance that hibound will be adjusted to
1847 1863 * a_lastgap->s_base which will likely allow us to find an
1848 1864 * acceptable hole in the address space quicker.
1849 1865 * If we can't find a hole with this fast_path, then we look for
1850 1866 * smaller holes in which the alignment and offset may allow
1851 1867 * the allocation to fit.
1852 1868 */
1853 1869 minlen += align;
1854 1870 minlen += 2 * redzone;
1855 1871 redzone = 0;
1856 1872
1857 1873 AS_LOCK_ENTER(as, RW_READER);
1858 1874 if (AS_SEGFIRST(as) == NULL) {
1859 1875 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1860 1876 align, redzone, off)) {
1861 1877 AS_LOCK_EXIT(as);
1862 1878 return (0);
1863 1879 } else {
1864 1880 AS_LOCK_EXIT(as);
1865 1881 *basep = save_base;
1866 1882 *lenp = save_len;
1867 1883 return (-1);
1868 1884 }
1869 1885 }
1870 1886
1871 1887 retry:
1872 1888 /*
1873 1889 * Set up to iterate over all the inter-segment holes in the given
1874 1890 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1875 1891 * NULL for the highest-addressed hole. If moving backwards, we reset
1876 1892	 * hseg to denote the highest-addressed segment.
1877 1893 */
1878 1894 forward = (flags & AH_DIR) == AH_LO;
1879 1895 if (forward) {
1880 1896 hseg = as_findseg(as, lobound, 1);
1881 1897 lseg = AS_SEGPREV(as, hseg);
1882 1898 } else {
1883 1899
1884 1900 /*
1885 1901 * If allocating at least as much as the last allocation,
1886 1902 * use a_lastgap's base as a better estimate of hibound.
1887 1903 */
1888 1904 if (as->a_lastgap &&
1889 1905 minlen >= as->a_lastgap->s_size &&
1890 1906 hibound >= as->a_lastgap->s_base)
1891 1907 hibound = as->a_lastgap->s_base;
1892 1908
1893 1909 hseg = as_findseg(as, hibound, 1);
1894 1910 if (hseg->s_base + hseg->s_size < hibound) {
1895 1911 lseg = hseg;
1896 1912 hseg = NULL;
1897 1913 } else {
1898 1914 lseg = AS_SEGPREV(as, hseg);
1899 1915 }
1900 1916 }
1901 1917
1902 1918 for (;;) {
1903 1919 /*
1904 1920 * Set lo and hi to the hole's boundaries. (We should really
1905 1921 * use MAXADDR in place of hibound in the expression below,
1906 1922 * but can't express it easily; using hibound in its place is
1907 1923 * harmless.)
1908 1924 */
1909 1925 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1910 1926 hi = (hseg == NULL) ? hibound : hseg->s_base;
1911 1927 /*
1912 1928 * If the iteration has moved past the interval from lobound
1913 1929 * to hibound it's pointless to continue.
1914 1930 */
1915 1931 if ((forward && lo > hibound) || (!forward && hi < lobound))
1916 1932 break;
1917 1933 else if (lo > hibound || hi < lobound)
1918 1934 goto cont;
1919 1935 /*
1920 1936 * Candidate hole lies at least partially within the allowable
1921 1937 * range. Restrict it to fall completely within that range,
1922 1938 * i.e., to [max(lo, lobound), min(hi, hibound)].
1923 1939 */
1924 1940 if (lo < lobound)
1925 1941 lo = lobound;
1926 1942 if (hi > hibound)
1927 1943 hi = hibound;
1928 1944 /*
1929 1945 * Verify that the candidate hole is big enough and meets
1930 1946 * hardware constraints. If the hole is too small, no need
1931 1947 * to do the further checks since they will fail.
1932 1948 */
1933 1949 *basep = lo;
1934 1950 *lenp = hi - lo;
1935 1951 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1936 1952 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1937 1953 ((flags & AH_CONTAIN) == 0 ||
1938 1954 (*basep <= addr && *basep + *lenp > addr))) {
1939 1955 if (!forward)
1940 1956 as->a_lastgap = hseg;
1941 1957 if (hseg != NULL)
1942 1958 as->a_lastgaphl = hseg;
1943 1959 else
1944 1960 as->a_lastgaphl = lseg;
1945 1961 AS_LOCK_EXIT(as);
1946 1962 return (0);
1947 1963 }
1948 1964 cont:
1949 1965 /*
1950 1966 * Move to the next hole.
1951 1967 */
1952 1968 if (forward) {
1953 1969 lseg = hseg;
1954 1970 if (lseg == NULL)
1955 1971 break;
1956 1972 hseg = AS_SEGNEXT(as, hseg);
1957 1973 } else {
1958 1974 hseg = lseg;
1959 1975 if (hseg == NULL)
1960 1976 break;
1961 1977 lseg = AS_SEGPREV(as, lseg);
1962 1978 }
1963 1979 }
1964 1980 if (fast_path && (align != 0 || save_redzone != 0)) {
1965 1981 fast_path = 0;
1966 1982 minlen = save_minlen;
1967 1983 redzone = save_redzone;
1968 1984 goto retry;
1969 1985 }
1970 1986 *basep = save_base;
1971 1987 *lenp = save_len;
1972 1988 AS_LOCK_EXIT(as);
1973 1989 return (-1);
1974 1990 }
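/*
 * Illustrative sketch only, not part of this change: a worked example of the
 * align/off/redzone semantics described in the block comment above, wrapped
 * in a hypothetical helper.  The 4M/16K/one-page values and the lo_addr/
 * hi_addr search bounds are examples, not requirements.
 */
#if 0	/* example only, not compiled */
static int
example_find_aligned_hole(struct as *as, caddr_t lo_addr, caddr_t hi_addr)
{
	caddr_t base = lo_addr;
	size_t len = (size_t)(hi_addr - lo_addr);
	size_t minlen = 8 * PAGESIZE;

	if (as_gap_aligned(as, minlen, &base, &len, AH_LO, NULL,
	    4 * 1024 * 1024, PAGESIZE, 16 * 1024) != 0)
		return (-1);
	/*
	 * [base, base + len) is a hole in which a minlen allocation can be
	 * placed 16K into a 4M boundary with a one-page redzone on each side.
	 */
	return (0);
}
#endif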
1975 1991
1976 1992 /*
1977 1993 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1978 1994 *
1979 1995 * If flags specifies AH_HI, the hole will have the highest possible address
1980 1996 * in the range. We use the as->a_lastgap field to figure out where to
1981 1997 * start looking for a gap.
1982 1998 *
1983 1999 * Otherwise, the gap will have the lowest possible address.
1984 2000 *
1985 2001 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1986 2002 *
1987 2003 * If an adequate hole is found, base and len are set to reflect the part of
1988 2004 * the hole that is within range, and 0 is returned, otherwise,
1989 2005 * -1 is returned.
1990 2006 *
1991 2007 * NOTE: This routine is not correct when base+len overflows caddr_t.
1992 2008 */
1993 2009 int
1994 2010 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1995 2011 caddr_t addr)
1996 2012 {
1997 2013
1998 2014 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1999 2015 }
2000 2016
2001 2017 /*
2002 2018 * Return the next range within [base, base + len) that is backed
2003 2019 * with "real memory". Skip holes and non-seg_vn segments.
2004 2020 * We're lazy and only return one segment at a time.
2005 2021 */
2006 2022 int
2007 2023 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2008 2024 {
2009 2025 extern struct seg_ops segspt_shmops; /* needs a header file */
2010 2026 struct seg *seg;
2011 2027 caddr_t addr, eaddr;
2012 2028 caddr_t segend;
2013 2029
2014 2030 AS_LOCK_ENTER(as, RW_READER);
2015 2031
2016 2032 addr = *basep;
2017 2033 eaddr = addr + *lenp;
2018 2034
2019 2035 seg = as_findseg(as, addr, 0);
2020 2036 if (seg != NULL)
2021 2037 addr = MAX(seg->s_base, addr);
2022 2038
2023 2039 for (;;) {
2024 2040 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2025 2041 AS_LOCK_EXIT(as);
2026 2042 return (EINVAL);
2027 2043 }
2028 2044
2029 2045 if (seg->s_ops == &segvn_ops) {
2030 2046 segend = seg->s_base + seg->s_size;
2031 2047 break;
2032 2048 }
2033 2049
2034 2050 /*
2035 2051 * We do ISM by looking into the private data
2036 2052 * to determine the real size of the segment.
2037 2053 */
2038 2054 if (seg->s_ops == &segspt_shmops) {
2039 2055 segend = seg->s_base + spt_realsize(seg);
2040 2056 if (addr < segend)
2041 2057 break;
2042 2058 }
2043 2059
2044 2060 seg = AS_SEGNEXT(as, seg);
2045 2061
2046 2062 if (seg != NULL)
2047 2063 addr = seg->s_base;
2048 2064 }
2049 2065
2050 2066 *basep = addr;
2051 2067
2052 2068 if (segend > eaddr)
2053 2069 *lenp = eaddr - addr;
2054 2070 else
2055 2071 *lenp = segend - addr;
2056 2072
2057 2073 AS_LOCK_EXIT(as);
2058 2074 return (0);
2059 2075 }
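/*
 * Illustrative sketch only, not part of this change: because as_memory()
 * returns at most one backed range per call, a caller that wants every
 * "real memory" range in a region loops, advancing the search window past
 * each range it gets back.  example_walk_memory() is a hypothetical helper.
 */
#if 0	/* example only, not compiled */
static void
example_walk_memory(struct as *as, caddr_t addr, size_t size)
{
	caddr_t base = addr;
	size_t len = size;
	caddr_t end = addr + size;

	while (base < end && as_memory(as, &base, &len) == 0) {
		/* [base, base + len) is backed by real memory */
		base += len;
		len = end - base;
	}
}
#endif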
2060 2076
2061 2077 /*
2062 2078 * Swap the pages associated with the address space as out to
2063 2079 * secondary storage, returning the number of bytes actually
2064 2080 * swapped.
2065 2081 *
2066 2082 * The value returned is intended to correlate well with the process's
2067 2083 * memory requirements. Its usefulness for this purpose depends on
2068 2084 * how well the segment-level routines do at returning accurate
2069 2085 * information.
2070 2086 */
2071 2087 size_t
2072 2088 as_swapout(struct as *as)
2073 2089 {
2074 2090 struct seg *seg;
2075 2091 size_t swpcnt = 0;
2076 2092
2077 2093 /*
2078 2094 * Kernel-only processes have given up their address
2079 2095 * spaces. Of course, we shouldn't be attempting to
2080 2096 * swap out such processes in the first place...
2081 2097 */
2082 2098 if (as == NULL)
2083 2099 return (0);
2084 2100
2085 2101 AS_LOCK_ENTER(as, RW_READER);
2086 2102
2087 2103 /*
2088 2104 * Free all mapping resources associated with the address
2089 2105 * space. The segment-level swapout routines capitalize
2090 2106 * on this unmapping by scavanging pages that have become
2091 2107	 * on this unmapping by scavenging pages that have become
2092 2108 */
2093 2109 hat_swapout(as->a_hat);
2094 2110
2095 2111 /*
2096 2112 * Call the swapout routines of all segments in the address
2097 2113 * space to do the actual work, accumulating the amount of
2098 2114 * space reclaimed.
2099 2115 */
2100 2116 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2101 2117 struct seg_ops *ov = seg->s_ops;
2102 2118
2103 2119 /*
2104 2120 * We have to check to see if the seg has
2105 2121 * an ops vector because the seg may have
2106 2122 * been in the middle of being set up when
2107 2123 * the process was picked for swapout.
2108 2124 */
2109 2125 if ((ov != NULL) && (ov->swapout != NULL))
2110 2126 swpcnt += SEGOP_SWAPOUT(seg);
2111 2127 }
2112 2128 AS_LOCK_EXIT(as);
2113 2129 return (swpcnt);
2114 2130 }
2115 2131
2116 2132 /*
2117 2133 * Determine whether data from the mappings in interval [addr, addr + size)
2118 2134 * are in the primary memory (core) cache.
2119 2135 */
2120 2136 int
2121 2137 as_incore(struct as *as, caddr_t addr,
2122 2138 size_t size, char *vec, size_t *sizep)
2123 2139 {
2124 2140 struct seg *seg;
2125 2141 size_t ssize;
2126 2142 caddr_t raddr; /* rounded down addr */
2127 2143 size_t rsize; /* rounded up size */
2128 2144 size_t isize; /* iteration size */
2129 2145 int error = 0; /* result, assume success */
2130 2146
2131 2147 *sizep = 0;
2132 2148 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2133 2149 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2134 2150 (size_t)raddr;
2135 2151
2136 2152 if (raddr + rsize < raddr) /* check for wraparound */
2137 2153 return (ENOMEM);
2138 2154
2139 2155 AS_LOCK_ENTER(as, RW_READER);
2140 2156 seg = as_segat(as, raddr);
2141 2157 if (seg == NULL) {
2142 2158 AS_LOCK_EXIT(as);
2143 2159 return (-1);
2144 2160 }
2145 2161
2146 2162 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2147 2163 if (raddr >= seg->s_base + seg->s_size) {
2148 2164 seg = AS_SEGNEXT(as, seg);
2149 2165 if (seg == NULL || raddr != seg->s_base) {
2150 2166 error = -1;
2151 2167 break;
2152 2168 }
2153 2169 }
2154 2170 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2155 2171 ssize = seg->s_base + seg->s_size - raddr;
2156 2172 else
2157 2173 ssize = rsize;
2158 2174 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2159 2175 if (isize != ssize) {
2160 2176 error = -1;
2161 2177 break;
2162 2178 }
2163 2179 vec += btopr(ssize);
2164 2180 }
2165 2181 AS_LOCK_EXIT(as);
2166 2182 return (error);
2167 2183 }
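/*
 * Illustrative sketch only, not part of this change: the "vec" argument is
 * consumed one byte per page (note the vec += btopr(ssize) above), so a
 * caller sizes it from the page-rounded length of the region.  The helper
 * below is hypothetical.
 */
#if 0	/* example only, not compiled */
static int
example_incore(struct as *as, caddr_t addr, size_t size)
{
	size_t pages = btopr(size);
	char *vec = kmem_zalloc(pages, KM_SLEEP);
	size_t incore_bytes;
	int err;

	err = as_incore(as, addr, size, vec, &incore_bytes);
	/* on success, vec[i] describes page i of the (page-rounded) range */
	kmem_free(vec, pages);
	return (err);
}
#endif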
2168 2184
2169 2185 static void
2170 2186 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2171 2187 ulong_t *bitmap, size_t position, size_t npages)
2172 2188 {
2173 2189 caddr_t range_start;
2174 2190 size_t pos1 = position;
2175 2191 size_t pos2;
2176 2192 size_t size;
2177 2193 size_t end_pos = npages + position;
2178 2194
2179 2195 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2180 2196 size = ptob((pos2 - pos1));
2181 2197 range_start = (caddr_t)((uintptr_t)addr +
2182 2198 ptob(pos1 - position));
2183 2199
2184 2200 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2185 2201 (ulong_t *)NULL, (size_t)NULL);
2186 2202 pos1 = pos2;
2187 2203 }
2188 2204 }
2189 2205
2190 2206 static void
2191 2207 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2192 2208 caddr_t raddr, size_t rsize)
2193 2209 {
2194 2210 struct seg *seg = as_segat(as, raddr);
2195 2211 size_t ssize;
2196 2212
2197 2213 while (rsize != 0) {
2198 2214 if (raddr >= seg->s_base + seg->s_size)
2199 2215 seg = AS_SEGNEXT(as, seg);
2200 2216
2201 2217 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2202 2218 ssize = seg->s_base + seg->s_size - raddr;
2203 2219 else
2204 2220 ssize = rsize;
2205 2221
2206 2222 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2207 2223
2208 2224 rsize -= ssize;
2209 2225 raddr += ssize;
2210 2226 }
2211 2227 }
2212 2228
2213 2229 /*
2214 2230 * Cache control operations over the interval [addr, addr + size) in
2215 2231 * address space "as".
2216 2232 */
2217 2233 /*ARGSUSED*/
2218 2234 int
2219 2235 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2220 2236 uintptr_t arg, ulong_t *lock_map, size_t pos)
2221 2237 {
2222 2238 struct seg *seg; /* working segment */
2223 2239 caddr_t raddr; /* rounded down addr */
2224 2240 caddr_t initraddr; /* saved initial rounded down addr */
2225 2241 size_t rsize; /* rounded up size */
2226 2242 size_t initrsize; /* saved initial rounded up size */
2227 2243 size_t ssize; /* size of seg */
2228 2244 int error = 0; /* result */
2229 2245 size_t mlock_size; /* size of bitmap */
2230 2246 ulong_t *mlock_map; /* pointer to bitmap used */
2231 2247 /* to represent the locked */
2232 2248 /* pages. */
2233 2249 retry:
2234 2250 if (error == IE_RETRY)
2235 2251 AS_LOCK_ENTER(as, RW_WRITER);
2236 2252 else
2237 2253 AS_LOCK_ENTER(as, RW_READER);
2238 2254
2239 2255 /*
2240 2256 * If these are address space lock/unlock operations, loop over
2241 2257 * all segments in the address space, as appropriate.
2242 2258 */
2243 2259 if (func == MC_LOCKAS) {
2244 2260 size_t npages, idx;
2245 2261 size_t rlen = 0; /* rounded as length */
2246 2262
2247 2263 idx = pos;
2248 2264
2249 2265 if (arg & MCL_FUTURE) {
2250 2266 mutex_enter(&as->a_contents);
2251 2267 AS_SETPGLCK(as);
2252 2268 mutex_exit(&as->a_contents);
2253 2269 }
2254 2270 if ((arg & MCL_CURRENT) == 0) {
2255 2271 AS_LOCK_EXIT(as);
2256 2272 return (0);
2257 2273 }
2258 2274
2259 2275 seg = AS_SEGFIRST(as);
2260 2276 if (seg == NULL) {
2261 2277 AS_LOCK_EXIT(as);
2262 2278 return (0);
2263 2279 }
2264 2280
2265 2281 do {
2266 2282 raddr = (caddr_t)((uintptr_t)seg->s_base &
2267 2283 (uintptr_t)PAGEMASK);
2268 2284 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2269 2285 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2270 2286 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2271 2287
2272 2288 mlock_size = BT_BITOUL(btopr(rlen));
2273 2289 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2274 2290 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2275 2291 AS_LOCK_EXIT(as);
2276 2292 return (EAGAIN);
2277 2293 }
2278 2294
2279 2295 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2280 2296 error = SEGOP_LOCKOP(seg, seg->s_base,
2281 2297 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2282 2298 if (error != 0)
2283 2299 break;
2284 2300 pos += seg_pages(seg);
2285 2301 }
2286 2302
2287 2303 if (error) {
2288 2304 for (seg = AS_SEGFIRST(as); seg != NULL;
2289 2305 seg = AS_SEGNEXT(as, seg)) {
2290 2306
2291 2307 raddr = (caddr_t)((uintptr_t)seg->s_base &
2292 2308 (uintptr_t)PAGEMASK);
2293 2309 npages = seg_pages(seg);
2294 2310 as_segunlock(seg, raddr, attr, mlock_map,
2295 2311 idx, npages);
2296 2312 idx += npages;
2297 2313 }
2298 2314 }
2299 2315
2300 2316 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2301 2317 AS_LOCK_EXIT(as);
2302 2318 goto lockerr;
2303 2319 } else if (func == MC_UNLOCKAS) {
2304 2320 mutex_enter(&as->a_contents);
2305 2321 AS_CLRPGLCK(as);
2306 2322 mutex_exit(&as->a_contents);
2307 2323
2308 2324 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2309 2325 error = SEGOP_LOCKOP(seg, seg->s_base,
2310 2326 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2311 2327 if (error != 0)
2312 2328 break;
2313 2329 }
2314 2330
2315 2331 AS_LOCK_EXIT(as);
2316 2332 goto lockerr;
2317 2333 }
2318 2334
2319 2335 /*
2320 2336 * Normalize addresses and sizes.
2321 2337 */
2322 2338 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2323 2339 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2324 2340 (size_t)raddr;
2325 2341
2326 2342 if (raddr + rsize < raddr) { /* check for wraparound */
2327 2343 AS_LOCK_EXIT(as);
2328 2344 return (ENOMEM);
2329 2345 }
2330 2346
2331 2347 /*
2332 2348 * Get initial segment.
2333 2349 */
2334 2350 if ((seg = as_segat(as, raddr)) == NULL) {
2335 2351 AS_LOCK_EXIT(as);
2336 2352 return (ENOMEM);
2337 2353 }
2338 2354
2339 2355 if (func == MC_LOCK) {
2340 2356 mlock_size = BT_BITOUL(btopr(rsize));
2341 2357 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2342 2358 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2343 2359 AS_LOCK_EXIT(as);
2344 2360 return (EAGAIN);
2345 2361 }
2346 2362 }
2347 2363
2348 2364 /*
2349 2365 * Loop over all segments. If a hole in the address range is
2350 2366 * discovered, then fail. For each segment, perform the appropriate
2351 2367 * control operation.
2352 2368 */
2353 2369 while (rsize != 0) {
2354 2370
2355 2371 /*
2356 2372 * Make sure there's no hole, calculate the portion
2357 2373 * of the next segment to be operated over.
2358 2374 */
2359 2375 if (raddr >= seg->s_base + seg->s_size) {
2360 2376 seg = AS_SEGNEXT(as, seg);
2361 2377 if (seg == NULL || raddr != seg->s_base) {
2362 2378 if (func == MC_LOCK) {
2363 2379 as_unlockerr(as, attr, mlock_map,
2364 2380 initraddr, initrsize - rsize);
2365 2381 kmem_free(mlock_map,
2366 2382 mlock_size * sizeof (ulong_t));
2367 2383 }
2368 2384 AS_LOCK_EXIT(as);
2369 2385 return (ENOMEM);
2370 2386 }
2371 2387 }
2372 2388 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2373 2389 ssize = seg->s_base + seg->s_size - raddr;
2374 2390 else
2375 2391 ssize = rsize;
2376 2392
2377 2393 /*
2378 2394 * Dispatch on specific function.
2379 2395 */
2380 2396 switch (func) {
2381 2397
2382 2398 /*
2383 2399 * Synchronize cached data from mappings with backing
2384 2400 * objects.
2385 2401 */
2386 2402 case MC_SYNC:
2387 2403 if (error = SEGOP_SYNC(seg, raddr, ssize,
2388 2404 attr, (uint_t)arg)) {
2389 2405 AS_LOCK_EXIT(as);
2390 2406 return (error);
2391 2407 }
2392 2408 break;
2393 2409
2394 2410 /*
2395 2411 * Lock pages in memory.
2396 2412 */
2397 2413 case MC_LOCK:
2398 2414 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2399 2415 attr, func, mlock_map, pos)) {
2400 2416 as_unlockerr(as, attr, mlock_map, initraddr,
2401 2417 initrsize - rsize + ssize);
2402 2418 kmem_free(mlock_map, mlock_size *
2403 2419 sizeof (ulong_t));
2404 2420 AS_LOCK_EXIT(as);
2405 2421 goto lockerr;
2406 2422 }
2407 2423 break;
2408 2424
2409 2425 /*
2410 2426 * Unlock mapped pages.
2411 2427 */
2412 2428 case MC_UNLOCK:
2413 2429 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2414 2430 (ulong_t *)NULL, (size_t)NULL);
2415 2431 break;
2416 2432
2417 2433 /*
2418 2434 * Store VM advise for mapped pages in segment layer.
2419 2435 */
2420 2436 case MC_ADVISE:
2421 2437 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2422 2438
2423 2439 /*
2424 2440 * Check for regular errors and special retry error
2425 2441 */
2426 2442 if (error) {
2427 2443 if (error == IE_RETRY) {
2428 2444 /*
2429 2445 * Need to acquire writers lock, so
2430 2446 * have to drop readers lock and start
2431 2447 * all over again
2432 2448 */
2433 2449 AS_LOCK_EXIT(as);
2434 2450 goto retry;
2435 2451 } else if (error == IE_REATTACH) {
2436 2452 /*
2437 2453 * Find segment for current address
2438 2454 * because current segment just got
2439 2455 * split or concatenated
2440 2456 */
2441 2457 seg = as_segat(as, raddr);
2442 2458 if (seg == NULL) {
2443 2459 AS_LOCK_EXIT(as);
2444 2460 return (ENOMEM);
2445 2461 }
2446 2462 } else {
2447 2463 /*
2448 2464 * Regular error
2449 2465 */
2450 2466 AS_LOCK_EXIT(as);
2451 2467 return (error);
2452 2468 }
2453 2469 }
2454 2470 break;
2455 2471
2456 2472 case MC_INHERIT_ZERO:
2457 2473 if (seg->s_ops->inherit == NULL) {
2458 2474 error = ENOTSUP;
2459 2475 } else {
2460 2476 error = SEGOP_INHERIT(seg, raddr, ssize,
2461 2477 SEGP_INH_ZERO);
2462 2478 }
2463 2479 if (error != 0) {
2464 2480 AS_LOCK_EXIT(as);
2465 2481 return (error);
2466 2482 }
2467 2483 break;
2468 2484
2469 2485 /*
2470 2486 * Can't happen.
2471 2487 */
2472 2488 default:
2473 2489 panic("as_ctl: bad operation %d", func);
2474 2490 /*NOTREACHED*/
2475 2491 }
2476 2492
2477 2493 rsize -= ssize;
2478 2494 raddr += ssize;
2479 2495 }
2480 2496
2481 2497 if (func == MC_LOCK)
2482 2498 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2483 2499 AS_LOCK_EXIT(as);
2484 2500 return (0);
2485 2501 lockerr:
2486 2502
2487 2503 /*
2488 2504 * If the lower levels returned EDEADLK for a segment lockop,
2489 2505 * it means that we should retry the operation. Let's wait
2490 2506 * a bit also to let the deadlock causing condition clear.
2491 2507 * This is part of a gross hack to work around a design flaw
2492 2508 * in the ufs/sds logging code and should go away when the
2493 2509 * logging code is re-designed to fix the problem. See bug
2494 2510 * 4125102 for details of the problem.
2495 2511 */
2496 2512 if (error == EDEADLK) {
2497 2513 delay(deadlk_wait);
2498 2514 error = 0;
2499 2515 goto retry;
2500 2516 }
2501 2517 return (error);
2502 2518 }
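/*
 * Illustrative sketch only, not part of this change: mlockall()-style
 * locking of an entire address space goes through as_ctl() with MC_LOCKAS.
 * The addr/size arguments are ignored for that function, the MCL_* flags
 * travel in "arg", and the EDEADLK retry is handled by as_ctl() itself via
 * the lockerr path above.  Passing 0 for "attr" is an assumption of this
 * sketch.
 */
#if 0	/* example only, not compiled */
static int
example_lock_whole_as(struct as *as)
{
	return (as_ctl(as, NULL, 0, MC_LOCKAS, 0,
	    (uintptr_t)(MCL_CURRENT | MCL_FUTURE), NULL, 0));
}
#endif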
2503 2519
2504 2520 int
2505 2521 fc_decode(faultcode_t fault_err)
2506 2522 {
2507 2523 int error = 0;
2508 2524
2509 2525 switch (FC_CODE(fault_err)) {
2510 2526 case FC_OBJERR:
2511 2527 error = FC_ERRNO(fault_err);
2512 2528 break;
2513 2529 case FC_PROT:
2514 2530 error = EACCES;
2515 2531 break;
2516 2532 default:
2517 2533 error = EFAULT;
2518 2534 break;
2519 2535 }
2520 2536 return (error);
2521 2537 }
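/*
 * Illustrative sketch only, not part of this change: fc_decode() is the
 * usual way to turn an as_fault() faultcode_t into an errno, exactly as
 * as_pagelock() and as_pagelock_segs() do in this file.  The helper below
 * is hypothetical.
 */
#if 0	/* example only, not compiled */
static int
example_softlock(struct as *as, caddr_t addr, size_t size, enum seg_rw rw)
{
	faultcode_t fc;

	fc = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	return (fc != 0 ? fc_decode(fc) : 0);
}
#endif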
2522 2538
2523 2539 /*
2524 2540 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2525 2541 * lists from each segment and copy them to one contiguous shadow list (plist)
2526 2542 * as expected by the caller. Save pointers to per segment shadow lists at
2527 2543 * the tail of plist so that they can be used during as_pageunlock().
2528 2544 */
2529 2545 static int
2530 2546 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2531 2547 caddr_t addr, size_t size, enum seg_rw rw)
2532 2548 {
2533 2549 caddr_t sv_addr = addr;
2534 2550 size_t sv_size = size;
2535 2551 struct seg *sv_seg = seg;
2536 2552 ulong_t segcnt = 1;
2537 2553 ulong_t cnt;
2538 2554 size_t ssize;
2539 2555 pgcnt_t npages = btop(size);
2540 2556 page_t **plist;
2541 2557 page_t **pl;
2542 2558 int error;
2543 2559 caddr_t eaddr;
2544 2560 faultcode_t fault_err = 0;
2545 2561 pgcnt_t pl_off;
2546 2562 extern struct seg_ops segspt_shmops;
2547 2563
2548 2564 ASSERT(AS_LOCK_HELD(as));
2549 2565 ASSERT(seg != NULL);
2550 2566 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2551 2567 ASSERT(addr + size > seg->s_base + seg->s_size);
2552 2568 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2553 2569 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2554 2570
2555 2571 /*
2556 2572 * Count the number of segments covered by the range we are about to
2557 2573	 * lock.  The segment count is used to size the shadow list we return
2558 2574	 * to the caller.
2559 2575 */
2560 2576 for (; size != 0; size -= ssize, addr += ssize) {
2561 2577 if (addr >= seg->s_base + seg->s_size) {
2562 2578
2563 2579 seg = AS_SEGNEXT(as, seg);
2564 2580 if (seg == NULL || addr != seg->s_base) {
2565 2581 AS_LOCK_EXIT(as);
2566 2582 return (EFAULT);
2567 2583 }
2568 2584 /*
2569 2585 * Do a quick check if subsequent segments
2570 2586 * will most likely support pagelock.
2571 2587 */
2572 2588 if (seg->s_ops == &segvn_ops) {
2573 2589 vnode_t *vp;
2574 2590
2575 2591 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2576 2592 vp != NULL) {
2577 2593 AS_LOCK_EXIT(as);
2578 2594 goto slow;
2579 2595 }
2580 2596 } else if (seg->s_ops != &segspt_shmops) {
2581 2597 AS_LOCK_EXIT(as);
2582 2598 goto slow;
2583 2599 }
2584 2600 segcnt++;
2585 2601 }
2586 2602 if (addr + size > seg->s_base + seg->s_size) {
2587 2603 ssize = seg->s_base + seg->s_size - addr;
2588 2604 } else {
2589 2605 ssize = size;
2590 2606 }
2591 2607 }
2592 2608 ASSERT(segcnt > 1);
2593 2609
2594 2610 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2595 2611
2596 2612 addr = sv_addr;
2597 2613 size = sv_size;
2598 2614 seg = sv_seg;
2599 2615
2600 2616 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2601 2617 if (addr >= seg->s_base + seg->s_size) {
2602 2618 seg = AS_SEGNEXT(as, seg);
2603 2619 ASSERT(seg != NULL && addr == seg->s_base);
2604 2620 cnt++;
2605 2621 ASSERT(cnt < segcnt);
2606 2622 }
2607 2623 if (addr + size > seg->s_base + seg->s_size) {
2608 2624 ssize = seg->s_base + seg->s_size - addr;
2609 2625 } else {
2610 2626 ssize = size;
2611 2627 }
2612 2628 pl = &plist[npages + cnt];
2613 2629 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2614 2630 L_PAGELOCK, rw);
2615 2631 if (error) {
2616 2632 break;
2617 2633 }
2618 2634 ASSERT(plist[npages + cnt] != NULL);
2619 2635 ASSERT(pl_off + btop(ssize) <= npages);
2620 2636 bcopy(plist[npages + cnt], &plist[pl_off],
2621 2637 btop(ssize) * sizeof (page_t *));
2622 2638 pl_off += btop(ssize);
2623 2639 }
2624 2640
2625 2641 if (size == 0) {
2626 2642 AS_LOCK_EXIT(as);
2627 2643 ASSERT(cnt == segcnt - 1);
2628 2644 *ppp = plist;
2629 2645 return (0);
2630 2646 }
2631 2647
2632 2648 /*
2633 2649	 * One of the pagelock calls failed; the error type is in the error
2634 2650	 * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK
2635 2651	 * if the error type is either EFAULT or ENOTSUP.  Otherwise just
2636 2652	 * return the error to the caller.
2637 2653 */
2638 2654
2639 2655 eaddr = addr;
2640 2656 seg = sv_seg;
2641 2657
2642 2658 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2643 2659 if (addr >= seg->s_base + seg->s_size) {
2644 2660 seg = AS_SEGNEXT(as, seg);
2645 2661 ASSERT(seg != NULL && addr == seg->s_base);
2646 2662 cnt++;
2647 2663 ASSERT(cnt < segcnt);
2648 2664 }
2649 2665 if (eaddr > seg->s_base + seg->s_size) {
2650 2666 ssize = seg->s_base + seg->s_size - addr;
2651 2667 } else {
2652 2668 ssize = eaddr - addr;
2653 2669 }
2654 2670 pl = &plist[npages + cnt];
2655 2671 ASSERT(*pl != NULL);
2656 2672 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2657 2673 L_PAGEUNLOCK, rw);
2658 2674 }
2659 2675
2660 2676 AS_LOCK_EXIT(as);
2661 2677
2662 2678 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2663 2679
2664 2680 if (error != ENOTSUP && error != EFAULT) {
2665 2681 return (error);
2666 2682 }
2667 2683
2668 2684 slow:
2669 2685 /*
2670 2686	 * If we are here because pagelock failed due to the need to cow-fault
2671 2687	 * in the pages we want to lock, F_SOFTLOCK will do that job, and in the
2672 2688	 * next as_pagelock() call for this address range pagelock will
2673 2689	 * hopefully succeed.
2674 2690 */
2675 2691 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2676 2692 if (fault_err != 0) {
2677 2693 return (fc_decode(fault_err));
2678 2694 }
2679 2695 *ppp = NULL;
2680 2696
2681 2697 return (0);
2682 2698 }
2683 2699
2684 2700 /*
2685 2701 * lock pages in a given address space. Return shadow list. If
2686 2702 * the list is NULL, the MMU mapping is also locked.
2687 2703 */
2688 2704 int
2689 2705 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2690 2706 size_t size, enum seg_rw rw)
2691 2707 {
2692 2708 size_t rsize;
2693 2709 caddr_t raddr;
2694 2710 faultcode_t fault_err;
2695 2711 struct seg *seg;
2696 2712 int err;
2697 2713
2698 2714 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2699 2715 "as_pagelock_start: addr %p size %ld", addr, size);
2700 2716
2701 2717 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2702 2718 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2703 2719 (size_t)raddr;
2704 2720
2705 2721 /*
2706 2722	 * If the request crosses more than one segment,
2707 2723	 * let as_pagelock_segs() handle it.
2708 2724 */
2709 2725 AS_LOCK_ENTER(as, RW_READER);
2710 2726
2711 2727 seg = as_segat(as, raddr);
2712 2728 if (seg == NULL) {
2713 2729 AS_LOCK_EXIT(as);
2714 2730 return (EFAULT);
2715 2731 }
2716 2732 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2717 2733 if (raddr + rsize > seg->s_base + seg->s_size) {
2718 2734 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2719 2735 }
2720 2736 if (raddr + rsize <= raddr) {
2721 2737 AS_LOCK_EXIT(as);
2722 2738 return (EFAULT);
2723 2739 }
2724 2740
2725 2741 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2726 2742 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2727 2743
2728 2744 /*
2729 2745 * try to lock pages and pass back shadow list
2730 2746 */
2731 2747 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2732 2748
2733 2749 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2734 2750
2735 2751 AS_LOCK_EXIT(as);
2736 2752
2737 2753 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2738 2754 return (err);
2739 2755 }
2740 2756
2741 2757 /*
2742 2758 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2743 2759	 * to no pagelock support for this segment or because the pages need to
2744 2760	 * be cow-faulted in.  If a fault is needed, F_SOFTLOCK will do the job
2745 2761	 * for this as_pagelock() call, and in the next as_pagelock() call for
2746 2762	 * the same address range the pagelock call will hopefully succeed.
2747 2763 */
2748 2764 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2749 2765 if (fault_err != 0) {
2750 2766 return (fc_decode(fault_err));
2751 2767 }
2752 2768 *ppp = NULL;
2753 2769
2754 2770 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2755 2771 return (0);
2756 2772 }
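/*
 * Illustrative sketch only, not part of this change: physio-style callers
 * pair as_pagelock() with as_pageunlock() over the same range and seg_rw.
 * When the returned shadow list is NULL the pages were locked via the
 * F_SOFTLOCK fallback; as_pageunlock() detects that case itself, so the
 * caller does not need to distinguish it.  The helper below is hypothetical.
 */
#if 0	/* example only, not compiled */
static int
example_locked_io(struct as *as, caddr_t addr, size_t size)
{
	struct page **pplist;
	int err;

	err = as_pagelock(as, &pplist, addr, size, S_WRITE);
	if (err != 0)
		return (err);
	/* ... perform the transfer against the locked pages ... */
	as_pageunlock(as, pplist, addr, size, S_WRITE);
	return (0);
}
#endif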
2757 2773
2758 2774 /*
2759 2775 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2760 2776 * lists from the end of plist and call pageunlock interface for each segment.
2761 2777 * Drop as lock and free plist.
2762 2778 */
2763 2779 static void
2764 2780 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2765 2781 struct page **plist, enum seg_rw rw)
2766 2782 {
2767 2783 ulong_t cnt;
2768 2784 caddr_t eaddr = addr + size;
2769 2785 pgcnt_t npages = btop(size);
2770 2786 size_t ssize;
2771 2787 page_t **pl;
2772 2788
2773 2789 ASSERT(AS_LOCK_HELD(as));
2774 2790 ASSERT(seg != NULL);
2775 2791 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2776 2792 ASSERT(addr + size > seg->s_base + seg->s_size);
2777 2793 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2778 2794 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2779 2795 ASSERT(plist != NULL);
2780 2796
2781 2797 for (cnt = 0; addr < eaddr; addr += ssize) {
2782 2798 if (addr >= seg->s_base + seg->s_size) {
2783 2799 seg = AS_SEGNEXT(as, seg);
2784 2800 ASSERT(seg != NULL && addr == seg->s_base);
2785 2801 cnt++;
2786 2802 }
2787 2803 if (eaddr > seg->s_base + seg->s_size) {
2788 2804 ssize = seg->s_base + seg->s_size - addr;
2789 2805 } else {
2790 2806 ssize = eaddr - addr;
2791 2807 }
2792 2808 pl = &plist[npages + cnt];
2793 2809 ASSERT(*pl != NULL);
2794 2810 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2795 2811 L_PAGEUNLOCK, rw);
2796 2812 }
2797 2813 ASSERT(cnt > 0);
2798 2814 AS_LOCK_EXIT(as);
2799 2815
2800 2816 cnt++;
2801 2817 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2802 2818 }
2803 2819
2804 2820 /*
2805 2821 * unlock pages in a given address range
2806 2822 */
2807 2823 void
2808 2824 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2809 2825 enum seg_rw rw)
2810 2826 {
2811 2827 struct seg *seg;
2812 2828 size_t rsize;
2813 2829 caddr_t raddr;
2814 2830
2815 2831 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2816 2832 "as_pageunlock_start: addr %p size %ld", addr, size);
2817 2833
2818 2834 /*
2819 2835	 * If the shadow list is NULL, as_pagelock() fell back to
2820 2836	 * as_fault(); undo that here with F_SOFTUNLOCK.
2821 2837 */
2822 2838 if (pp == NULL) {
2823 2839 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2824 2840 return;
2825 2841 }
2826 2842
2827 2843 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2828 2844 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2829 2845 (size_t)raddr;
2830 2846
2831 2847 AS_LOCK_ENTER(as, RW_READER);
2832 2848 seg = as_segat(as, raddr);
2833 2849 ASSERT(seg != NULL);
2834 2850
2835 2851 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2836 2852 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2837 2853
2838 2854 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2839 2855 if (raddr + rsize <= seg->s_base + seg->s_size) {
2840 2856 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2841 2857 } else {
2842 2858 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2843 2859 return;
2844 2860 }
2845 2861 AS_LOCK_EXIT(as);
2846 2862 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2847 2863 }
2848 2864
2849 2865 int
2850 2866 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2851 2867 boolean_t wait)
2852 2868 {
2853 2869 struct seg *seg;
2854 2870 size_t ssize;
2855 2871 caddr_t raddr; /* rounded down addr */
2856 2872 size_t rsize; /* rounded up size */
2857 2873 int error = 0;
2858 2874 size_t pgsz = page_get_pagesize(szc);
2859 2875
2860 2876 setpgsz_top:
2861 2877 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2862 2878 return (EINVAL);
2863 2879 }
2864 2880
2865 2881 raddr = addr;
2866 2882 rsize = size;
2867 2883
2868 2884 if (raddr + rsize < raddr) /* check for wraparound */
2869 2885 return (ENOMEM);
2870 2886
2871 2887 AS_LOCK_ENTER(as, RW_WRITER);
2872 2888 as_clearwatchprot(as, raddr, rsize);
2873 2889 seg = as_segat(as, raddr);
2874 2890 if (seg == NULL) {
2875 2891 as_setwatch(as);
2876 2892 AS_LOCK_EXIT(as);
2877 2893 return (ENOMEM);
2878 2894 }
2879 2895
2880 2896 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2881 2897 if (raddr >= seg->s_base + seg->s_size) {
2882 2898 seg = AS_SEGNEXT(as, seg);
2883 2899 if (seg == NULL || raddr != seg->s_base) {
2884 2900 error = ENOMEM;
2885 2901 break;
2886 2902 }
2887 2903 }
2888 2904 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2889 2905 ssize = seg->s_base + seg->s_size - raddr;
2890 2906 } else {
2891 2907 ssize = rsize;
2892 2908 }
2893 2909
2894 2910 retry:
2895 2911 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2896 2912
2897 2913 if (error == IE_NOMEM) {
2898 2914 error = EAGAIN;
2899 2915 break;
2900 2916 }
2901 2917
2902 2918 if (error == IE_RETRY) {
2903 2919 AS_LOCK_EXIT(as);
2904 2920 goto setpgsz_top;
2905 2921 }
2906 2922
2907 2923 if (error == ENOTSUP) {
2908 2924 error = EINVAL;
2909 2925 break;
2910 2926 }
2911 2927
2912 2928 if (wait && (error == EAGAIN)) {
2913 2929 /*
2914 2930 * Memory is currently locked. It must be unlocked
2915 2931 * before this operation can succeed through a retry.
2916 2932 * The possible reasons for locked memory and
2917 2933 * corresponding strategies for unlocking are:
2918 2934 * (1) Normal I/O
2919 2935 * wait for a signal that the I/O operation
2920 2936 * has completed and the memory is unlocked.
2921 2937 * (2) Asynchronous I/O
2922 2938 * The aio subsystem does not unlock pages when
2923 2939 * the I/O is completed. Those pages are unlocked
2924 2940 * when the application calls aiowait/aioerror.
2925 2941 * So, to prevent blocking forever, cv_broadcast()
2926 2942 * is done to wake up aio_cleanup_thread.
2927 2943 * Subsequently, segvn_reclaim will be called, and
2928 2944 * that will do AS_CLRUNMAPWAIT() and wake us up.
2929 2945 * (3) Long term page locking:
2930 2946 * This is not relevant for as_setpagesize()
2931 2947 * because we cannot change the page size for
2932 2948 * driver memory. The attempt to do so will
2933 2949 * fail with a different error than EAGAIN so
2934 2950 * there's no need to trigger as callbacks like
2935 2951 * as_unmap, as_setprot or as_free would do.
2936 2952 */
2937 2953 mutex_enter(&as->a_contents);
2938 2954 if (!AS_ISNOUNMAPWAIT(as)) {
2939 2955 if (AS_ISUNMAPWAIT(as) == 0) {
2940 2956 cv_broadcast(&as->a_cv);
2941 2957 }
2942 2958 AS_SETUNMAPWAIT(as);
2943 2959 AS_LOCK_EXIT(as);
2944 2960 while (AS_ISUNMAPWAIT(as)) {
2945 2961 cv_wait(&as->a_cv, &as->a_contents);
2946 2962 }
2947 2963 } else {
2948 2964 /*
2949 2965 * We may have raced with
2950 2966 * segvn_reclaim()/segspt_reclaim(). In this
2951 2967 * case clean nounmapwait flag and retry since
2952 2968 * softlockcnt in this segment may be already
2953 2969 * 0. We don't drop as writer lock so our
2954 2970 * number of retries without sleeping should
2955 2971 * be very small. See segvn_reclaim() for
2956 2972 * more comments.
2957 2973 */
2958 2974 AS_CLRNOUNMAPWAIT(as);
2959 2975 mutex_exit(&as->a_contents);
2960 2976 goto retry;
2961 2977 }
2962 2978 mutex_exit(&as->a_contents);
2963 2979 goto setpgsz_top;
2964 2980 } else if (error != 0) {
2965 2981 break;
2966 2982 }
2967 2983 }
2968 2984 as_setwatch(as);
2969 2985 AS_LOCK_EXIT(as);
2970 2986 return (error);
2971 2987 }
2972 2988
2973 2989 /*
2974 2990 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2975 2991 * in its chunk where s_szc is less than the szc we want to set.
2976 2992 */
2977 2993 static int
2978 2994 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2979 2995 int *retry)
2980 2996 {
2981 2997 struct seg *seg;
2982 2998 size_t ssize;
2983 2999 int error;
2984 3000
2985 3001 ASSERT(AS_WRITE_HELD(as));
2986 3002
2987 3003 seg = as_segat(as, raddr);
2988 3004 if (seg == NULL) {
2989 3005 panic("as_iset3_default_lpsize: no seg");
2990 3006 }
2991 3007
2992 3008 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2993 3009 if (raddr >= seg->s_base + seg->s_size) {
2994 3010 seg = AS_SEGNEXT(as, seg);
2995 3011 if (seg == NULL || raddr != seg->s_base) {
2996 3012 panic("as_iset3_default_lpsize: as changed");
2997 3013 }
2998 3014 }
2999 3015 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3000 3016 ssize = seg->s_base + seg->s_size - raddr;
3001 3017 } else {
3002 3018 ssize = rsize;
3003 3019 }
3004 3020
3005 3021 if (szc > seg->s_szc) {
3006 3022 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3007 3023 /* Only retry on EINVAL segments that have no vnode. */
3008 3024 if (error == EINVAL) {
3009 3025 vnode_t *vp = NULL;
3010 3026 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3011 3027 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3012 3028 vp == NULL)) {
3013 3029 *retry = 1;
3014 3030 } else {
3015 3031 *retry = 0;
3016 3032 }
3017 3033 }
3018 3034 if (error) {
3019 3035 return (error);
3020 3036 }
3021 3037 }
3022 3038 }
3023 3039 return (0);
3024 3040 }
3025 3041
3026 3042 /*
3027 3043 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3028 3044 * pagesize on each segment in its range, but if any fails with EINVAL,
3029 3045 * then it reduces the pagesizes to the next size in the bitmap and
3030 3046	 * retries as_iset3_default_lpsize().  The code retries smaller allowed
3031 3047	 * sizes on EINVAL because (a) the anon offset may not match the bigger
3032 3048	 * sizes, and (b) it's hard to get this offset (to begin with) to pass
3033 3049	 * to map_pgszcvec().
3034 3050 */
3035 3051 static int
3036 3052 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3037 3053 uint_t szcvec)
3038 3054 {
3039 3055 int error;
3040 3056 int retry;
3041 3057
3042 3058 ASSERT(AS_WRITE_HELD(as));
3043 3059
3044 3060 for (;;) {
3045 3061 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3046 3062 if (error == EINVAL && retry) {
3047 3063 szcvec &= ~(1 << szc);
3048 3064 if (szcvec <= 1) {
3049 3065 return (EINVAL);
3050 3066 }
3051 3067 szc = highbit(szcvec) - 1;
3052 3068 } else {
3053 3069 return (error);
3054 3070 }
3055 3071 }
3056 3072 }
3057 3073
3058 3074 /*
3059 3075 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3060 3076 * segments have a smaller szc than we want to set. For each such area,
3061 3077	 * it calls as_iset2_default_lpsize().
3062 3078 */
3063 3079 static int
3064 3080 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3065 3081 uint_t szcvec)
3066 3082 {
3067 3083 struct seg *seg;
3068 3084 size_t ssize;
3069 3085 caddr_t setaddr = raddr;
3070 3086 size_t setsize = 0;
3071 3087 int set;
3072 3088 int error;
3073 3089
3074 3090 ASSERT(AS_WRITE_HELD(as));
3075 3091
3076 3092 seg = as_segat(as, raddr);
3077 3093 if (seg == NULL) {
3078 3094 panic("as_iset1_default_lpsize: no seg");
3079 3095 }
3080 3096 if (seg->s_szc < szc) {
3081 3097 set = 1;
3082 3098 } else {
3083 3099 set = 0;
3084 3100 }
3085 3101
3086 3102 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3087 3103 if (raddr >= seg->s_base + seg->s_size) {
3088 3104 seg = AS_SEGNEXT(as, seg);
3089 3105 if (seg == NULL || raddr != seg->s_base) {
3090 3106 panic("as_iset1_default_lpsize: as changed");
3091 3107 }
3092 3108 if (seg->s_szc >= szc && set) {
3093 3109 ASSERT(setsize != 0);
3094 3110 error = as_iset2_default_lpsize(as,
3095 3111 setaddr, setsize, szc, szcvec);
3096 3112 if (error) {
3097 3113 return (error);
3098 3114 }
3099 3115 set = 0;
3100 3116 } else if (seg->s_szc < szc && !set) {
3101 3117 setaddr = raddr;
3102 3118 setsize = 0;
3103 3119 set = 1;
3104 3120 }
3105 3121 }
3106 3122 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3107 3123 ssize = seg->s_base + seg->s_size - raddr;
3108 3124 } else {
3109 3125 ssize = rsize;
3110 3126 }
3111 3127 }
3112 3128 error = 0;
3113 3129 if (set) {
3114 3130 ASSERT(setsize != 0);
3115 3131 error = as_iset2_default_lpsize(as, setaddr, setsize,
3116 3132 szc, szcvec);
3117 3133 }
3118 3134 return (error);
3119 3135 }
3120 3136
3121 3137 /*
3122 3138 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3123 3139 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3124 3140 * chunk to as_iset1_default_lpsize().
3125 3141 */
3126 3142 static int
3127 3143 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3128 3144 int type)
3129 3145 {
3130 3146 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3131 3147 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3132 3148 flags, rtype, 1);
3133 3149 uint_t szc;
3134 3150 uint_t nszc;
3135 3151 int error;
3136 3152 caddr_t a;
3137 3153 caddr_t eaddr;
3138 3154 size_t segsize;
3139 3155 size_t pgsz;
3140 3156 uint_t save_szcvec;
3141 3157
3142 3158 ASSERT(AS_WRITE_HELD(as));
3143 3159 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3144 3160 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3145 3161
3146 3162 szcvec &= ~1;
3147 3163 if (szcvec <= 1) { /* skip if base page size */
3148 3164 return (0);
3149 3165 }
3150 3166
3151 3167 /* Get the pagesize of the first larger page size. */
3152 3168 szc = lowbit(szcvec) - 1;
3153 3169 pgsz = page_get_pagesize(szc);
3154 3170 eaddr = addr + size;
3155 3171 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3156 3172 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3157 3173
3158 3174 save_szcvec = szcvec;
3159 3175 szcvec >>= (szc + 1);
3160 3176 nszc = szc;
3161 3177 while (szcvec) {
3162 3178 if ((szcvec & 0x1) == 0) {
3163 3179 nszc++;
3164 3180 szcvec >>= 1;
3165 3181 continue;
3166 3182 }
3167 3183 nszc++;
3168 3184 pgsz = page_get_pagesize(nszc);
3169 3185 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3170 3186 if (a != addr) {
3171 3187 ASSERT(szc > 0);
3172 3188 ASSERT(a < eaddr);
3173 3189 segsize = a - addr;
3174 3190 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3175 3191 save_szcvec);
3176 3192 if (error) {
3177 3193 return (error);
3178 3194 }
3179 3195 addr = a;
3180 3196 }
3181 3197 szc = nszc;
3182 3198 szcvec >>= 1;
3183 3199 }
3184 3200
3185 3201 ASSERT(addr < eaddr);
3186 3202 szcvec = save_szcvec;
3187 3203 while (szcvec) {
3188 3204 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3189 3205 ASSERT(a >= addr);
3190 3206 if (a != addr) {
3191 3207 ASSERT(szc > 0);
3192 3208 segsize = a - addr;
3193 3209 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3194 3210 save_szcvec);
3195 3211 if (error) {
3196 3212 return (error);
3197 3213 }
3198 3214 addr = a;
3199 3215 }
3200 3216 szcvec &= ~(1 << szc);
3201 3217 if (szcvec) {
3202 3218 szc = highbit(szcvec) - 1;
3203 3219 pgsz = page_get_pagesize(szc);
3204 3220 }
3205 3221 }
3206 3222 ASSERT(addr == eaddr);
3207 3223
3208 3224 return (0);
3209 3225 }
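/*
 * Illustrative note, not part of this change: map_pgszcvec() returns a
 * bitmap in which bit i set means page size code i is usable, and lowbit()/
 * highbit() are 1-based, hence the "- 1" adjustments above.  For example:
 */
#if 0	/* example only, not compiled */
	uint_t szcvec = 0x09;		/* page size codes 0 and 3 usable */
	uint_t szc;

	szcvec &= ~1;			/* discard the base page size bit */
	szc = lowbit(szcvec) - 1;	/* lowbit(0x08) == 4, so szc == 3 */
#endif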
3210 3226
3211 3227 /*
3212 3228 * Set the default large page size for the range. Called via memcntl with
3213 3229 * page size set to 0. as_set_default_lpsize breaks the range down into
3214 3230	 * chunks with the same type/flags, ignores non-segvn segments, and passes
3215 3231 * each chunk to as_iset_default_lpsize().
3216 3232 */
3217 3233 int
3218 3234 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3219 3235 {
3220 3236 struct seg *seg;
3221 3237 caddr_t raddr;
3222 3238 size_t rsize;
3223 3239 size_t ssize;
3224 3240 int rtype, rflags;
3225 3241 int stype, sflags;
3226 3242 int error;
3227 3243 caddr_t setaddr;
3228 3244 size_t setsize;
3229 3245 int segvn;
3230 3246
3231 3247 if (size == 0)
3232 3248 return (0);
3233 3249
3234 3250 AS_LOCK_ENTER(as, RW_WRITER);
3235 3251 again:
3236 3252 error = 0;
3237 3253
3238 3254 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3239 3255 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3240 3256 (size_t)raddr;
3241 3257
3242 3258 if (raddr + rsize < raddr) { /* check for wraparound */
3243 3259 AS_LOCK_EXIT(as);
3244 3260 return (ENOMEM);
3245 3261 }
3246 3262 as_clearwatchprot(as, raddr, rsize);
3247 3263 seg = as_segat(as, raddr);
3248 3264 if (seg == NULL) {
3249 3265 as_setwatch(as);
3250 3266 AS_LOCK_EXIT(as);
3251 3267 return (ENOMEM);
3252 3268 }
3253 3269 if (seg->s_ops == &segvn_ops) {
3254 3270 rtype = SEGOP_GETTYPE(seg, addr);
3255 3271 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3256 3272 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3257 3273 segvn = 1;
3258 3274 } else {
3259 3275 segvn = 0;
3260 3276 }
3261 3277 setaddr = raddr;
3262 3278 setsize = 0;
3263 3279
3264 3280 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3265 3281 if (raddr >= (seg->s_base + seg->s_size)) {
3266 3282 seg = AS_SEGNEXT(as, seg);
3267 3283 if (seg == NULL || raddr != seg->s_base) {
3268 3284 error = ENOMEM;
3269 3285 break;
3270 3286 }
3271 3287 if (seg->s_ops == &segvn_ops) {
3272 3288 stype = SEGOP_GETTYPE(seg, raddr);
3273 3289 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3274 3290 stype &= (MAP_SHARED | MAP_PRIVATE);
3275 3291 if (segvn && (rflags != sflags ||
3276 3292 rtype != stype)) {
3277 3293 /*
3278 3294 * The next segment is also segvn but
3279 3295 * has different flags and/or type.
3280 3296 */
3281 3297 ASSERT(setsize != 0);
3282 3298 error = as_iset_default_lpsize(as,
3283 3299 setaddr, setsize, rflags, rtype);
3284 3300 if (error) {
3285 3301 break;
3286 3302 }
3287 3303 rflags = sflags;
3288 3304 rtype = stype;
3289 3305 setaddr = raddr;
3290 3306 setsize = 0;
3291 3307 } else if (!segvn) {
3292 3308 rflags = sflags;
3293 3309 rtype = stype;
3294 3310 setaddr = raddr;
3295 3311 setsize = 0;
3296 3312 segvn = 1;
3297 3313 }
3298 3314 } else if (segvn) {
3299 3315 /* The next segment is not segvn. */
3300 3316 ASSERT(setsize != 0);
3301 3317 error = as_iset_default_lpsize(as,
3302 3318 setaddr, setsize, rflags, rtype);
3303 3319 if (error) {
3304 3320 break;
3305 3321 }
3306 3322 segvn = 0;
3307 3323 }
3308 3324 }
3309 3325 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3310 3326 ssize = seg->s_base + seg->s_size - raddr;
3311 3327 } else {
3312 3328 ssize = rsize;
3313 3329 }
3314 3330 }
3315 3331 if (error == 0 && segvn) {
3316 3332 /* The last chunk when rsize == 0. */
3317 3333 ASSERT(setsize != 0);
3318 3334 error = as_iset_default_lpsize(as, setaddr, setsize,
3319 3335 rflags, rtype);
3320 3336 }
3321 3337
3322 3338 if (error == IE_RETRY) {
3323 3339 goto again;
3324 3340 } else if (error == IE_NOMEM) {
3325 3341 error = EAGAIN;
3326 3342 } else if (error == ENOTSUP) {
3327 3343 error = EINVAL;
3328 3344 } else if (error == EAGAIN) {
3329 3345 mutex_enter(&as->a_contents);
3330 3346 if (!AS_ISNOUNMAPWAIT(as)) {
3331 3347 if (AS_ISUNMAPWAIT(as) == 0) {
3332 3348 cv_broadcast(&as->a_cv);
3333 3349 }
3334 3350 AS_SETUNMAPWAIT(as);
3335 3351 AS_LOCK_EXIT(as);
3336 3352 while (AS_ISUNMAPWAIT(as)) {
3337 3353 cv_wait(&as->a_cv, &as->a_contents);
3338 3354 }
3339 3355 mutex_exit(&as->a_contents);
3340 3356 AS_LOCK_ENTER(as, RW_WRITER);
3341 3357 } else {
3342 3358 /*
3343 3359 * We may have raced with
3344 3360 * segvn_reclaim()/segspt_reclaim(). In this case
3345 3361 * clean nounmapwait flag and retry since softlockcnt
3346 3362 * in this segment may be already 0. We don't drop as
3347 3363 * writer lock so our number of retries without
3348 3364 * sleeping should be very small. See segvn_reclaim()
3349 3365 * for more comments.
3350 3366 */
3351 3367 AS_CLRNOUNMAPWAIT(as);
3352 3368 mutex_exit(&as->a_contents);
3353 3369 }
3354 3370 goto again;
3355 3371 }
3356 3372
3357 3373 as_setwatch(as);
3358 3374 AS_LOCK_EXIT(as);
3359 3375 return (error);
3360 3376 }
3361 3377
3362 3378 /*
3363 3379 * Setup all of the uninitialized watched pages that we can.
3364 3380 */
3365 3381 void
3366 3382 as_setwatch(struct as *as)
3367 3383 {
3368 3384 struct watched_page *pwp;
3369 3385 struct seg *seg;
3370 3386 caddr_t vaddr;
3371 3387 uint_t prot;
3372 3388 int err, retrycnt;
3373 3389
3374 3390 if (avl_numnodes(&as->a_wpage) == 0)
3375 3391 return;
3376 3392
3377 3393 ASSERT(AS_WRITE_HELD(as));
3378 3394
3379 3395 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3380 3396 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3381 3397 retrycnt = 0;
3382 3398 retry:
3383 3399 vaddr = pwp->wp_vaddr;
3384 3400 if (pwp->wp_oprot != 0 || /* already set up */
3385 3401 (seg = as_segat(as, vaddr)) == NULL ||
3386 3402 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3387 3403 continue;
3388 3404
3389 3405 pwp->wp_oprot = prot;
3390 3406 if (pwp->wp_read)
3391 3407 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3392 3408 if (pwp->wp_write)
3393 3409 prot &= ~PROT_WRITE;
3394 3410 if (pwp->wp_exec)
3395 3411 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3396 3412 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3397 3413 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3398 3414 if (err == IE_RETRY) {
3399 3415 pwp->wp_oprot = 0;
3400 3416 ASSERT(retrycnt == 0);
3401 3417 retrycnt++;
3402 3418 goto retry;
3403 3419 }
3404 3420 }
3405 3421 pwp->wp_prot = prot;
3406 3422 }
3407 3423 }
3408 3424
3409 3425 /*
3410 3426 * Clear all of the watched pages in the address space.
3411 3427 */
3412 3428 void
3413 3429 as_clearwatch(struct as *as)
3414 3430 {
3415 3431 struct watched_page *pwp;
3416 3432 struct seg *seg;
3417 3433 caddr_t vaddr;
3418 3434 uint_t prot;
3419 3435 int err, retrycnt;
3420 3436
3421 3437 if (avl_numnodes(&as->a_wpage) == 0)
3422 3438 return;
3423 3439
3424 3440 ASSERT(AS_WRITE_HELD(as));
3425 3441
3426 3442 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3427 3443 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3428 3444 retrycnt = 0;
3429 3445 retry:
3430 3446 vaddr = pwp->wp_vaddr;
3431 3447 if (pwp->wp_oprot == 0 || /* not set up */
3432 3448 (seg = as_segat(as, vaddr)) == NULL)
3433 3449 continue;
3434 3450
3435 3451 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3436 3452 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3437 3453 if (err == IE_RETRY) {
3438 3454 ASSERT(retrycnt == 0);
3439 3455 retrycnt++;
3440 3456 goto retry;
3441 3457 }
3442 3458 }
3443 3459 pwp->wp_oprot = 0;
3444 3460 pwp->wp_prot = 0;
3445 3461 }
3446 3462 }
3447 3463
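Both as_setwatch() and as_clearwatch() key off wp_oprot: a nonzero value means the original protections have been saved and the watchpoint is installed, while zero means it still needs to be set up. A minimal sketch of that guard, with wpage_is_installed() being a hypothetical helper rather than an existing routine:

	#include <sys/types.h>
	#include <vm/as.h>	/* struct watched_page */

	/*
	 * A watched page counts as installed once its original protections
	 * have been recorded in wp_oprot; as_clearwatch() resets it to 0.
	 */
	static boolean_t
	wpage_is_installed(const struct watched_page *pwp)
	{
		return (pwp->wp_oprot != 0 ? B_TRUE : B_FALSE);
	}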
3448 3464 /*
3449 3465 * Force a new setup for all the watched pages in the range.
3450 3466 */
3451 3467 static void
3452 3468 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3453 3469 {
3454 3470 struct watched_page *pwp;
3455 3471 struct watched_page tpw;
3456 3472 caddr_t eaddr = addr + size;
3457 3473 caddr_t vaddr;
3458 3474 struct seg *seg;
3459 3475 int err, retrycnt;
3460 3476 uint_t wprot;
3461 3477 avl_index_t where;
3462 3478
3463 3479 if (avl_numnodes(&as->a_wpage) == 0)
3464 3480 return;
3465 3481
3466 3482 ASSERT(AS_WRITE_HELD(as));
3467 3483
3468 3484 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3469 3485 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3470 3486 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3471 3487
3472 3488 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3473 3489 retrycnt = 0;
3474 3490 vaddr = pwp->wp_vaddr;
3475 3491
3476 3492 wprot = prot;
3477 3493 if (pwp->wp_read)
3478 3494 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3479 3495 if (pwp->wp_write)
3480 3496 wprot &= ~PROT_WRITE;
3481 3497 if (pwp->wp_exec)
3482 3498 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3483 3499 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3484 3500 retry:
3485 3501 seg = as_segat(as, vaddr);
3486 3502 if (seg == NULL) {
3487 3503 panic("as_setwatchprot: no seg");
3488 3504 /*NOTREACHED*/
3489 3505 }
3490 3506 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3491 3507 if (err == IE_RETRY) {
3492 3508 ASSERT(retrycnt == 0);
3493 3509 retrycnt++;
3494 3510 goto retry;
3495 3511 }
3496 3512 }
3497 3513 pwp->wp_oprot = prot;
3498 3514 pwp->wp_prot = wprot;
3499 3515
3500 3516 pwp = AVL_NEXT(&as->a_wpage, pwp);
3501 3517 }
3502 3518 }
3503 3519
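The walk over the watched-page tree in as_setwatchprot() (and in as_clearwatchprot() below) starts with a standard AVL range-lookup idiom: search for the page-aligned start address and, if there is no exact match, take the nearest node after the insertion point. A minimal sketch of just that lookup, with first_wpage_in_range() being a hypothetical name:

	#include <sys/types.h>
	#include <sys/param.h>	/* PAGEMASK */
	#include <sys/avl.h>
	#include <vm/as.h>	/* struct watched_page */

	/* Find the first watched page at or after addr, or NULL if none. */
	static struct watched_page *
	first_wpage_in_range(avl_tree_t *wpage, caddr_t addr)
	{
		struct watched_page tpw;
		struct watched_page *pwp;
		avl_index_t where;

		/* Key the search on the page-aligned start of the range. */
		tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		if ((pwp = avl_find(wpage, &tpw, &where)) == NULL)
			pwp = avl_nearest(wpage, where, AVL_AFTER);
		return (pwp);
	}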
3504 3520 /*
3505 3521 * Clear all of the watched pages in the range.
3506 3522 */
3507 3523 static void
3508 3524 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3509 3525 {
3510 3526 caddr_t eaddr = addr + size;
3511 3527 struct watched_page *pwp;
3512 3528 struct watched_page tpw;
3513 3529 uint_t prot;
3514 3530 struct seg *seg;
3515 3531 int err, retrycnt;
3516 3532 avl_index_t where;
3517 3533
3518 3534 if (avl_numnodes(&as->a_wpage) == 0)
3519 3535 return;
3520 3536
3521 3537 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3522 3538 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3523 3539 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3524 3540
3525 3541 ASSERT(AS_WRITE_HELD(as));
3526 3542
3527 3543 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3528 3544
3529 3545 if ((prot = pwp->wp_oprot) != 0) {
3530 3546 retrycnt = 0;
3531 3547
3532 3548 if (prot != pwp->wp_prot) {
3533 3549 retry:
3534 3550 seg = as_segat(as, pwp->wp_vaddr);
3535 3551 if (seg == NULL)
3536 3552 continue;
3537 3553 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3538 3554 PAGESIZE, prot);
3539 3555 if (err == IE_RETRY) {
3540 3556 ASSERT(retrycnt == 0);
3541 3557 retrycnt++;
3542 3558 goto retry;
3543 3559
3544 3560 }
3545 3561 }
3546 3562 pwp->wp_oprot = 0;
3547 3563 pwp->wp_prot = 0;
3548 3564 }
3549 3565
3550 3566 pwp = AVL_NEXT(&as->a_wpage, pwp);
3551 3567 }
3552 3568 }
3553 3569
3554 3570 void
3555 3571 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3556 3572 {
3557 3573 struct proc *p;
3558 3574
3559 3575 mutex_enter(&pidlock);
3560 3576 for (p = practive; p; p = p->p_next) {
3561 3577 if (p->p_as == as) {
3562 3578 mutex_enter(&p->p_lock);
3563 3579 if (p->p_as == as)
3564 3580 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3565 3581 mutex_exit(&p->p_lock);
3566 3582 }
3567 3583 }
3568 3584 mutex_exit(&pidlock);
3569 3585 }
3570 3586
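as_signal_proc() walks every active process and queues the signal on each one whose p_as matches, rechecking p_as under p_lock to close the race with a process switching address spaces between the two checks. A minimal usage sketch, where notify_as_owners() is hypothetical and the signal number and code are chosen purely for illustration:

	#include <sys/types.h>
	#include <sys/systm.h>	/* bzero */
	#include <sys/signal.h>
	#include <sys/siginfo.h>
	#include <vm/as.h>

	static void
	notify_as_owners(struct as *as, caddr_t fault_addr)
	{
		k_siginfo_t si;

		bzero(&si, sizeof (si));
		si.si_signo = SIGSEGV;		/* illustrative choice only */
		si.si_code = SEGV_ACCERR;
		si.si_addr = fault_addr;

		/* Deliver to every process sharing this address space. */
		as_signal_proc(as, &si);
	}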
3571 3587 /*
3572 3588 * return memory object ID
3573 3589 */
3574 3590 int
3575 3591 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3576 3592 {
3577 3593 struct seg *seg;
3578 3594 int sts;
3579 3595
3580 3596 AS_LOCK_ENTER(as, RW_READER);
3581 3597 seg = as_segat(as, addr);
3582 3598 if (seg == NULL) {
3583 3599 AS_LOCK_EXIT(as);
3584 3600 return (EFAULT);
3585 3601 }
3586 3602 /*
3587 3603 * catch old drivers which may not support getmemid
3588 3604 */
3589 3605 if (seg->s_ops->getmemid == NULL) {
3590 3606 AS_LOCK_EXIT(as);
3591 3607 return (ENODEV);
3592 3608 }
3593 3609
3594 3610 sts = SEGOP_GETMEMID(seg, addr, memidp);
3595 3611
3596 3612 AS_LOCK_EXIT(as);
3597 3613 return (sts);
3598 3614 }
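A caller of as_getmemid() typically wants to treat ENODEV (a segment driver that predates getmemid support) differently from a genuine failure. A minimal usage sketch under that assumption; lookup_memid() is a hypothetical wrapper, not an existing routine:

	#include <sys/types.h>
	#include <sys/errno.h>
	#include <vm/as.h>

	/*
	 * Fetch the memory object id for addr.  *validp reports whether midp
	 * was filled in; an old driver without getmemid is not an error.
	 */
	static int
	lookup_memid(struct as *as, caddr_t addr, memid_t *midp,
	    boolean_t *validp)
	{
		int err;

		*validp = B_FALSE;
		err = as_getmemid(as, addr, midp);
		if (err == 0)
			*validp = B_TRUE;
		else if (err == ENODEV)
			err = 0;
		return (err);
	}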
| 2699 lines elided |